In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy
import sklearn
import seaborn as sns
import re
from collections import Counter

%matplotlib inline

This notebook practice a simple bag of words model for text matching.

In [2]:
data = pd.read_csv('https://github.com/Thinkful-Ed/data-201-resources/raw/master/hotel-reviews.csv')

In [3]:
data.head()

Unnamed: 0,address,categories,city,country,latitude,longitude,name,postalCode,province,reviews.date,reviews.dateAdded,reviews.doRecommend,reviews.id,reviews.rating,reviews.text,reviews.title,reviews.userCity,reviews.username,reviews.userProvince
0,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-09-22T00:00:00Z,2016-10-24T00:00:25Z,,,4.0,Pleasant 10 min walk along the sea front to th...,Good location away from the crouds,,Russ (kent),
1,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-04-03T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Really lovely hotel. Stayed on the very top fl...,Great hotel with Jacuzzi bath!,,A Traveler,
2,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2014-05-13T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,Ett mycket bra hotell. Det som drog ner betyge...,Lugnt l��ge,,Maud,
3,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2013-10-27T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,Good location on the Lido.,,Julie,
4,Riviera San Nicol 11/a,Hotels,Mableton,US,45.421611,12.376187,Hotel Russo Palace,30126,GA,2015-03-05T00:00:00Z,2016-10-24T00:00:25Z,,,5.0,We stayed here for four nights in October. The...,������ ���������������,,sungchul,


In [4]:
# data cleaning
data['reviews.text'] = data['reviews.text'].str.lower()
data['reviews.text'] = data['reviews.text'].str.replace(r'\.|\!|\?|\'|,|-|\(|\)', "",)
data['reviews.text'] = data['reviews.text'].fillna('')

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(max_features=5000) # max features is how many words we want to allow us to create columns for

In [6]:
# vectorize reviews to transform sentences into volumns
X = vectorizer.fit_transform(data['reviews.text'])

bag_of_words = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names()) # convert to dataframe

In [7]:
data.rename(columns={'address': 'hotel_address', 'city': 'hotel_city',
                     'country':'hotel_country', 'name':'hotel_name'},
            inplace=True)

new_df = data.join(bag_of_words) # add bag of words to original df

In [8]:
X = bag_of_words # X is the words
Y_hotel = new_df['hotel_name'] # Y is the hotel name

In [9]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

clf.fit(X, Y_hotel)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [10]:
# create a random hotel review
test_review = ['''
    I loved the beach and the sunshine and the clean and modern room.
    ''']

X_test = vectorizer.transform(test_review).toarray() # convert review into a vector

In [28]:
prediction = clf.predict(X_test)[0]

In [29]:
prediction

'The Alexandrian, Autograph Collection'

In [31]:
data[data['hotel_name'] == prediction][['hotel_name', 'hotel_address', 
                                        'hotel_city', 'hotel_country']]

Unnamed: 0,hotel_name,hotel_address,hotel_city,hotel_country
4744,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
4745,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
4746,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
4747,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
4748,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
4749,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
4750,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
4751,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
4752,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
4753,"The Alexandrian, Autograph Collection",480 King St,Alexandria,US
