Natural Language Processing
Review Data from Airbnb

In [1]:
#Read in libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import nltk
import sklearn

In [2]:
#Notebook-wide settings
#Silence every warning for the rest of the session
import warnings
warnings.simplefilter('ignore')

#Let Pandas display up to 500 columns and 100 rows before truncating
pd.options.display.max_columns = 500
pd.options.display.max_rows = 100

In [3]:
#Set path to data
#The folder can be overridden with the AIRBNB_DATA_DIR environment variable so the
#notebook is not tied to one machine; when it is unset the original local folder is used
import os

path = os.environ.get(
    'AIRBNB_DATA_DIR',
    r'C:\Users\kishe\Documents\Data Science\Projects\Python Projects\In Progress\Air BnB - SF\Data\03_Processed')

#Read in data
#os.path.join inserts the separator for the current OS instead of a hard-coded '/'
#The 'date' column is parsed into datetimes while loading
reviews = pd.read_csv(os.path.join(path, '01_08_2020_Reviews_Processed_Text_Analysis.csv'), sep=',',
                 parse_dates=['date'])

In [4]:
#Preview the first five reviews, transposed so each column reads as a row
reviews.head().transpose()

Unnamed: 0,0,1,2,3,4
comments,Large place with incredible views on a hill ne...,My boyfriend and I stayed here for 2 nights in...,This is nice place But I was hard to go up the...,Really great value for San Francisco It can be...,Really handy little space in a great location
date,2019-10-12 00:00:00,2019-06-23 00:00:00,2019-04-28 00:00:00,2019-06-11 00:00:00,2019-04-22 00:00:00
id_review,545636224,475163086,445040857,468132674,442109651
listing_id,631635,32549667,2770268,8685898,585225
reviewer_id,31409388,94404578,87489570,52332377,59626298
reviewer_name,Kelly,Natalie,石川,Natasha,Kate
score,95,88,94,88,95
comments_without_stopwords,Large place incredible views hill near one vib...,My boyfriend I stayed 2 nights order attend we...,This nice place But I hard go hill Theyre kind...,Really great value San Francisco It hard find ...,Really handy little space great location
score_labels,Poor,Poor,Poor,Poor,Poor


Split data into training and test data

In [5]:
#Pull the stop-word-free comments (X) and the score labels (y) out as arrays
X, y = (reviews[column].values
        for column in ('comments_without_stopwords', 'score_labels'))

#Check: both arrays should have the same length
for array in (X, y):
    print(array.shape)

(34923,)
(34923,)


In [6]:
#Import train_test_split
from sklearn.model_selection import train_test_split

#Split data; random_state is fixed so the split is reproducible
splits = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = splits

#Check that each feature array lines up with its label array
for message, data, labels in (
    ('Training data and corresponding labels: ', X_train, y_train),
    ('Test data and corresponding labels: ', X_test, y_test),
):
    print(message, data.shape, labels.shape)

Training data and corresponding labels:  (26192,) (26192,)
Test data and corresponding labels:  (8731,) (8731,)


Tokenize comments_without_stopwords

In [7]:
#Import CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

#Instantiate CountVectorizer and learn the vocabulary from the training comments only
vect = CountVectorizer()
vect.fit(X_train)

#Transform the training comments into a sparse token-count matrix
X_train_transformed = vect.transform(X_train)

#Print matrix dimensions
print(f"X_train:\n{X_train_transformed!r}")


X_train:
<26192x19844 sparse matrix of type '<class 'numpy.int64'>'
	with 600568 stored elements in Compressed Sparse Row format>


In [8]:
#Capture information about features
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
#so prefer get_feature_names_out() and fall back only on older releases
if hasattr(vect, 'get_feature_names_out'):
    #get_feature_names_out() returns an ndarray; convert it to a list so the
    #slices below print in the same format as before
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()

#Vocabulary size, then two samples of it: the first 25 tokens and every 500th token
print('Number of features: {:,} '.format(len(feature_names)))
print('\nFirst 25 features: {}'.format(feature_names[:25]))
print('\nEvery 500th features: {}'.format(feature_names[::500]))

Number of features: 19,844 

First 25 features: ['02', '02h00', '03', '04', '05', '07', '0700', '08', '10', '100', '1000', '100000x', '1000s', '1000x', '100100', '100150mbs', '1005', '100bucks', '100ft', '100m', '100night', '100s', '101', '1010', '1011pm']

Every 500th features: ['02', '650', 'aka', 'appropriately', 'barefooted', 'blueground', 'cafefully', 'chest', 'committing', 'coveted', 'deleila', 'door', 'emergencies', 'extravagantly', 'foods', 'genuilely', 'haleys', 'horario', 'ingleside', 'jesper', 'kristine', 'lizettes', 'marilyn', 'miriam', 'neighbor', 'olivers', 'pathway', 'poncitio', 'quckly', 'relevant', 'rooftops', 'seethrough', 'sitextremely', 'sponge', 'sucks', 'teresas', 'trader', 'unmet', 'vouliagmeni', 'withlisa']


Before we do any additional feature engineering, let's evaluate how well a default Logistic Regression performs on the features as they are

In [13]:
#Import models
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

#Baseline: default LogisticRegression scored with 5-fold cross-validation
#on the token-count matrix
baseline_model = LogisticRegression()
scores = cross_val_score(baseline_model, X_train_transformed, y_train, cv=5)

#Report the mean score across the five folds
mean_score = np.mean(scores)
print('Accuracy: {}'.format(mean_score))



Accuracy: 0.41982273097634987


About 42% accuracy is not very good at all, so let's do a little more work with our training data and see if we can improve model performance

Feature reduction
Keep only tokens that appear in at least 5 documents (min_df = 5)

In [16]:
#Raise min_df: keep only tokens that appear in at least 5 training documents
#NOTE: this rebinds vect and X_train_transformed, replacing the unpruned versions
vect=CountVectorizer(min_df = 5).fit(X_train)
X_train_transformed = vect.transform(X_train)

#Capture information about features
#get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
#so prefer get_feature_names_out() and fall back only on older releases
if hasattr(vect, 'get_feature_names_out'):
    #Convert the returned ndarray to a list so feature_names keeps its original type
    feature_names = vect.get_feature_names_out().tolist()
else:
    feature_names = vect.get_feature_names()
print('Number of features: {:,} '.format(len(feature_names)))

Number of features: 5,777 


In [17]:
#Re-run the same 5-fold cross-validation on the current X_train_transformed
pruned_model = LogisticRegression()
scores = cross_val_score(pruned_model, X_train_transformed, y_train, cv=5)

#Report the mean score across the five folds
mean_score = np.mean(scores)
print('Accuracy: {}'.format(mean_score))

Accuracy: 0.40978145095578766


Naive implementation of SVR using a TF-IDF + TruncatedSVD pipeline

In [None]:
# #Normalize word count matrix
# from sklearn.feature_extraction.text import TfidfVectorizer 

# #reduce the dimensionality to retain the first N components which capture the major variance
# from sklearn.decomposition import TruncatedSVD 

# from sklearn.svm import SVR

# #Summon Pipeline
# from sklearn.pipeline import Pipeline

# #Instantiate pipeline
# pipeline = Pipeline(steps=[('tfidf', TfidfVectorizer()), 
#                            ('svd', TruncatedSVD(random_state=42)), 
#                            ('clf', SVR())])
# #Check
# print(pipeline)

Pipeline Optimization

In [None]:
# NOTE(review): this cell is disabled (fully commented out). Before re-enabling it, note
# that the 'clf' step it tunes is an SVR (a regressor) scored with 'r2', while y_train is
# built from the string-valued 'score_labels' column. Presumably the numeric 'score'
# column was the intended target here - TODO confirm.
# #Summon RandomizedSearchCV
# from sklearn.model_selection import RandomizedSearchCV

# #Set Param grid for RandomizedSearchCV to explore
# param_grid= {'tfidf__max_df':(.5,.75, 1.0),
#              'svd__n_components': (50, 100, 150, 200),
#              'clf__C':(.1,1,10)}

# #Instantiate model
# random_search = RandomizedSearchCV(estimator=pipeline,param_distributions=param_grid,
#                             verbose=10, n_jobs=-1, scoring = 'r2')
# #Score
# random_search.fit(X_train, y_train)

In [None]:
# #View average score
# print("Best score: {:.3f}".format(random_search.best_score_))
# print("Best parameters set:")
# best_parameters = random_search.best_estimator_.get_params()


In [None]:
#test using regression

# Machine Learning

Is there value in capturing the numeric counts of amenities per listing? See below

In [None]:
# #Split strings on commas into features
# df.amenities=df.amenities.str.split(pat=',', expand = False)

# #Create amenities count and assign to df
# df['amenities_count'] = df['amenities'].str.len()