 ### &ensp;&ensp;&ensp;&ensp;&ensp;&ensp;&ensp;**Yelp Data Challenge**
 - &ensp;&ensp;**Restaurants Classification by sentiment analysis of user reviews**<br>

     &ensp;&ensp; By&ensp;&ensp;Manoj Vaidya<br>



In [None]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word

In [None]:
#READ yelp_academic_dataset_business DATASET
# Loads the business table from a CSV in the working directory, prints its
# (rows, columns) shape and previews the first rows with columns shown as rows.

business = pd.read_csv("yelp_academic_dataset_business.csv")
print(business.shape)
business.head().transpose()

In [None]:
# Remove businesses that are closed, then discard every column the analysis
# does not use (is_open carries no information once closed rows are gone).
business.drop(business[business.is_open == 0].index, inplace=True)
business.drop(
    ['neighborhood', 'postal_code', 'attributes', 'hours', 'is_open'],
    axis=1,
    inplace=True,
)
business.head().transpose()

In [None]:
# Count the missing values in each remaining column.
business.isnull().sum()

In [None]:
# A business is only usable when both its categories and its city are known.
business = business.dropna(subset=['categories', 'city'])

In [None]:
#Fill null address as not_known
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection, which may act on a temporary copy and leave the frame unchanged.
business['address'] = business['address'].fillna("not_known")

#Fill latitude and longitude with city means
# Only the coordinate column itself is grouped and transformed, so the result
# is a Series aligned with the frame's index. Filling is a no-op for columns
# without gaps, so no "has nulls" check is needed. A city whose values are all
# missing keeps NaN because its mean is undefined.
for coordinate in ("latitude", "longitude"):
    business[coordinate] = business.groupby("city")[coordinate].transform(
        lambda values: values.fillna(values.mean())
    )

print(business.isnull().sum())
business.head(10).transpose()

In [None]:
#Categories other than restaurants are not necessary and can be dropped
# (rows whose 'categories' string does not contain 'Restaurants' are removed in place)

business.drop(business[~business['categories'].str.contains('Restaurants')].index, inplace = True)
print(business.shape)
business.head().transpose()

In [None]:
#READ yelp_academic_dataset_review DATASET
# Loads the review table from a CSV in the working directory, prints its
# shape and previews the first rows.

review = pd.read_csv("yelp_academic_dataset_review.csv")
print(review.shape)
review.head()

In [None]:
#Null reviews can be dropped
# dropna() without arguments removes every row that has a missing value in
# any column; the counts are shown before and after to confirm the effect.

print(review.isnull().sum())
review=review.dropna()
review.isnull().sum()

In [None]:
#Extracting day,month and year from date

# Unparseable dates are allowed to raise here. With errors='ignore' a failed
# parse silently hands back the original strings, and the .dt accessors below
# then fail with a much less helpful AttributeError.
review['date'] = pd.to_datetime(review.date, format='%Y-%m-%d')
review['day'] = review['date'].dt.day
review['month'] = review['date'].dt.month
review['year'] = review['date'].dt.year
# The split parts replace the original column.
review.drop('date', axis = 1, inplace = True)
review.head().transpose()

In [None]:
#Rename stars to avoid conflict with other dataset
# (index=str additionally converts the row labels to strings)

review = review.rename(index=str,columns={'stars':'review_stars'})

In [None]:
##MERGE BUSINESS AND REVIEW DATA
# Left join on business_id: every business row is kept, and a business with
# no matching review gets missing values in the review columns.

business_reviews=business.merge(review, left_on='business_id', right_on='business_id', how='left')

In [None]:
# Data Reduction (To avoid memory overhead during text processing)

# The merged frame is named business_reviews. Businesses without any review
# come out of the left merge with a missing 'text'; they are dropped because
# the text features computed afterwards call string methods on every value.
business_review_red = business_reviews.dropna(subset=['text'])
# Keep a reproducible 60% sample and renumber the rows from zero.
business_review_red = business_review_red.sample(frac=0.60, random_state=99)
business_review_red = business_review_red.reset_index(drop=True)

In [None]:
#word count
# Splits on single spaces, so consecutive spaces produce empty strings that
# are counted as words too.
business_review_red['word_count(review)'] = business_review_red['text'].apply(lambda x: len(str(x).split(" ")))

#char_count
business_review_red['char_count(review)'] = business_review_red['text'].str.len() ## this also includes spaces

#average word length
def avg_word(sentence):
    """Return the mean length of the whitespace-separated words in *sentence*.

    Returns 0.0 when the sentence contains no words (empty or whitespace
    only) instead of raising ZeroDivisionError.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

business_review_red['avg_word_len(review)'] = business_review_red['text'].apply(avg_word)

#stopwords
# A set gives constant-time membership tests; the stopword list would be
# scanned from the start for every single word of every review.
stop = set(stopwords.words('english'))
business_review_red['stopwords(review)'] = business_review_red['text'].apply(
    lambda text: sum(1 for word in text.split() if word in stop)
)


business_review_red[['text','word_count(review)','char_count(review)','avg_word_len(review)','stopwords(review)']].head()

In [None]:
#uppercase to lowercase
# Splitting and re-joining also collapses every run of whitespace to one space.

business_review_red['text'] = business_review_red['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
print("Converted.....")


#removing punctuation
# regex=True is required: recent pandas treats the pattern as a literal string
# by default, which would leave all punctuation in place. The raw string keeps
# the backslash escapes intact. Word characters, whitespace, '-' and '+' stay.

business_review_red['text'] = business_review_red['text'].str.replace(r'[^\w\s\-\+\-]', '', regex=True)
print("Removed punctuations....")


#remove stop_words
# A set makes each membership test constant-time instead of a list scan.

stop = set(stopwords.words('english'))
business_review_red['text'] = business_review_red['text'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
print("Removed stop words.....")


#spelling correction
# NOTE(review): the corrected text of these first five rows is computed but
# never stored, so the column itself is left unchanged - confirm whether this
# was only meant as a demonstration.

business_review_red['text'][:5].apply(lambda x: str(TextBlob(x).correct()))
print("Spellings corrected.....")


business_review_red['text'].head()

In [None]:
#Lemmatization
# Each whitespace-separated word is lemmatized and the words are re-joined.

print("Performing Lemmatization.....")
business_review_red['text'] = business_review_red['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
print("Lemmatization Done......")



#Tokenization
# The tokenizer is applied to the text column directly; a row-wise
# DataFrame.apply would build a Series for every row just to read one field.

print("Performing Tokenization.......")
business_review_red['text'] = business_review_red['text'].apply(nltk.word_tokenize)
print("tokenizaton done.......")
business_review_red['text'].head()

In [None]:
#Rename text to review to avoid conflicts
# (index=str additionally converts the row labels to strings)

business_review_red=business_review_red.rename(index=str,columns={'text':'review'})

In [None]:
#READ yelp_academic_dataset_tip DATASET
# Loads the tip table from a CSV in the working directory, prints its shape
# and shows the number of missing values per column.

tip = pd.read_csv("yelp_academic_dataset_tip.csv")
print(tip.shape)
tip.isnull().sum()

In [None]:
#Drop null values and date attribute
# dropna() removes every row with a missing value in any column.

tip=tip.dropna()
tip.drop('date', axis = 1, inplace = True)
tip.head().transpose()

In [None]:
#Merge business and tip  using business_id as key
# Left join: businesses without any tip are kept with missing tip columns,
# which the null counts printed below make visible.

business_tip=business.merge(tip, left_on=['business_id'], right_on='business_id', how='left')
print(business_tip.shape)
business_tip.isnull().sum()

In [None]:
#Drop rows with no tips
# dropna() is called without a subset, so a row is removed when ANY of its
# columns is missing, not only the tip columns.

business_tip=business_tip.dropna()
business_tip.isnull().sum()

In [None]:
#word count
# Splits on single spaces, so consecutive spaces produce empty strings that
# are counted as words too.
business_tip['word_count(tip)'] = business_tip['text'].apply(lambda x: len(str(x).split(" ")))

#char_count
business_tip['char_count(tip)'] = business_tip['text'].str.len() ## this also includes spaces

#average word length
def avg_word(sentence):
    """Return the mean length of the whitespace-separated words in *sentence*.

    Returns 0.0 when the sentence contains no words. The empty case is
    handled explicitly rather than by padding the denominator, so the
    average of a non-empty sentence is exact.
    """
    words = sentence.split()
    if not words:
        return 0.0
    return sum(len(word) for word in words) / len(words)

business_tip['avg_word_len(tip)'] = business_tip['text'].apply(avg_word)


#stopwords
# A set gives constant-time membership tests; the stopword list would be
# scanned from the start for every single word of every tip.
stop = set(stopwords.words('english'))

business_tip['stopwords(tip)'] = business_tip['text'].apply(
    lambda text: sum(1 for word in text.split() if word in stop)
)
business_tip[['text','word_count(tip)','char_count(tip)','avg_word_len(tip)','stopwords(tip)']].head()

In [None]:
#uppercase to lowercase
# Splitting and re-joining also collapses every run of whitespace to one space.

business_tip['text'] = business_tip['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
print("Converted.....")


#removing punctuation
# regex=True is required: recent pandas treats the pattern as a literal string
# by default, which would leave all punctuation in place. The raw string keeps
# the backslash escapes intact. Word characters, whitespace, '-' and '+' stay.

business_tip['text'] = business_tip['text'].str.replace(r'[^\w\s\-\+\-]', '', regex=True)
print("Punctuations removed....")


#remove stop_words
# A set makes each membership test constant-time instead of a list scan.

stop = set(stopwords.words('english'))
business_tip['text'] = business_tip['text'].apply(lambda x: " ".join(word for word in x.split() if word not in stop))
print("Stop words removed....")


#spelling correction
# NOTE(review): the corrected text of these first five rows is computed but
# never stored, so the column itself is left unchanged - confirm whether this
# was only meant as a demonstration.

business_tip['text'][:5].apply(lambda x: str(TextBlob(x).correct()))
print("Spellings corrected....")


business_tip['text'].head()

In [None]:
#Lemmatization
# Each whitespace-separated word is lemmatized and the words are re-joined.

print("Performing lemmatization....")
business_tip['text'] = business_tip['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
print("Lemmatization done........")


#Tokenization
# The tokenizer is applied to the text column directly; a row-wise
# DataFrame.apply would build a Series for every row just to read one field.

print("Performing tokenization.....")
business_tip['text'] = business_tip['text'].apply(nltk.word_tokenize)
print("Tokenization done....")
business_tip['text'].head()

In [None]:
#Rename text attribute to tip
# (index=str additionally converts the row labels to strings)

business_tip=business_tip.rename(index=str,columns={'text':'tip'})
# The processed reviews get their own file. Writing them to
# "business_tip.csv" would be overwritten as soon as the tip frame is saved
# under that same name.
business_review_red.to_csv("business_review_red.csv",index=False)

In [None]:
# Persist the processed business/tip frame without the index column.
business_tip.to_csv("business_tip.csv",index=False)

In [None]:
## Action: 
## Loading dependencies

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib.colors import ListedColormap
import seaborn as sns
import re
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn import svm
from sklearn import tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score, accuracy_score, log_loss
import nltk
from nltk import pos_tag, pos_tag_sents
nltk.download('wordnet')
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.corpus import sentiwordnet as swn
import tensorflow as tf
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk.corpus import sentiwordnet as swn
from nltk import sent_tokenize, word_tokenize, pos_tag
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from textblob import TextBlob
from textblob import Word
import nltk
import xgboost as xgb
from xgboost.sklearn import XGBClassifier
from sklearn import decomposition, grid_search

In [None]:
# Reload the review data from a previously saved CSV.
# NOTE(review): no cell visible here writes a file with this name - confirm
# where "business_review_red(lat).csv" comes from.
business_reviews = pd.read_csv("business_review_red(lat).csv")

In [None]:
## Sentiment pipeline implemented by the functions defined in this cell:
## 1. Clean the text data in review dataset
## 2. Tokenize all text to sentences and then words
## 3. Tag all the words with part-of-speech tags
## 4. Convert all tags to wordnet tags
## 5. Lemmatize the words and create synsets
## 6. Compute sentiment score (positive score - negative score)
## 7. Return a sentiment polarity score: 1 = positive, 0 = negative

#nltk.download('punkt')
 
# One shared lemmatizer instance, reused for every word that is scored.
lemmatizer = WordNetLemmatizer()
  
def convert_tag(tag):
    """Translate a part-of-speech tag into the matching WordNet POS constant.

    Tags starting with 'J', 'N', 'R' or 'V' map to ``wn.ADJ``, ``wn.NOUN``,
    ``wn.ADV`` and ``wn.VERB`` respectively. Any other tag, including an
    empty string, yields ``None``.
    """
    # The WordNet constant is looked up by name only when a prefix matches.
    prefix_to_attr = {'J': 'ADJ', 'N': 'NOUN', 'R': 'ADV', 'V': 'VERB'}
    attr_name = prefix_to_attr.get(tag[:1])
    return getattr(wn, attr_name) if attr_name else None
 
def clean_text(text):
    """Strip HTML markup from *text*.

    Every literal ``<br />`` becomes a single space; any other ``<...>`` tag
    is removed without a replacement.
    """
    without_breaks = " ".join(text.split("<br />"))
    return re.sub('<[^>]*>', '', without_breaks)
 
def compute_swn_polarity_score(text):
    """Classify *text* as positive (1) or negative (0) with SentiWordNet.

    The text is cleaned, split into sentences and POS-tagged. For every
    noun, adjective or adverb that has at least one WordNet synset, the
    positive score minus the negative score of its first synset is added to a
    running total.

    Returns 1 when the total is greater than or equal to zero, which includes
    text where no word was scored, and 0 otherwise.
    """
    total = 0.0
    for sentence in sent_tokenize(clean_text(text)):
        for word, tag in pos_tag(word_tokenize(sentence)):
            wn_tag = convert_tag(tag)
            # Verbs and unmapped tags do not contribute to the score.
            if wn_tag not in (wn.NOUN, wn.ADJ, wn.ADV):
                continue
            lemma = lemmatizer.lemmatize(word, pos=wn_tag)
            synsets = wn.synsets(lemma, pos=wn_tag) if lemma else []
            if not synsets:
                continue
            # Only the first synset returned for the lemma is consulted.
            senti = swn.senti_synset(synsets[0].name())
            total += senti.pos_score() - senti.neg_score()
    return 1 if total >= 0 else 0

In [None]:
## Create a new column 'polarity' with sentiment polarity as 1 (positive) or 0 (negative)
# The scorer is applied to each value of the 'text' column.
business_reviews['polarity'] = business_reviews['text'].apply(compute_swn_polarity_score)
business_reviews.head()

In [None]:
#calculate the net positive sentiment score: net_positive_sentiment_score = sum(polarity) / total_count_of_reviews

def transform_review(business_reviews1):
    """Aggregate review rows into one row per business.

    Numeric columns are averaged per ``business_id``. The mean ``stars`` and
    mean ``polarity`` are rounded to two decimals, and ``polarity`` is
    renamed to ``net_positive_sentiment_score``.

    Parameters:
        business_reviews1: DataFrame with at least ``business_id``, ``stars``
            and ``polarity`` columns.

    Returns:
        A new DataFrame with ``business_id`` as a regular column.
    """
    # numeric_only=True: a mean is undefined for text columns, and recent
    # pandas raises on them instead of silently leaving them out.
    per_business = (
        business_reviews1.groupby('business_id')
        .mean(numeric_only=True)
        .reset_index()
    )
    per_business[['stars', 'polarity']] = per_business[['stars', 'polarity']].round(2)
    return per_business.rename(columns={'polarity': 'net_positive_sentiment_score'})
# Aggregate the review frame that carries the 'polarity' column.
business_reviews1_f = transform_review(business_reviews)
business_reviews1_f.head()

In [None]:
# Drop the columns at positions 5 through 15 of the aggregated frame.
# NOTE(review): the selection is positional, so it silently targets other
# columns if the column order ever changes - confirm which columns are meant.
business_reviews1_f.drop(business_reviews1_f.columns[[5, 6,7, 8, 9, 10, 11, 12, 13, 14, 15]],axis=1,inplace=True)
business_reviews1_f.head()

In [None]:
# Load the tip data from a CSV in the working directory.
tip = pd.read_csv("tip_pre_proc.csv")


In [None]:
## Create a new column 'polarity' with sentiment polarity as 1 (positive) or 0 (negative)
# The scorer is applied to the 'text' column of the tip frame itself.

tip['polarity'] = tip['text'].apply(compute_swn_polarity_score)
tip['text'].head()

In [None]:
#calculate the net positive sentiment score: net_positive_sentiment_score = sum(polarity) / total_count_of_reviews

def transform_tip(tip):
    """Aggregate tip rows into one row per business.

    Numeric columns are averaged per ``business_id``. The mean ``polarity``
    is rounded to two decimals and renamed to
    ``net_positive_sentiment_score``.

    Parameters:
        tip: DataFrame with at least ``business_id`` and ``polarity`` columns.

    Returns:
        A new DataFrame with ``business_id`` as a regular column.
    """
    # numeric_only=True: a mean is undefined for text columns, and recent
    # pandas raises on them instead of silently leaving them out.
    tip1 = tip.groupby('business_id').mean(numeric_only=True).reset_index()
    tip1['polarity'] = tip1['polarity'].round(2)
    return tip1.rename(columns={'polarity': 'net_positive_sentiment_score'})

# Aggregate the tips per business and preview the resulting score column.
tip1 = transform_tip(tip)
tip1['net_positive_sentiment_score'].head()

In [None]:
## 1. Join business_review and tip1 dataframes

def merge_business_review_tip(tip1, business_review):
    """Left-join per-business tip scores onto the per-business review frame.

    The join key is ``business_id``. When both frames carry a
    ``net_positive_sentiment_score`` column, the suffixed results are renamed
    to ``net_positive_sentiment_score_review`` (from *business_review*) and
    ``net_positive_sentiment_score_tip`` (from *tip1*).
    """
    merged = business_review.merge(tip1, on='business_id', how='left')
    merged.rename(
        columns={
            'net_positive_sentiment_score_x': 'net_positive_sentiment_score_review',
            'net_positive_sentiment_score_y': 'net_positive_sentiment_score_tip',
        },
        inplace=True,
    )
    return merged

# Join the per-business tip scores onto the per-business review aggregates,
# using the merge helper defined in this cell.
business_review_tip = merge_business_review_tip(tip1, business_reviews1_f)

In [None]:
# Work on the merged review/tip frame and keep only complete rows.
data = business_review_tip
data = data.dropna()

In [None]:
# Correlation matrix of the float64/int64 columns; iloc[:, 1:] leaves out the
# first of them.
corr = data.select_dtypes(include = ['float64', 'int64']).iloc[:, 1:].corr()
# np.bool was only an alias of the builtin and was removed in NumPy 1.24.
mask = np.zeros_like(corr, dtype=bool)
# Hide the upper triangle (diagonal included) so each pair is shown once.
mask[np.triu_indices_from(mask)] = True
f, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(corr, mask=mask, vmax=1, annot=True,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
# Drop the column at position 7 in place.
# NOTE(review): positional selection - confirm which column this is meant to remove.
data.drop(data.columns[[7]], axis=1, inplace=True)

In [None]:
# Replace the in-memory frame with data loaded from a saved CSV.
data=pd.read_csv("business_review(senti_final2222).csv")

In [None]:
## Categorize all restaurants

def categorize(data):
    """Label one row by its ``stars`` value.

    Returns 'Excellent' for 3.8 stars or more, 'Poor' for 1.2 stars or less,
    and 'Average' for everything else.
    """
    stars = data['stars']
    if stars >= 3.8:
        return 'Excellent'
    return 'Poor' if stars <= 1.2 else 'Average'
    
# Label every row (axis=1 passes one row at a time to categorize).
data['restaurant_category'] = data.apply(categorize, axis=1)

data.head()

In [None]:
# Keep only the identifier, the target label and the single feature.
final_data = data[['business_id', 'restaurant_category','net_positive_sentiment_score']]
final_data.head()

In [None]:
# Replace the in-memory frame with data loaded from a saved CSV.
final_data=pd.read_csv("final_data.csv")

In [None]:
##Splitting into test and train

X = final_data[['net_positive_sentiment_score']]
y = final_data[['restaurant_category']].values.ravel()
# Split before scaling, with a fixed seed so the reported accuracies can be
# reproduced from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=99)
# Fit the scaler on the training rows only, so that no statistics of the
# held-out rows leak into the features the models are trained on.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)
# Keep a scaled version of the full feature matrix as well; a later cell
# predicts on X.
X = scaler.transform(X)

In [None]:
# Parallel lists filled by the model cells: a label and its accuracy.
classifiers = []
accuracies = []

In [None]:
# Decision tree limited to depth 5: fit on the training split, score on the
# test split, and record the accuracy for the comparison plot.
clf = tree.DecisionTreeClassifier(max_depth = 5) 
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy = accuracy_score(pred, y_test)
classifiers.append('DecisionTreeClassifier')
accuracies.append(accuracy)
print(round(accuracy, 4))

In [None]:
# classification_report is not among the notebook's imports, so bring it into
# scope here before its first use.
from sklearn.metrics import classification_report

print(classification_report(y_test, pred))

In [None]:
# RBF-kernel support vector classifier with C=10: fit, score on the test
# split, and record the accuracy for the comparison plot.
clf = svm.SVC(kernel = 'rbf', C = 10) 
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy = accuracy_score(pred, y_test)
classifiers.append('svm.SVC')
accuracies.append(accuracy)
print(round(accuracy, 4))

In [None]:
# Classification report for the predictions currently held in `pred`.
print(classification_report(y_test, pred))

In [None]:
# Gaussian naive Bayes with default settings: fit, score on the test split,
# and record the accuracy for the comparison plot.
clf = GaussianNB() 
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy = accuracy_score(pred, y_test)
classifiers.append('GaussianNB')
accuracies.append(accuracy)
print(round(accuracy, 4))

In [None]:
# Logistic regression with default settings: fit, score on the test split,
# and record the accuracy for the comparison plot.
clf = LogisticRegression()
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy = accuracy_score(pred, y_test)
classifiers.append('LogisticRegression')
accuracies.append(accuracy)
print(round(accuracy, 4))

In [None]:
# Classification report for the predictions currently held in `pred`.
print(classification_report(y_test, pred))

In [None]:
# k-nearest neighbours with the library's default settings: fit, score on the
# test split, and record the accuracy for the comparison plot.
clf = KNeighborsClassifier()
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
classifiers.append('KNN')
accuracy = accuracy_score(pred, y_test)
accuracies.append(accuracy)
print(round(accuracy, 4))

In [None]:
## Finding optimal number of neighbors
# Fit one model per k in 1..49 and plot the test accuracy against k.

l=list(range(1,50))
x=[5,10,15,20,25,30,35,40,45]
# Scores are collected in a plain list and converted once at the end;
# Series.append was removed in pandas 2.0 and copied the data on every call.
scores = []
for i in l:
    clf = KNeighborsClassifier(n_neighbors=i)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    scores.append(accuracy_score(pred, y_test))
a = pd.Series(scores, dtype=float)
plt.title('Find optimum number of neighbors')
plt.plot(l, a)
plt.xticks(x)

In [None]:
# k-nearest neighbours with k=25, recorded as the "optimized" KNN entry.
clf = KNeighborsClassifier(n_neighbors = 25)
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy = accuracy_score(pred, y_test)
classifiers.append('KNeighborsClassifier(OPTIMIZED)')
accuracies.append(accuracy)
print(round(accuracy, 4))

In [None]:
# k-nearest neighbours with k=27; the accuracy is only printed and is not
# added to the comparison lists (the two append calls are commented out).
clf = KNeighborsClassifier(n_neighbors = 27)
clf = clf.fit(X_train, y_train)
pred = clf.predict(X_test)
accuracy = accuracy_score(pred, y_test)
#classifiers.append('KNeighborsClassifier')
#accuracies.append(accuracy)
print(round(accuracy, 4))

In [None]:
## Decision Tree with GridSearchCV
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed in scikit-learn 0.20.
from sklearn import tree
from sklearn.model_selection import GridSearchCV
parameters = {'criterion':('gini', 'entropy'), 'max_depth':[16, 37, 58, 100, 15]}
dt = tree.DecisionTreeClassifier()
clf = GridSearchCV(dt, parameters)
clf.fit(X_train, y_train)
# predict() uses the best parameter combination found by the search.
pred = clf.predict(X_test)
accuracy = accuracy_score(pred, y_test)
print(round(accuracy, 4))


In [None]:

# Grid Search - Used to find best combination of parameters
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module was removed in scikit-learn 0.20.
from sklearn.model_selection import GridSearchCV

XGB_model = xgb.XGBClassifier(objective='multi:softprob',subsample=0.5, colsample_bytree=0.5, seed=0)

param_grid = {'max_depth': [10,20,30,40,50,60,70,80,90], 'learning_rate': [0.1, 0.3], 'n_estimators': [25, 50]}
# The iid argument is not passed: it was removed from GridSearchCV in
# scikit-learn 0.24.
model = GridSearchCV(estimator=XGB_model, param_grid=param_grid, scoring='accuracy', verbose=1, n_jobs=1, refit=True, cv=3)

#model.fit(X, y)
model.fit(X_train, y_train)
print("Best score: %0.3f" % model.best_score_)
print("Best parameters set:")
best_parameters = model.best_estimator_.get_params()
for param_name in sorted(param_grid.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
# NOTE(review): best_score_ is the mean cross-validated accuracy on the
# training split, whereas the other entries in `accuracies` are measured on
# the held-out test split - confirm that the comparison is intended.
classifiers.append('DecisionTreeClassifier(XGboost)')
accuracies.append(model.best_score_)

In [None]:
# Scatter plot of the recorded accuracy for each classifier label.
plt.figure(figsize=(20,15))
sns.set_context("paper", font_scale=1.6)
plt.scatter(classifiers, accuracies, s=600, c="green", alpha=0.5)
plt.title('Accuracy vs Classifiers')
plt.show()

In [None]:
# Predict a category for every row of X with whichever estimator `clf`
# currently refers to, and store the result next to the true label.
# NOTE(review): X is derived from final_data while the column is written to
# `data`; both come from separate CSV files - confirm they have the same rows
# in the same order.
preds = clf.predict(X)
data['predicted_restaurant_category'] = preds
data.head()

In [None]:
# Count the rows in each actual and each predicted category, then draw the
# two distributions as side-by-side bar charts.
x = ['Poor', 'Average', 'Excellent']
y1 = len(data[data.restaurant_category == 'Poor'])
y2 = len(data[data.restaurant_category == 'Average'])
y3 = len(data[data.restaurant_category == 'Excellent'])
y1_pred = len(data[data.predicted_restaurant_category == 'Poor'])
y2_pred = len(data[data.predicted_restaurant_category == 'Average'])
y3_pred = len(data[data.predicted_restaurant_category == 'Excellent'])
y = [y1, y2, y3]
y_pred = [y1_pred, y2_pred, y3_pred]

plt.figure(figsize=(12,4))
sns.set_context("paper", font_scale=1.3)
sns.set_style("whitegrid")
# Left panel: actual categories.
plt.subplot(1,2,1)
sns.barplot(x=x, y=y, data=data, alpha = 0.7)
# Right panel: predicted categories.
plt.subplot(1,2,2)
sns.barplot(x=x, y=y_pred, data=data, alpha = 0.7)