# NLP analysis of Google reviews

In [None]:
import pandas as pd
from nltk.tokenize import word_tokenize
import string
import matplotlib.pyplot as plt
import re
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import sklearn.metrics
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

## Import and clean df

In [None]:
# Load the merged reviews export (path is relative to the notebook's working directory).
merged_df = pd.read_csv("../raw_data/merged_reviews_5_!2.csv")
merged_df.head(1)  # preview the first row

In [None]:
# Drop three columns that the rest of this notebook never reads.
# 'Unnamed: 0' is presumably a leftover index column from an earlier to_csv -- TODO confirm.
cleaned_merged_df = merged_df.drop(columns=['Unnamed: 0', 'review_count', 'rating'])
cleaned_merged_df.head(1)  # preview the first row after the drop

In [None]:
# List the remaining column names.
cleaned_merged_df.columns

In [None]:
# (rows, columns) of the cleaned frame.
cleaned_merged_df.shape

### proportion of ratings

In [None]:
def review_proportions(df):
    """Return the percentage of reviews per rating.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``comment_ratings`` column.

    Returns
    -------
    pandas.DataFrame
        A single ``proportion`` column of percentages rounded to whole
        numbers, indexed by rating (index name ``ratings``) and sorted from
        the highest rating to the lowest.
    """
    counts = df.comment_ratings.value_counts()
    percentages = counts / df.shape[0] * 100

    # Name the column explicitly instead of renaming whatever name
    # value_counts() gave the Series: that name is 'comment_ratings' on
    # pandas 1.x but 'count' on pandas 2.x, where a rename keyed on the old
    # name silently does nothing and `.proportion` then fails.
    rating_prop_df = percentages.to_frame(name="proportion").sort_index(ascending=False)

    rating_prop_df.index.names = ["ratings"]  # rename index

    rating_prop_df["proportion"] = rating_prop_df["proportion"].map(round)  # round % figures

    return rating_prop_df

In [None]:
# Rating distribution (in %) over all reviews.
review_proportions(cleaned_merged_df)

### How many reviews were originally in Portuguese?

In [None]:
# Count the reviews whose text contains " pelo", the substring this notebook
# uses as a heuristic marker for Portuguese text.
# str.contains(..., regex=False) is a plain substring test; na=False counts a
# missing comment as "no match" instead of raising on a non-string value.
cleaned_merged_df.comment_comments.str.contains(" pelo", regex=False, na=False).sum()

In [None]:
# Flag column: 1 when the comment contains " pelo" (heuristic marker for
# Portuguese text), 0 otherwise. Missing comments are flagged 0.
cleaned_merged_df['in_portuguese?'] = (
    cleaned_merged_df.comment_comments
    .str.contains(" pelo", regex=False, na=False)
    .astype(int)
)

### Proportion of ratings without the translated Portuguese reviews

In [None]:
# Reviews not flagged as Portuguese. Take an explicit copy: new columns are
# assigned to this frame further down, which on a filtered view triggers
# pandas' SettingWithCopyWarning.
english_reviews_df = cleaned_merged_df[cleaned_merged_df['in_portuguese?'] == 0].copy()

In [None]:
# Rating distribution (in %) for the English-only reviews.
review_proportions(english_reviews_df) # author's observation: proportions almost the same as for the frame that includes the translated Portuguese reviews

### How many reviews are incomplete?

In [None]:
# Count the reviews whose text contains the "…More" marker, i.e. reviews this
# notebook treats as unfinished.
# na=False counts a missing comment as "no match" instead of raising.
cleaned_merged_df.comment_comments.str.contains("…More", regex=False, na=False).sum()

In [None]:
# Flag column: 1 when the comment contains the "…More" marker, 0 otherwise.
# Missing comments are flagged 0.
cleaned_merged_df['unfinished?'] = (
    cleaned_merged_df.comment_comments
    .str.contains("…More", regex=False, na=False)
    .astype(int)
)

In [None]:
# Preview the frame with the two new flag columns.
cleaned_merged_df.head()

### splitting reviews by complete/incomplete

In [None]:
# Split on the 'unfinished?' flag. Explicit copies are taken because a column
# is assigned to complete_reviews_df later, which on a filtered view triggers
# pandas' SettingWithCopyWarning.
complete_reviews_df = cleaned_merged_df[cleaned_merged_df['unfinished?'] == 0].copy()
incomplete_reviews_df = cleaned_merged_df[cleaned_merged_df['unfinished?'] == 1].copy()

In [None]:
review_proportions(complete_reviews_df) # rating distribution (in %) for the complete reviews only

In [None]:
# (rows, columns) of the complete-reviews frame.
complete_reviews_df.shape

In [None]:
# Print the comments shorter than 100 characters among the first 100
# incomplete reviews, each followed by a blank line.
# head(100) stops at the end of the frame, whereas indexing iloc over a fixed
# range(100) raises IndexError when fewer than 100 incomplete reviews exist.
for comment in incomplete_reviews_df.comment_comments.head(100):
    if len(comment) < 100:
        print(comment)
        print()
            
## most reviews, even the short incomplete ones, appear to give useful information

## NLP preproc

In [None]:
# Translation table mapping every ASCII punctuation character to a space.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def clean(text, list_form=False):
    """Clean a piece of text for NLP.

    Punctuation is replaced by spaces, the text is lower-cased and tokenized
    with NLTK's ``word_tokenize``, and only purely alphabetic tokens are kept
    (which drops numbers). Stop words are kept on purpose: the original author
    noted better results when not removing them.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example the NaN pandas uses for a
        missing comment) is treated as empty text instead of raising.
    list_form : bool, default False
        When true, return the list of tokens; otherwise return the tokens
        joined by single spaces.

    Returns
    -------
    list of str or str
        The cleaned tokens, as a list or as one space-joined string.
    """
    if isinstance(text, str):
        # One pass over the string instead of one replace() per punctuation mark.
        lowercased = text.translate(_PUNCTUATION_TO_SPACE).lower()
        tokenized = word_tokenize(lowercased)
        words_only = [word for word in tokenized if word.isalpha()]  # remove numbers
    else:
        words_only = []

    if list_form:
        return words_only
    return " ".join(words_only)

In [None]:
# Store the cleaned text (space-joined tokens, clean's default) next to the raw comment.
cleaned_merged_df['clean_comment'] = cleaned_merged_df['comment_comments'].apply(clean)

In [None]:
# (rows, columns) after adding 'clean_comment'.
cleaned_merged_df.shape

In [None]:
# Raw text of the last review in the frame.
cleaned_merged_df.tail(1).comment_comments

In [None]:
# Same cleaning for the English-only frame.
# NOTE(review): english_reviews_df was produced by boolean filtering; assigning a
# column to it can emit SettingWithCopyWarning unless that frame is an explicit copy.
english_reviews_df['clean_comment'] = english_reviews_df['comment_comments'].apply(clean)

In [None]:
# Same cleaning for the complete-reviews frame.
# NOTE(review): complete_reviews_df was produced by boolean filtering; assigning a
# column to it can emit SettingWithCopyWarning unless that frame is an explicit copy.
complete_reviews_df['clean_comment'] = complete_reviews_df['comment_comments'].apply(clean)

## LIME

In [None]:
# Binary target: 1 when the rating is at least 4.0 ("good"), 0 otherwise.
cleaned_merged_df['good_bad_review'] = (cleaned_merged_df.comment_ratings >= 4.0).astype('int64')
# Keep only the model input (cleaned text) and the target.
reviews_model_df = cleaned_merged_df[['clean_comment', 'good_bad_review']]
reviews_model_df.shape

In [None]:
# Binary target for the English-only reviews: 1 when the rating is at least 4.0, else 0.
english_reviews_df['good_bad_review'] = (english_reviews_df.comment_ratings >= 4.0).astype('int64')
# Keep only the model input (cleaned text) and the target.
en_reviews_model_df = english_reviews_df[['clean_comment', 'good_bad_review']]
en_reviews_model_df.shape

In [None]:
# Binary target for the complete reviews: 1 when the rating is at least 4.0, else 0.
complete_reviews_df['good_bad_review'] = (complete_reviews_df.comment_ratings >= 4.0).astype('int64')
# Keep only the model input (cleaned text) and the target.
complete_reviews_model_df = complete_reviews_df[['clean_comment', 'good_bad_review']]
complete_reviews_model_df.shape

In [None]:
# Default TF-IDF + logistic-regression pipeline; in this notebook it is only
# used to inspect the available parameter names via get_params().
pipe = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])

In [None]:
# Show every tunable parameter name (e.g. 'tfidf__ngram_range', 'logreg__C').
pipe.get_params()

In [None]:
# Create Pipeline
# TF-IDF features feeding a logistic-regression classifier
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('logreg', LogisticRegression()),
])

# Hyper-parameter grid: unigrams vs. bigrams, and the logistic regression's C
parameters = {
    'tfidf__ngram_range': [(1, 1), (2, 2)],
    'logreg__C': [0.01, 1, 10, 100, 1000],
}

# 5-fold cross-validated grid search, scored on F1
grid_search = GridSearchCV(
    pipeline,
    parameters,
    scoring="f1",
    cv=5,
    n_jobs=-1,
    verbose=1,
)

# Search on the English-only reviews
grid_search.fit(
    en_reviews_model_df['clean_comment'],
    en_reviews_model_df['good_bad_review'],
)

In [None]:
# Parameter combination with the best mean cross-validated F1.
grid_search.best_params_

In [None]:
# Mean cross-validated F1 of that best combination.
grid_search.best_score_

In [None]:
import sklearn.metrics as metrics

# Shuffle the English-only modelling frame. The shuffle is seeded so that the
# split below, and therefore the reported score, is reproducible between runs.
# NOTE: this rebinds reviews_model_df, which earlier held the all-reviews frame.
reviews_model_df = en_reviews_model_df.sample(frac=1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    reviews_model_df['clean_comment'],
    reviews_model_df['good_bad_review'],
    random_state=42,
)


# create bag-of-words with weights using the tf-idf vectoriser
# accents are stripped to ASCII and unigrams + bigrams are used;
# stop words are NOT removed here (no stop_words argument is passed)
tf = TfidfVectorizer(strip_accents='ascii', ngram_range=(1, 2))

# fit the vectoriser on the training set and transform it
X_train_tf = tf.fit_transform(X_train)
# transform the test set with the already-fitted vectoriser
X_test_tf = tf.transform(X_test)


# create logistic regression model
logreg = LogisticRegression(verbose=0, random_state=42, penalty='l2', solver='newton-cg', C=10)
# train model on vectorised training data
model = logreg.fit(X_train_tf, y_train)
# evaluate model performance on the test set (weighted F1)
pred = model.predict(X_test_tf)
metrics.f1_score(y_test, pred, average='weighted')

In [None]:
!pip install lime

In [None]:
# The __future__ import is placed first so this cell is also valid when run as
# a plain Python script, where a __future__ import after other statements is a
# SyntaxError.
from __future__ import print_function

# importing the libraries
import lime
import sklearn.ensemble
from lime import lime_text
from sklearn.pipeline import make_pipeline
from lime.lime_text import LimeTextExplainer

# converting the vectoriser and model into a pipeline
# this is necessary as LIME needs a function from raw text to class probabilities
c = make_pipeline(tf, model)

# saving a list of strings version of the X_test object
ls_X_test = list(X_test)

# saving the class names in a dictionary (label -> name) to increase interpretability
class_names = {0: 'bad review', 1: 'good review'}

In [None]:
# create the LIME explainer
# LimeTextExplainer expects class_names as a list ordered by class index;
# passing the dict itself makes LIME iterate over its keys, so plots end up
# labelled "0"/"1" instead of the readable names.
LIME_explainer = LimeTextExplainer(class_names=[class_names[0], class_names[1]])

# an arbitrary, fixed test example (must be a valid position in the test set)
idx = 309
# explain the chosen prediction
# use the probability results of the logistic regression
# can also add num_features parameter to reduce the number of features explained
LIME_exp = LIME_explainer.explain_instance(ls_X_test[idx], c.predict_proba)
# print results
print('Document id: %d' % idx)
print('Review: ', ls_X_test[idx])
print('Probability good review =', c.predict_proba([ls_X_test[idx]]).round(3)[0,1])
# positional lookup on the Series avoids materialising the whole test target as a list
print('True class: %s' % class_names.get(y_test.iloc[idx]))

In [None]:
# print the label legend so the visualisation can be read unambiguously
print("1 = good review, 0 = bad review")
# render the explanation inline, with the review text highlighted (text=True)
LIME_exp.show_in_notebook(text=True)

## Tfidf

In [None]:
def remove_stop_words(text):
    """Return ``text`` with English stop words removed.

    The text is tokenized with NLTK's ``word_tokenize``; tokens found in
    NLTK's English stop-word list are dropped and the rest are joined with
    single spaces.

    NOTE(review): the original cell contained only the signature (a syntax
    error). This body mirrors ``remove_stopwords`` defined further down in the
    notebook -- confirm this is the intended behaviour.
    """
    english_stop_words = set(stopwords.words('english'))
    kept = [word for word in word_tokenize(text) if word not in english_stop_words]
    return " ".join(kept)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NLTK's English stop words as a set; used by Tfidf_fit and passed to make_word_cloud below.
stop_words = set(stopwords.words('english')) # Make stopword list

# Tuned TfidfVectorizer
def Tfidf_fit(series, ngram_range=(1, 1)):
    """Fit a TF-IDF vectorizer, with English stop words removed, on ``series``.

    Parameters
    ----------
    series : iterable of str
        Documents to fit on.
    ngram_range : tuple of (int, int), default (1, 1)
        N-gram range passed to ``TfidfVectorizer``. The original code looped
        over ``range(1, 4)`` but returned inside the first iteration, so only
        unigrams were ever fitted; that effective behaviour is the default.

    Returns
    -------
    TfidfVectorizer
        The fitted vectorizer.
    """
    # scikit-learn documents stop_words as 'english', a list or None, so the
    # module-level set is converted to a list before being passed.
    vec = TfidfVectorizer(ngram_range=ngram_range, stop_words=sorted(stop_words))
    return vec.fit(series)

def transform_create_list(series, top_n=20):
    """Return the terms with the highest summed TF-IDF weight in ``series``.

    Parameters
    ----------
    series : iterable of str
        Documents to analyse.
    top_n : int, default 20
        Number of (term, weight) pairs to return.

    Returns
    -------
    list of (str, float)
        Terms paired with their TF-IDF weight summed over all documents,
        sorted from the largest weight to the smallest.
    """
    # Fit once and reuse the vectorizer; the original fitted it a second time
    # just to read vocabulary_.
    vectorizer = Tfidf_fit(series)
    vectors = vectorizer.transform(series)  # documents x terms

    sum_tfidf = vectors.sum(axis=0)  # 1 x terms: total weight per term

    # vocabulary_ maps each term to its column index
    tfidf_list = [(word, sum_tfidf[0, idx]) for word, idx in vectorizer.vocabulary_.items()]

    sorted_tfidf_list = sorted(tfidf_list, key=lambda pair: pair[1], reverse=True)

    return sorted_tfidf_list[:top_n]

In [None]:
# The return value of this first call is not assigned or displayed;
# transform_create_list fits its own vectorizer.
Tfidf_fit(reviews_model_df.clean_comment)
# Top terms by summed TF-IDF weight.
transform_create_list(reviews_model_df.clean_comment)

### Word Cloud & N_Grams

In [None]:
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stop_words():
    """Load NLTK's English stop-word list once and reuse it on later calls."""
    return frozenset(stopwords.words('english'))


def remove_stopwords(text):
    """Return ``text`` with English stop words removed.

    The text is tokenized with NLTK's ``word_tokenize``; tokens found in
    NLTK's English stop-word list are dropped and the remaining tokens are
    joined with single spaces.
    """
    # The stop-word set is cached: this function is applied row by row, and
    # rebuilding the set on every call reloads the word list each time.
    stop_words = _english_stop_words()
    without_stopwords = [word for word in word_tokenize(text) if word not in stop_words]
    return " ".join(without_stopwords)

In [None]:
# Overwrite clean_comment in place with its stop-word-free version; every later
# cell that reads en_reviews_model_df.clean_comment sees the stripped text.
en_reviews_model_df.clean_comment = en_reviews_model_df.clean_comment.apply(remove_stopwords)

In [None]:
# Display the frame after stop-word removal.
en_reviews_model_df

In [None]:
# One long string containing every cleaned English review, separated by spaces.
text = " ".join(en_reviews_model_df.clean_comment.astype(str))

In [None]:
# Flat list of all tokens across the reviews (input for the n-gram counts and word cloud).
token_list = word_tokenize(text)

In [None]:
# Display the tokens.
token_list

In [None]:
!pip install wordcloud

In [None]:
 from collections import Counter
import matplotlib.pyplot as plt
import wordcloud

# Default figure size (width, height in inches) for the plots that follow.
plt.rcParams["figure.figsize"] = [16, 9]

def create_ngrams(token_list, nb_elements):
    """
    Build the n-grams of a token list.

    Parameters: token_list : list of strings, nb_elements : size n of each n-gram
    Returns: generator yielding every run of nb_elements consecutive tokens,
    joined by single spaces
    """
    # One copy of the list per position in the n-gram, each shifted one token
    # further; zipping them lines up consecutive tokens and stops at the
    # shortest copy.
    shifted_lists = [token_list[offset:] for offset in range(nb_elements)]
    windows = zip(*shifted_lists)
    return (" ".join(window) for window in windows)


def frequent_words(list_words, ngrams_number=1, number_top_words=10):
    """
    Return the most frequent words or n-grams in a list of tokens.

    Parameters: list_words : list of strings, ngrams_number : int, size of the
    n-grams to count (1 counts the tokens themselves), number_top_words : int,
    maximum number of entries returned
    Returns: list of (entity, count) tuples, most frequent first
    Raises: ValueError when ngrams_number is below 1
    """
    if ngrams_number == 1:
        entities = list_words
    elif ngrams_number >= 2:
        entities = create_ngrams(list_words, ngrams_number)
    else:
        raise ValueError("number of n-grams should be >= 1")
    return Counter(entities).most_common(number_top_words)


def make_word_cloud(text_or_counter, stop_words=None):
    """
    Draw a word cloud with matplotlib.

    Parameters: text_or_counter : either raw text (str), an iterable of
    tokens, or a mapping from word to frequency (e.g. a Counter),
    stop_words : optional collection of words to leave out
    Returns: None, the figure is shown with plt.show()
    """
    if isinstance(text_or_counter, str):
        word_cloud = wordcloud.WordCloud(stopwords=stop_words).generate(text_or_counter)
    else:
        # Counter() counts the items of a token iterable and copies the counts
        # of a mapping, so both input shapes become word -> frequency.
        frequencies = Counter(text_or_counter)
        if stop_words is not None:
            # Filter on the items so existing counts are kept. Rebuilding a
            # Counter from the keys alone would reset every count to 1.
            frequencies = Counter({
                word: count
                for word, count in frequencies.items()
                if word not in stop_words
            })
        word_cloud = wordcloud.WordCloud(stopwords=stop_words).generate_from_frequencies(frequencies)
    plt.imshow(word_cloud)
    plt.axis("off")
    plt.show()


In [None]:
# Word cloud of all tokens, with the English stop-word set left out.
make_word_cloud(token_list, stop_words=stop_words)

In [None]:
# 15 most frequent single words.
frequent_words(token_list, ngrams_number=1, number_top_words=15)

In [None]:
# 15 most frequent bigrams.
frequent_words(token_list, ngrams_number=2, number_top_words=15)

In [None]:
# 15 most frequent trigrams.
frequent_words(token_list, ngrams_number=3, number_top_words=15)

In [None]:
!pip install shap

In [None]:
import sklearn.metrics as metrics

# Shuffle the English-only modelling frame. The shuffle is seeded so that the
# split below, and therefore the reported score, is reproducible between runs.
reviews_model_df = en_reviews_model_df.sample(frac=1, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(
    reviews_model_df['clean_comment'],
    reviews_model_df['good_bad_review'],
    random_state=42,
)


# create bag-of-words with weights using the tf-idf vectoriser
# accents are stripped to ASCII and only unigrams are used;
# no stop_words argument is passed to the vectoriser itself
tf = TfidfVectorizer(strip_accents='ascii', ngram_range=(1, 1))

# fit the vectoriser on the training set and transform it
X_train_tf = tf.fit_transform(X_train)
# transform the test set with the already-fitted vectoriser
X_test_tf = tf.transform(X_test)


# create logistic regression model
logreg = LogisticRegression(verbose=0, random_state=42, penalty='l2', solver='newton-cg', C=10)
# train model on vectorised training data
model = logreg.fit(X_train_tf, y_train)
# evaluate model performance on the test set (weighted F1)
pred = model.predict(X_test_tf)
metrics.f1_score(y_test, pred, average='weighted')

In [37]:
# importing SHAP
import shap

# sampling data from the training and test set to reduce time-taken
X_train_sample = shap.sample(X_train_tf, 100)
X_test_sample = shap.sample(X_test_tf, 20)


def _good_review_probability(features):
    """Return the model's predicted probability of class 1 for each row."""
    return model.predict_proba(features)[:, 1]


# creating the KernelExplainer from a prediction FUNCTION and the training sample
# KernelExplainer needs a callable that maps a feature matrix to model outputs;
# the fitted estimator object itself is not callable. A single output column
# (probability of class 1) gives one array of SHAP values.
SHAP_explainer = shap.KernelExplainer(_good_review_probability, X_train_sample)
# calculating the shap values of the test sample using the explainer
shap_vals = SHAP_explainer.shap_values(X_test_sample)

# converting the test samples to a dataframe
# this is necessary for non-tabular data in order for the visualisations
# to include feature value
colour_test = pd.DataFrame(X_test_sample.todense())

NameError: name 'X_train_tf' is not defined

In [None]:
# get_feature_names() was removed from scikit-learn's vectorizers in 1.2;
# prefer get_feature_names_out() and fall back on releases that predate it.
if hasattr(tf, "get_feature_names_out"):
    feature_names = list(tf.get_feature_names_out())
else:
    feature_names = tf.get_feature_names()
shap.summary_plot(shap_vals, colour_test, feature_names=feature_names)