In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [2]:
# Load the review sample into a DataFrame. The path is relative, so the CSV is
# expected to sit in the notebook's working directory.
df = pd.read_csv('tripadvisor_co_uk-travel_restaurant_reviews_sample.csv')

In [3]:
# Display (n_rows, n_columns) of the freshly loaded frame as a quick sanity check.
df.shape

(19998, 17)

In [4]:
# Replace missing ratings with 0; rows rated 0 are treated as invalid and
# removed further down.
df = df.assign(rating=df['rating'].fillna(0))

In [5]:
# A few rows carry a date string in the rating column instead of a score;
# set them to 0 so they are handled like the other invalid ratings.
not_a_rating = df['rating'].isin(['April 2015', 'September 2015'])
df.loc[not_a_rating, 'rating'] = 0

In [6]:
# Keep only the leading character of each rating and turn it into a float score.
# pd.to_numeric(errors='coerce') maps any value that does not start with a digit
# to NaN (then 0.0) instead of raising ValueError like float(str(x)[0]) would,
# and plain column assignment guarantees the column ends up with a float dtype.
leading_char = df['rating'].astype(str).str[0]
df['rating'] = pd.to_numeric(leading_char, errors='coerce').fillna(0).astype(float)
# check the range of possible values for the ratings
rating_range = set(df['rating'])
rating_range

{0.0, 1.0, 2.0, 3.0, 4.0, 5.0}

In [7]:
# Remove the 0-rating reviews (missing or unparseable ratings).
# .copy() makes the result an independent frame, so adding columns to it later
# does not trigger pandas' SettingWithCopyWarning.
df = df[df['rating'] != 0].copy()

In [8]:
# Number of whitespace-separated tokens in each review.
def _count_words(text):
    """Return how many whitespace-separated tokens str(text) contains."""
    return len(str(text).split())

df['words'] = df['review_text'].apply(_count_words)

In [9]:
# Mean review length (in words) for each star rating, from 5 stars down to 1.
avg_words_by_rating = {
    stars: df.loc[df['rating'] == stars, 'words'].mean()
    for stars in (5, 4, 3, 2, 1)
}
avg_word_5, avg_word_4, avg_word_3, avg_word_2, avg_word_1 = (
    avg_words_by_rating[stars] for stars in (5, 4, 3, 2, 1)
)
print(avg_word_5, avg_word_4, avg_word_3, avg_word_2, avg_word_1)

74.92031924636923 78.23434840174012 104.33883129123468 127.18706293706293 141.78967867575463


In [10]:
# Boolean sentiment flags derived from the star rating:
# Neg is True below 4 stars, Pos is True above 3 stars.
rating_as_int = df['rating'].astype(int)
df['Neg'] = rating_as_int.lt(4)
df['Pos'] = rating_as_int.gt(3)

In [11]:
# Read the stopword list: every line of the file becomes one entry, with the
# trailing newline removed. The context manager closes the file handle, which
# the previous bare open() never did.
with open('stopwords.csv', 'r') as stopwords_file:
    stopwords = [line.strip('\n') for line in stopwords_file]

In [12]:
#define the clean_review function
import re

# Compiled once instead of on every call; the raw string avoids the
# invalid-escape warning that '\W+' triggers on recent Python versions.
_NON_WORD_RE = re.compile(r'\W+')

def clean_review(review, stop_words=None):
    """Tokenise a review into lower-case, punctuation-free tokens.

    The text is converted with ``str()``, lower-cased and split on whitespace.
    Tokens found in ``stop_words`` are dropped, every run of non-word
    characters in the remaining tokens is replaced by a single space, the
    result is stripped, and tokens that end up empty are discarded.

    Parameters
    ----------
    review : any
        The review text; non-string values (e.g. NaN) go through ``str()``.
    stop_words : container of str, optional
        Words to drop. Defaults to the notebook-level ``stopwords`` list.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.

    Notes
    -----
    Stop-word matching is done on the raw token, before punctuation is
    removed, so ``"the,"`` is not filtered by the stop word ``"the"``.
    Inner punctuation becomes a space (``"don't"`` -> ``"don t"``).
    """
    if stop_words is None:
        stop_words = stopwords

    tokens = []
    for raw_token in str(review).lower().split():
        if raw_token in stop_words:
            continue
        cleaned = _NON_WORD_RE.sub(' ', raw_token).strip()
        if cleaned != '':
            tokens.append(cleaned)
    return tokens

In [13]:
# Tokenise every review with clean_review and keep only the token list and the
# two sentiment flags.
df['review'] = df['review_text'].apply(clean_review)
df_final1 = df.loc[:, ['review', 'Pos', 'Neg']]

In [14]:
# 80/20 train/test split with a fixed seed.
from sklearn.model_selection import train_test_split
train_part, test_part = train_test_split(df_final1, test_size=0.2, random_state=0)
# reset_index() renumbers the rows from 0 and keeps the previous row label in
# an 'index' column.
df_final = train_part.reset_index()
df_test = test_part.reset_index()

In [15]:
# Count word occurrences on the training set in three dictionaries:
#   dic     -> total number of occurrences of each word
#   pos_dic -> number of positive reviews that contain the word
#   neg_dic -> number of negative reviews that contain the word
from collections import Counter

total_counts = Counter()
pos_review_counts = Counter()
neg_review_counts = Counter()

# Iterate over the columns directly instead of repeating the chained lookup
# df_final['review'][i][j] for every token, and read each review's label once
# rather than once per word.
for tokens, is_pos, is_neg in zip(df_final['review'], df_final['Pos'], df_final['Neg']):
    # every occurrence counts towards the global total
    total_counts.update(tokens)

    # a word is counted at most once per review for the class counters
    unique_tokens = set(tokens)
    if is_pos:
        pos_review_counts.update(unique_tokens)
    elif is_neg:
        neg_review_counts.update(unique_tokens)

# Expose plain dicts so a missing word still raises KeyError downstream.
dic = dict(total_counts)
pos_dic = dict(pos_review_counts)
neg_dic = dict(neg_review_counts)

In [16]:
# Sort each dictionary by decreasing count.
# The result is rebuilt as a dict (insertion order = sorted order) rather than
# left as a list of pairs, so the names keep one type and the cell can be
# re-run without failing on `.items()`.
def _sorted_by_count_desc(counts):
    """Return a new dict with the items of `counts` ordered by decreasing value."""
    return dict(sorted(counts.items(), key=lambda item: item[1], reverse=True))

dic = _sorted_by_count_desc(dic)
pos_dic = _sorted_by_count_desc(pos_dic)
neg_dic = _sorted_by_count_desc(neg_dic)

In [17]:
# Make sure the three collections are dict objects; dict() accepts a list of
# (word, count) pairs as well as an existing dictionary.
pos_dic, neg_dic, dic = dict(pos_dic), dict(neg_dic), dict(dic)

In [18]:
# Summary constants of the training set.
words_nb = sum(dic.values())                          # total token occurrences
reviews_nb = len(df_final)                            # number of training reviews
pos_reviews = int((df_final['Pos'] == True).sum())    # reviews flagged positive
neg_reviews = int((df_final['Neg'] == True).sum())    # reviews flagged negative
prob_rv_pos = pos_reviews / reviews_nb                # share of positive reviews
prob_rv_neg = neg_reviews / reviews_nb                # share of negative reviews

In [19]:
# Print the share of positive reviews, then the share of negative reviews, in
# the training set (these are used as the class priors).
print(prob_rv_pos,prob_rv_neg)

0.7498185513136885 0.25018144868631154


In [20]:
# Sentiment of a single word on the training set:
#   P(class | word) is proportional to P(word | class) * P(class),
# normalised over the two classes.
# `word` must be a token present in both pos_dic and neg_dic. The empty string
# never is (clean_review drops empty tokens), so the cell raised KeyError on a
# fresh run; 'sauce' is the word the saved output was produced with.
word = 'sauce'
pos_proba = (pos_dic[word] / pos_reviews) * prob_rv_pos
neg_proba = (neg_dic[word] / neg_reviews) * prob_rv_neg
evidence = pos_proba + neg_proba
pos_proba_ = pos_proba / evidence
neg_proba_ = neg_proba / evidence
print(word, 'pos: '+ str(pos_proba_), 'neg: '+str(neg_proba_))

sauce pos: 0.6268939393939394 neg: 0.3731060606060606


In [36]:
# Build the super dictionaries: for every word seen in both positive and
# negative training reviews, store its normalised positive / negative score.
super_pos_dic = {}
super_neg_dic = {}
for word in dic:
    # words missing from either class are skipped
    if word not in pos_dic or word not in neg_dic:
        continue
    pos_proba = (pos_dic[word]/pos_reviews)*prob_rv_pos
    neg_proba = (neg_dic[word]/neg_reviews)*prob_rv_neg
    evidence = pos_proba + neg_proba
    pos_proba_ = pos_proba/evidence
    neg_proba_ = neg_proba/evidence
    super_pos_dic[word] = pos_proba_
    super_neg_dic[word] = neg_proba_

In [37]:
# Order both super dictionaries from the highest to the lowest score
# (the sort key is the stored probability, not the word frequency).
super_pos_dic = dict(sorted(super_pos_dic.items(), key=lambda pair: pair[1], reverse=True))
super_neg_dic = dict(sorted(super_neg_dic.items(), key=lambda pair: pair[1], reverse=True))

In [38]:
# Create the hyper dictionaries (pos/neg) from the first `Range` entries of the
# super dictionaries in their current order.
# Slicing the item list once replaces rebuilding list(super_*_dic) four times
# per iteration, and it no longer raises IndexError when fewer than `Range`
# words are available.
Range = 1000
hyper_pos_dic = dict(list(super_pos_dic.items())[:Range])
hyper_neg_dic = dict(list(super_neg_dic.items())[:Range])

In [39]:
# Re-order both super dictionaries from the lowest to the highest score.
super_pos_dic = dict(sorted(super_pos_dic.items(), key=lambda pair: pair[1]))
super_neg_dic = dict(sorted(super_neg_dic.items(), key=lambda pair: pair[1]))

In [40]:
# Complete the hyper dictionaries with the first `Range` entries of the super
# dictionaries in their current order.
# One slice per dictionary replaces rebuilding the key/value lists on every
# iteration and avoids IndexError when fewer than `Range` words exist.
hyper_pos_dic.update(list(super_pos_dic.items())[:Range])
hyper_neg_dic.update(list(super_neg_dic.items())[:Range])

In [23]:
def proba_x_knowing_pos(word):
    """Return the fraction of positive reviews that contain `word`.

    Computed as ``pos_dic[word] / pos_reviews``; raises KeyError when the word
    is absent from ``pos_dic``.
    """
    return pos_dic[word] / pos_reviews

In [24]:
def proba_x_knowing_neg(word):
    """Return the fraction of negative reviews that contain `word`.

    Computed as ``neg_dic[word] / neg_reviews``; raises KeyError when the word
    is absent from ``neg_dic``.
    """
    return neg_dic[word] / neg_reviews

In [43]:
# Predict on the TEST set, using only the words kept in hyper_pos_dic.
#
# Columns written to df_test:
#   predict  -> True when the predicted sentiment equals the true 'Pos' label
#               (a correctness flag, not the predicted class itself)
#   pos_prob -> normalised probability that the review is positive
#   neg_prob -> normalised probability that the review is negative
import math

prediction_is_correct = []
pos_probabilities = []
neg_probabilities = []

for tokens, is_pos in zip(df_test['review'], df_test['Pos']):
    # Work in log space: multiplying many small probabilities can underflow to
    # 0.0 for both classes, which made the normalisation below divide 0 by 0.
    log_pos = math.log(prob_rv_pos)
    log_neg = math.log(prob_rv_neg)

    for word in tokens:
        if word in hyper_pos_dic:
            log_pos += math.log(proba_x_knowing_pos(word))
            log_neg += math.log(proba_x_knowing_neg(word))

    # Normalise after subtracting the larger log score, so the larger weight
    # is exactly 1.0 and the denominator can never be zero.
    shift = max(log_pos, log_neg)
    weight_pos = math.exp(log_pos - shift)
    weight_neg = math.exp(log_neg - shift)
    total_weight = weight_pos + weight_neg
    proba_sentence_pos_ = weight_pos / total_weight
    proba_sentence_neg_ = weight_neg / total_weight

    prediction_is_correct.append(bool(is_pos) == (proba_sentence_pos_ > proba_sentence_neg_))
    pos_probabilities.append(proba_sentence_pos_)
    neg_probabilities.append(proba_sentence_neg_)

# Assign each column once instead of growing df_test cell by cell with .loc.
df_test['predict'] = prediction_is_correct
df_test['pos_prob'] = pos_probabilities
df_test['neg_prob'] = neg_probabilities

In [42]:
# Accuracy on the test set: 1 minus the share of rows whose 'predict' flag is False.
misclassified = int((df_test['predict'] == False).sum())
1 - misclassified / len(df_test)

0.9013062409288825