## Supervised Learning using LSTM for Review Ratings Prediction

### Data Extraction and Preprocessing (we follow the same process as in data_cleaning but we keep a different sample of the data)

In [4]:
import pandas as pd
import nltk
from cleantext import clean
import tqdm
import numpy as np
from deep_translator import GoogleTranslator
import torch

Since the GPL-licensed package `unidecode` is not installed, using Python's `unicodedata` package which yields worse results.


In [49]:
torch.cuda.is_available()

True

In [50]:
reviews_df = pd.read_csv("data/reviews.csv", sep=";")
reviews_df.head(3)

Unnamed: 0,category_link,company_link,company_name,score,description,review_page_nb,review_link,review_score,review_title,review_text,category
0,https://fr.trustpilot.com/categories/food_beve...,https://fr.trustpilot.com/review/lefourgon.com,Le Fourgon,49,Le Fourgon vous livre vos boissons consignées ...,329,https://fr.trustpilot.com/reviews/65a5388a60d6...,5,Application conviviale pour passer ses…,Application conviviale pour passer ses command...,food_beverages_tobacco
1,https://fr.trustpilot.com/categories/food_beve...,https://fr.trustpilot.com/review/lefourgon.com,Le Fourgon,49,Le Fourgon vous livre vos boissons consignées ...,329,https://fr.trustpilot.com/reviews/65a53245a223...,5,Très facile pour la commande,"Très facile pour la commande, très rapide et l...",food_beverages_tobacco
2,https://fr.trustpilot.com/categories/food_beve...,https://fr.trustpilot.com/review/lefourgon.com,Le Fourgon,49,Le Fourgon vous livre vos boissons consignées ...,329,https://fr.trustpilot.com/reviews/659f0e15dd57...,5,Première expérience réussie !,"Pour nous, c'était une première.Ravis d'avoir ...",food_beverages_tobacco


In [51]:
reviews_df = reviews_df[["company_name", "score", "review_score", "review_title", "review_text", "category"]]

#### We merge the review title and the review text into one column

In [52]:
# we merge the title and the text of the review
reviews_df["review"] = reviews_df["review_title"] + " " + reviews_df["review_text"]
reviews_df.head(3)

Unnamed: 0,company_name,score,review_score,review_title,review_text,category,review
0,Le Fourgon,49,5,Application conviviale pour passer ses…,Application conviviale pour passer ses command...,food_beverages_tobacco,Application conviviale pour passer ses… Applic...
1,Le Fourgon,49,5,Très facile pour la commande,"Très facile pour la commande, très rapide et l...",food_beverages_tobacco,Très facile pour la commande Très facile pour ...
2,Le Fourgon,49,5,Première expérience réussie !,"Pour nous, c'était une première.Ravis d'avoir ...",food_beverages_tobacco,"Première expérience réussie ! Pour nous, c'éta..."


In [53]:
clean(reviews_df["review"].values[0], no_emoji=True, no_line_breaks=True, lower=False).replace("#", "")

'Application conviviale pour passer ses Application conviviale pour passer ses commandes. Manquent juste les ingredients pas evidents a trouver.Notification par SMS peu de temps avant le creneau choisi pour un creneau precis de 20 minutes.Livreurs agreables.Merci le Fourgon !'

In [54]:
reviews_df["review"] = reviews_df["review"].apply(lambda x: clean(x, no_emoji=True, no_line_breaks=True, lower=False).replace("#", ""))

In [55]:
reviews_df["review"].values[100]

"Exceptionnel La brulerie Belleville est devenue mon unique fournisseur en cafe, le cafe est vraiment d'une qualite exceptionnelle ! De plus, le service apres-vente est d'une qualite humaine devenue bien rare a present !"

In [56]:
# number of companies in the dataset
len(reviews_df["company_name"].unique())

12996

#### We Keep only a sample of the data (the top 1000 companies with the most reviews)

In [59]:
# number of reviews per company
reviews_df["company_name"].value_counts()[:1000]

company_name
Hardloop                            236
Alltricks                           210
Ekosport                            140
vertbaudet                          140
Mode Tactique                       120
                                   ... 
Les Bons Profs                       40
L'école de secrétariat d'Emilie      40
Sosbilan                             40
Tuto Mix                             40
Marmon Sports                        40
Name: count, Length: 1000, dtype: int64

In [60]:
# number of reviews in all the dataset
len(reviews_df)

235503

In [61]:
# print the first company name
reviews_df["company_name"].value_counts().index[0]

'Hardloop\xa0'

In [62]:
reviews_df["company_name"] = reviews_df["company_name"].apply(lambda x: x.replace("\xa0", "") if x != "\xa0" else x)
reviews_df["company_name"].value_counts().index[0]

'Hardloop'

In [63]:
# we keep only the reviews of the first 100 companies in terms of number of reviews
revsample_df = reviews_df[reviews_df["company_name"].isin(reviews_df["company_name"].value_counts()[:1000].index)]
revsample_df["company_name"].value_counts().sum()

51768

In [64]:
revsample_df["company_name"].value_counts()[:10]

company_name
Hardloop          236
Alltricks         210
Ekosport          140
vertbaudet        140
Weenect           120
Mode Tactique     120
AAAEP             120
Handball Store    111
Meyclub           100
eevad             100
Name: count, dtype: int64

In [65]:
revsample_df = revsample_df.dropna(ignore_index=True)

In [232]:
# # we display all the lines of the first company
# pd.set_option("display.max_colwidth", None)
# revsample_df[revsample_df["company_name"] == "Hardloop"]

### We keep only the reviews in french

In [67]:
from nltk import wordpunct_tokenize
from nltk.corpus import stopwords
print(stopwords.fileids())

['arabic', 'azerbaijani', 'basque', 'bengali', 'catalan', 'chinese', 'danish', 'dutch', 'english', 'finnish', 'french', 'german', 'greek', 'hebrew', 'hinglish', 'hungarian', 'indonesian', 'italian', 'kazakh', 'nepali', 'norwegian', 'portuguese', 'romanian', 'russian', 'slovene', 'spanish', 'swedish', 'tajik', 'turkish']


In [68]:
def detecte_langage(message):
    # we define an empty dictionary
    # {language : number of common stopwords between the language and the words of the message}
    languages_shared_words = {}
    # tokenization in words
    words = wordpunct_tokenize(message)
    for language in stopwords.fileids():
        # stopwords for each language
        stopwords_liste = stopwords.words(language)
        # we take off the duplicates
        words = set(words)
        # the common words between the stopwords of a language and the words of the message
        common_elements = words.intersection(stopwords_liste)
        # addition of the couple to the dictionary
        languages_shared_words[language] = len(common_elements)
    # we return the language with the max of common words
    return  max(languages_shared_words, key = languages_shared_words.get)

In [69]:
# we apply the function for each message on the review column and we keep only the lines where the language is in French // 6min 55 for 52000 reviews
revsample_df["langue"] = revsample_df["review"].apply(detecte_langage)

In [70]:
revsample_df[revsample_df["langue"] == "french"].count()

company_name    46324
score           46324
review_score    46324
review_title    46324
review_text     46324
category        46324
review          46324
langue          46324
dtype: int64

In [71]:
revsample_df_fr = revsample_df[revsample_df["langue"] == "french"]

In [72]:
revsample_df_fr.to_csv("data/revsample_df_fr.csv", sep=";")

## With RNN-LSTM model

In [217]:
revsample_df_fr = pd.read_csv("data/revsample_df_fr.csv", sep=";")

In [218]:
# we want balanced classes in the train dataset. 
# let's see the number of reviews per score in the train and test datasets. The scores are between 1 and 5.
revsample_df_fr["review_score"].value_counts()

review_score
5    34443
1     4647
4     4418
3     1702
2     1114
Name: count, dtype: int64

#### We Use the UnderSampling method to balance the data

In [219]:
# we keep only 1114 random reviews per score in the train dataset
revsample_df_fr = revsample_df_fr.groupby("review_score").apply(lambda x: x.sample(1114, random_state=42)).reset_index(drop=True)
revsample_df_fr["review_score"].value_counts()

  revsample_df_fr = revsample_df_fr.groupby("review_score").apply(lambda x: x.sample(1114, random_state=42)).reset_index(drop=True)


review_score
1    1114
2    1114
3    1114
4    1114
5    1114
Name: count, dtype: int64

#### Split Train-Test

In [220]:
# we split the dataset into train and test
from sklearn.model_selection import train_test_split

train, test = train_test_split(revsample_df_fr, test_size=0.2, random_state=42)

train.shape, test.shape

((4456, 9), (1114, 9))

In [184]:
# # function to encode the reviews with a max_length of 500
# max_length = 500
# def encode_reviews(tokenizer, reviews, max_length):
#     token_ids = np.zeros(shape=(len(reviews), max_length),
#                          dtype=np.int32)
#     for i, review in enumerate(reviews):
#         encoded = tokenizer.encode(review, max_length=max_length, truncation=True)
#         token_ids[i, 0:len(encoded)] = encoded
#     attention_mask = (token_ids != 0).astype(np.int32)
#     return {"input_ids": token_ids, "attention_mask": attention_mask}

In [231]:
# # we encode the reviews
# train_encoded = encode_reviews(tokenizer, train["review"].values, max_length)
# test_encoded = encode_reviews(tokenizer, test["review"].values, max_length)
# train_encoded

In [221]:
# creation of train and test labels
train_labels = train["review_score"].values
test_labels = test["review_score"].values
train_labels

array([4, 4, 3, ..., 5, 5, 1], dtype=int64)

In [222]:
#we verify the range of the labels (between 0 and 5) in the train and test datasets
print(np.unique(train_labels))
print(np.unique(test_labels))

[1 2 3 4 5]
[1 2 3 4 5]


#### One Hot Encoding

In [223]:
# we one hot encode the labels that are currently between 1 and 5 using to_categorical
from tensorflow.keras.utils import to_categorical

train_labels = to_categorical(train_labels - 1)
test_labels = to_categorical(test_labels - 1)

train_labels

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 1., 0., 0.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]], dtype=float32)

#### Librairies importation for this model

In [224]:
### With RNN-LSTM model
import keras as keras
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D, BatchNormalization, Bidirectional
from keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

#### Tokenization and Padding of the reviews

In [225]:
# we tokenize the reviews
top_words = 5000
tokenizer = Tokenizer(num_words=top_words, oov_token="<OOV>", filters='!"#$%&()*+,-./:;<=>?@[\]^_`{|}~', lower=True)
tokenizer.fit_on_texts(train["review"].values)

train_sequences = tokenizer.texts_to_sequences(train["review"].values)
test_sequences = tokenizer.texts_to_sequences(test["review"].values)

train_sequences[0]

[291,
 2,
 6,
 34,
 14,
 207,
 121,
 343,
 2,
 34,
 24,
 49,
 15,
 418,
 30,
 97,
 2418,
 32,
 93,
 130,
 19,
 43,
 3,
 70,
 418,
 363,
 2,
 248,
 15,
 378,
 52,
 7,
 21,
 1446,
 8,
 1,
 784,
 68,
 207,
 88,
 9,
 3956,
 1970,
 375]

In [226]:
# we pad the sequences
max_length = 500
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding="post", truncating="post")
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding="post", truncating="post")

train_padded[40]

array([1517, 1984,  123,   20,    5,   39,   29,   16,  407,  102,  744,
          2,    1, 2851, 3165,    2, 3500,  821,  353,   17,   51,    5,
        788,   29,   16,  407,  584, 2852,    2, 3500,   61,   29,   36,
        341,  102,   78,    4,    1,  632,   10,  788,    2, 1517, 1984,
         30, 2242,    7,  127,   71,  673,   12,   26,  252,    5,   69,
         19,    1,   29,  370,   40,   26, 3501,    2,  632,   30,    6,
        645,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

#### Proceeding to the model training

In [227]:
epochs = 3
emb_dim = 128 # 128
batch_size = 256

In [228]:
print((train_padded.shape, train_labels.shape, test_padded.shape, test_labels.shape))

model = Sequential()
model.add(Embedding(top_words, emb_dim, input_length=max_length))
model.add(SpatialDropout1D(0.5))
model.add(LSTM(64, dropout=0.5, recurrent_dropout=0.5))
model.add(Dense(5, activation='softmax'))
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
print(model.summary())
history = model.fit(train_padded, train_labels, epochs=epochs, batch_size=batch_size,validation_split=0.2,callbacks=[EarlyStopping(monitor='val_loss',patience=7, min_delta=0.0001)])

((4456, 500), (4456, 5), (1114, 500), (1114, 5))
Model: "sequential_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_16 (Embedding)    (None, 500, 128)          640000    
                                                                 
 spatial_dropout1d_11 (Spat  (None, 500, 128)          0         
 ialDropout1D)                                                   
                                                                 
 lstm_17 (LSTM)              (None, 64)                49408     
                                                                 
 dense_16 (Dense)            (None, 5)                 325       
                                                                 
Total params: 689733 (2.63 MB)
Trainable params: 689733 (2.63 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3

#### Model Evaluation

In [177]:
# we predict the test labels
test_pred = model.predict(test_padded)

# for each vector, we put the max value to 1 and the others to 0
test_pred = np.where(test_pred == np.amax(test_pred, axis=1, keepdims=True), 1, 0)
test_pred



array([[0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       ...,
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0],
       [0, 0, 0, 1, 0]])

In [178]:
# let's see the accuracy of the model
from sklearn.metrics import accuracy_score

accuracy_score(test_labels, test_pred)

0.19210053859964094

In [48]:
accr = model.evaluate(train_padded,train_labels)
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

Test set
  Loss: 0.923
  Accuracy: 0.730


In [229]:
# # we evaluate the model
# model.evaluate(test_padded, test_labels)

In [230]:
# # we predict the test labels
# test_pred = model.predict(test_padded)
# test_pred