# Presentation du cas

## Importations des librairies

In [1]:
# Standard libraries
import warnings
import pickle
from math import sqrt
import re
import numpy as np

# NLP libraries
import spacy

# Model evaluation libraries
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

warnings.filterwarnings('ignore')

## Telechargements donnees

In [2]:
with open("full.pkl", "rb") as f:
    data=pickle.load(f)

data=data.dropna(subset=["review_comment_message"])

In [3]:
from sklearn.model_selection import train_test_split

data_subset,_,_,_=train_test_split(data,data,train_size=0.2)

## Actionabilite des segments

In [4]:
pyLDAvis

NameError: name 'pyLDAvis' is not defined

## Translation into English

In [None]:
from deep_translator import DeepL

data["review_comment_message"]=data["review_comment_message"].apply(lambda x:DeepL(api_key="5eb3254d-0a82-c74f-f46c-570d62a4f24d:fx",source="pt",target="en").translate(x))

In [None]:
with open("translated_order.pkl", "wb") as f:
    pickle.dump(
        data,
        f,
    )

## Sentiment analysis

In [4]:
from nltk import word_tokenize
from nltk.corpus import stopwords
stop_words = stopwords.words("portuguese")
from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def clean_data(quote):
    quote = quote.lower()
    tokens = word_tokenize(quote)
    token_punc = [t for t in tokens if t.isalpha()]
    token_stop = [t for t in token_punc if t not in stop_words]
    return token_stop

data_subset["tokens"] = data_subset["review_comment_message"].apply(lambda x: clean_data(x))

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(data_subset["tokens"], data_subset["review_score"], test_size=0.2, random_state=0)

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(lowercase=False, analyzer=lambda x: x)
tf_idf_train = vectorizer.fit_transform(X_train).toarray()
tf_idf_test = vectorizer.transform(X_test).toarray()

In [7]:
tf_idf_train.shape

(6903, 5847)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

lr = LinearRegression()
lr.fit(tf_idf_train, y_train)

In [None]:
y_pred_lr = lr.predict(tf_idf_train)
print('R2 score of linear regression:', r2_score(y_pred_lr, y_train))

In [None]:
y_pred_lr = lr.predict(tf_idf_test)
print('R2 score of linear regression:', r2_score(y_pred_lr, y_test))

Cas de sur-apprentissage
- SVM
- Embedding de phrases
- Naive bayesian

In [None]:
from sklearn.linear_model import Ridge


rd=Ridge(alpha=0.1)
rd.fit(tf_idf_train, y_train)

y_pred_lr = rd.predict(tf_idf_train)
print('R2 score of linear regression:', r2_score(y_pred_lr, y_train))

y_pred_lr = rd.predict(tf_idf_test)
print('R2 score of linear regression:', r2_score(y_pred_lr, y_test))

In [9]:
from sklearn.ensemble import RandomForestRegressor


rf=RandomForestRegressor(max_depth=50,verbose=1,n_jobs=4)
rf.fit(tf_idf_train, y_train)

y_pred_lr = rf.predict(tf_idf_train)
print('R2 score of linear regression:', r2_score(y_pred_lr, y_train))

y_pred_lr = rf.predict(tf_idf_test)
print('R2 score of linear regression:', r2_score(y_pred_lr, y_test))

[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   49.5s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:  1.9min finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.1s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


R2 score of linear regression: 0.7401108349101501
R2 score of linear regression: 0.2534258521110022


optimization :
- hyperops : bayesian hyperparameter optimization
- gridsearch : easy but inefficient
- optuna : rapide scalable
- ax : pytorch hyperparameter

Utiliser une cross-validation plutot que des donnees test qui servent d'outil de communication.

mesure des performances:
1. Mesure de la performance du process (split 80/20) de selection du modele
2. Selection du modele : performance en cross-validation sur toute la donnee 

## Topic modelling

In [10]:
from gensim.models import TfidfModel
from gensim.corpora import Dictionary
from gensim.models import LsiModel
from pprint import pprint

# Create a corpus
corpus = data_subset['tokens']

# Compute the dictionary: this is a dictionary mapping words and their corresponding numbers for later visualisation
id2word = Dictionary(corpus)

bow = [id2word.doc2bow(line) for line in corpus] 

tfidf_model = TfidfModel(bow)

tf_idf_gensim = tfidf_model[bow]

lsi = LsiModel(tf_idf_gensim, id2word=id2word, num_topics=5)

In [11]:
from gensim.models import LdaModel

lda1 = LdaModel(corpus=tf_idf_gensim, num_topics=5, id2word=id2word, passes=10, random_state=0)

In [12]:
import pyLDAvis
from pyLDAvis import gensim

# Visualize the topics
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(topic_model=lda1, corpus=bow, dictionary=id2word)
vis