# Création du modèle

In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

from ml.classes import DataPreprocessor, Model

In [2]:
# Load the tab-separated reviews file into a DataFrame.
# quoting=3 is the integer value of csv.QUOTE_NONE, so quote characters inside
# a review are read as ordinary text instead of field delimiters.
dataset = pd.read_csv(
    "Restaurant_Reviews.tsv",
    sep="\t",
    quoting=3,
)

# Print the column names, non-null counts and dtypes as a quick sanity check.
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Review  1000 non-null   object
 1   Liked   1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


## Prétraitement des données

In [3]:
datapreprocessor = DataPreprocessor()


def _clean_review(raw_review):
    """Run one raw review value through the preprocessor with lemmatize=True."""
    # str() keeps the call safe if a cell was parsed as a non-string value.
    return datapreprocessor.preprocess_review(str(raw_review), lemmatize=True)


# Add the cleaned text as a new column, then drop fully identical rows.
# assign()/drop_duplicates() return new frames instead of mutating in place,
# so re-running this cell always yields the same result without
# chained-assignment warnings.
dataset = (
    dataset
    .assign(processed_review=dataset["Review"].apply(_clean_review))
    .drop_duplicates()
)
dataset

  review = BeautifulSoup(review, "html.parser").get_text().lower()


Unnamed: 0,Review,Liked,processed_review
0,Wow... Loved this place.,1,wow love place
1,Crust is not good.,0,crust not good
2,Not tasty and the texture was just nasty.,0,not tasty texture nasty
3,Stopped by during the late May bank holiday of...,1,stop late may bank holiday rick steve recommen...
4,The selection on the menu was great and so wer...,1,selection menu great price
...,...,...,...
995,I think food should have flavor and texture an...,0,think food flavor texture lack
996,Appetite instantly gone.,0,appetite instantly go
997,Overall I was not impressed and would not go b...,0,overall not impressed would not go back
998,"The whole experience was underwhelming, and I ...",0,whole experience underwhelme think go ninja su...


## Entraînement du modèle

In [4]:
# Target: the "Liked" column; features: the preprocessed review text.
y = dataset["Liked"]
X = dataset["processed_review"]

# Hold out 20% of the rows for evaluation. The fixed random_state makes the
# shuffled split identical on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [5]:
# Learn the TF-IDF vocabulary (unigrams and bigrams) from the training texts
# only, so nothing from the held-out reviews leaks into the features.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2))
X_train = tfidf_vectorizer.fit_transform(x_train)

# Seed the forest: bootstrap sampling and feature sub-sampling are random, so
# without a fixed random_state the fitted model (and the accuracy reported
# below) changes on every run. 42 matches the seed used for the split.
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train, y_train)

In [6]:
# Vectorize the held-out texts with the already-fitted vectorizer
# (transform, not fit_transform) so train and test share one feature space.
X_test = tfidf_vectorizer.transform(x_test)

# Mean accuracy on the held-out set. score() runs predict() internally, so the
# separate predict call whose result was discarded has been removed.
random_forest.score(X_test, y_test)

0.855

## Sauvegarde du vectoriseur et du modèle

In [8]:
# Bundle the preprocessor, the fitted TF-IDF vectorizer and the trained forest
# into the project's Model wrapper (defined in ml.classes).
model = Model(datapreprocessor, tfidf_vectorizer, random_forest)
# Presumably serializes the bundle to this path — TODO confirm in ml.classes.
# NOTE(review): ".plk" looks like a typo for ".pkl"; check what loads this file
# before renaming it.
model.save("model.plk")