In [1]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, log_loss
import numpy as np
import pandas as pd
import nltk
from sklearn.neighbors import KNeighborsClassifier

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [3]:
# Load the train and test samples from their CSV files
train_path = "../data/train_sample.csv"
test_path = "../data/test_sample.csv"
data_train, data_test = (
    pd.read_csv(csv_path) for csv_path in (train_path, test_path)
)
# The first rows are displayed in the next cell to inspect the structure

In [4]:
# Preprocessing: keep only the review text and the two target columns
# (polarity label and review score) in both frames
data_train, data_test = (
    frame[['revue/texte', 'polarity', 'revue/score']]
    for frame in (data_train, data_test)
)

data_train.head()

Unnamed: 0,revue/texte,polarity,revue/score
0,"I was pleased with this Romance, mostly becaus...",positive,4.0
1,I love this book! Not only did I re-learn the ...,positive,5.0
2,The Circular Staircase is a proper name for th...,neutral,3.0
3,Is this only the second book in the series? Fe...,negative,1.0
4,From the moment I discovered Prudence Mackinto...,positive,5.0


In [5]:
# Split each frame into the text feature and its two targets
X_train, X_test = data_train['revue/texte'], data_test['revue/texte']
y_train_score, y_test_score = data_train['revue/score'], data_test['revue/score']

# Encode the polarity labels as integers; the encoder is fitted on the
# training labels only and then re-used for the test labels
le = LabelEncoder()
y_train_polarity = le.fit_transform(data_train['polarity'])
y_test_polarity = le.transform(data_test['polarity'])

In [6]:
# Stop words handed to the vectorizers: NLTK's English list followed by
# a handful of extra words chosen for this corpus
list_to_avoid = [
    *nltk.corpus.stopwords.words('english'),
    'great', 'books', 'quot', 'life', 'read',
    'time', 'story', 'one', 'book', 'like',
]

In [7]:
# Text representation - raw term counts
vect_simple = CountVectorizer(stop_words=list_to_avoid)
X_train_simple, X_test_simple = (
    vect_simple.fit_transform(X_train),  # vocabulary is learned on the training texts
    vect_simple.transform(X_test),
)

# Text representation - TF-IDF
vect_tfidf = TfidfVectorizer(stop_words=list_to_avoid)
X_train_tfidf, X_test_tfidf = (
    vect_tfidf.fit_transform(X_train),
    vect_tfidf.transform(X_test),
)

In [8]:
# Modeling - RandomForest classifiers predicting the polarity label,
# one per text representation.
# random_state is pinned so that re-running the notebook rebuilds the same
# forests and therefore reproduces the same metrics.

# Term-count features (verbose=1 prints training/prediction progress)
model_rfc = RandomForestClassifier(verbose=1, random_state=42)
model_rfc.fit(X_train_simple, y_train_polarity)
y_pred1 = model_rfc.predict(X_test_simple)

# TF-IDF features
model_rfc2 = RandomForestClassifier(random_state=42)
model_rfc2.fit(X_train_tfidf, y_train_polarity)
y_pred2 = model_rfc2.predict(X_test_tfidf)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  2.5min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s


In [9]:
# Evaluate the two polarity models on the test set with
# accuracy_score (on hard predictions) and log_loss (on class probabilities)
print("Accuracy mots simples:", accuracy_score(y_test_polarity, y_pred1))
proba_simple = model_rfc.predict_proba(X_test_simple)
print("Loss mots simples:", log_loss(y_test_polarity, proba_simple))

print("Accuracy Tf-Idf:", accuracy_score(y_test_polarity, y_pred2))
proba_tfidf = model_rfc2.predict_proba(X_test_tfidf)
print("Loss Tf-Idf:", log_loss(y_test_polarity, proba_tfidf))

Accuracy mots simples: 0.7478181818181818


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s


Loss mots simples: 0.6869467690891733
Accuracy Tf-Idf: 0.7385454545454545
Loss Tf-Idf: 0.6884409984457961


In [10]:
# Modélisation - Exemple avec RandomForest pour la polarité
model_rfc_score = RandomForestClassifier(verbose=1)
model_rfc.fit(X_train_simple, y_train_score)
y_pred3 = model_rfc.predict(X_test_simple)

model_rfc2_score = RandomForestClassifier()
model_rfc2.fit(X_train_tfidf, y_train_score)
y_pred4 = model_rfc2.predict(X_test_tfidf)

[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:  3.4min
[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.3s


In [12]:

print("Accuracy mots simples:",accuracy_score(y_test_score, y_pred3))
print("Loss mots simples:",log_loss(y_test_score, model_rfc.predict_proba(X_test_simple)))

print("Accuracy Tf-Idf:",accuracy_score(y_test_score, y_pred4))
print("Loss Tf-Idf:",log_loss(y_test_score, model_rfc2.predict_proba(X_test_tfidf)))

Accuracy mots simples: 0.5466363636363636


[Parallel(n_jobs=1)]: Done  49 tasks      | elapsed:    0.4s


Loss mots simples: 1.1947488350562099
Accuracy Tf-Idf: 0.538
Loss Tf-Idf: 1.205161741736806


In [14]:
import pickle

filename = 'finalized_model_polarity_idf.sav'
filename1 = 'finalized_model_score_idf.sav'
filename2 = 'finalized_model_polarity_counter.sav'
filename3 = 'finalized_model_score_counter.sav'
filename_tf = 'tfidf.sav'
filename_counter = 'counter.sav'

pickle.dump(model_rfc2, open(filename, 'wb'))
pickle.dump(model_rfc2_score, open(filename1, 'wb'))
pickle.dump(model_rfc, open(filename2, 'wb'))
pickle.dump(model_rfc, open(filename3, 'wb'))
pickle.dump(vect_tfidf, open(filename_tf, 'wb'))
pickle.dump(vect_simple, open(filename_counter, 'wb'))
