In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from transformers import BertTokenizer, BertModel
from sklearn.metrics import accuracy_score, log_loss
import numpy as np
import pandas as pd

In [None]:
# Load the sample of book ratings from its CSV file.
file_path = '../data/Sample_Books_rating.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first rows to get a feel for the table's structure.
data.head(n=5)

In [None]:
# Preprocessing: derive a three-way polarity label from the numeric score
# (> 3 -> 'Positif', < 3 -> 'Négatif', anything else -> 'Neutre'), computed column-wise.
# A missing score fails both comparisons and lands in 'Neutre', but such rows are removed
# by the dropna() below because 'revue/score' is one of the retained columns.
data['Polarité'] = np.select(
    [data['revue/score'] > 3, data['revue/score'] < 3],
    ['Positif', 'Négatif'],
    default='Neutre',
)
data = data[['revue/texte', 'Polarité', 'revue/score']].dropna()

# Train/test split: 20% of the rows are held out; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['revue/texte'],
    data[['Polarité', 'revue/score']],
    test_size=0.2,
    random_state=42,
)

# Encode the polarity labels as integers for classification.
# The encoder is fitted on the full label column rather than on the training split alone:
# fitting on y_train only makes le.transform() raise ValueError whenever a class shows up
# in the test split but not in the training split. When the training split already holds
# every class, the resulting integer codes are the same as before.
le = LabelEncoder()
le.fit(data['Polarité'])
y_train_polarity = le.transform(y_train['Polarité'])
y_test_polarity = le.transform(y_test['Polarité'])

In [None]:
# Text representations: build both vectorizers up front, each configured with
# the built-in English stop-word list.
vect_simple = CountVectorizer(stop_words='english')  # plain word counts
vect_tfidf = TfidfVectorizer(stop_words='english')   # TF-IDF weighted words

# Each vectorizer learns its vocabulary from the training texts only,
# then the fitted vectorizer is reused to encode the test texts.
X_train_simple = vect_simple.fit_transform(X_train)
X_test_simple = vect_simple.transform(X_test)

X_train_tfidf = vect_tfidf.fit_transform(X_train)
X_test_tfidf = vect_tfidf.transform(X_test)

In [None]:
# Modelling: one random forest per text representation, both predicting the encoded polarity.
# random_state is fixed so the fitted forests, and every metric computed from them, are
# repeatable from one run to the next; 42 is the same seed the train/test split uses.

# Forest trained on the plain word-count features.
model_rfc = RandomForestClassifier(random_state=42)
model_rfc.fit(X_train_simple, y_train_polarity)
y_pred1 = model_rfc.predict(X_test_simple)

# Forest trained on the TF-IDF features.
model_rfc2 = RandomForestClassifier(random_state=42)
model_rfc2.fit(X_train_tfidf, y_train_polarity)
y_pred2 = model_rfc2.predict(X_test_tfidf)

In [None]:
# Evaluate both forests on the held-out split: accuracy on the hard predictions and
# log-loss on the predicted class probabilities.
# labels= is passed explicitly so the probability columns are matched against the classes
# each model was trained on. Without it, log_loss infers the label set from y_test_polarity
# and raises ValueError when a trained class happens to be absent from the test split.
print("Accuracy mots simples:",accuracy_score(y_test_polarity, y_pred1))
print("Loss mots simples:",log_loss(y_test_polarity, model_rfc.predict_proba(X_test_simple), labels=model_rfc.classes_))

print("Accuracy Tf-Idf:",accuracy_score(y_test_polarity, y_pred2))
print("Loss Tf-Idf:",log_loss(y_test_polarity, model_rfc2.predict_proba(X_test_tfidf), labels=model_rfc2.classes_))