In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the book-review sample and keep only the rows that have both a review
# text and a score.
file_path = '../data/Sample_Books_rating.csv'
data = pd.read_csv(file_path)
data_clean = data.dropna(subset=['revue/texte', 'revue/score'])

# Features are the raw review texts; the target is the review score column.
X, y = data_clean['revue/texte'], data_clean['revue/score']

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF vectorisation: the vocabulary and IDF weights are learned from the
# training texts only and then applied unchanged to the test texts.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Fit a linear regression on the TF-IDF features, then predict the held-out scores.
model = LinearRegression().fit(X_train_vect, y_train)
y_pred = model.predict(X_test_vect)

# Evaluate the predictions on the test set.
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print(f'MSE: {mse}', f'R² score: {r2}', sep='\n')


MSE: 1.1124572141135383
R² score: 0.2367695592690694


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_squared_error

# Load the book-review sample and preview the first rows.
data = pd.read_csv('../data/Sample_Books_rating.csv')
print(data.head())

# Keep only the rows that have both a review text and a score. Reassigning
# (instead of dropna(inplace=True)) avoids mutating the loaded frame in place.
data = data.dropna(subset=['revue/texte', 'revue/score'])

# Split the raw texts BEFORE vectorising: fitting TF-IDF on the full corpus
# would let test-set vocabulary and document frequencies leak into training.
text_train, text_test, y_train, y_test = train_test_split(
    data['revue/texte'], data['revue/score'], test_size=0.2, random_state=42
)

# Learn the vocabulary / IDF weights on the training texts only, then reuse
# the fitted vectorizer for the test texts.
vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Seed the classifier so repeated runs of this cell give the same metrics.
model = AdaBoostClassifier(random_state=42)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print('accuracy : ', model.score(X_test, y_test))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in later
# releases, whereas this form works on every version.
print('RMSE:', mean_squared_error(y_test, predictions) ** 0.5)

           Id                                              Titre  Prix  \
0  B0006CR6U4  A dictionary of the Targumim, the Talmud Babli...   NaN   
1  0897166159           Espresso Coffee: Professional Techniques   NaN   
2  0736693408  The First King of Shannara (The Sword of Shann...   NaN   
3  0395051029             Wuthering Heights (Riverside editions)   NaN   
4  4770016050  A Cat, a Man, and Two Women (Japans Modern Wri...   NaN   

          User_id                 Nom lecteur revue/utilité  revue/score  \
0  A303XPDO694V6X                       Ariel           2/6          4.0   
1  A3780H4TM9RMB8                David barnes           0/1          2.0   
2  A1AX6VPDQQZDPV                   M Carlton           4/4          5.0   
3  A35RQKCCCQ62O0                       LadyJ           0/0          4.0   
4  A2IJQDE1I4SIJT  David C. Arnold "master D"           1/2          5.0   

   revue/heure                            revue/résumé  \
0   1122163200                          

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

# Load the data and preview the first rows.
data = pd.read_csv('../data/Sample_Books_rating.csv')
print(data.head())

# Preprocess the data: keep only rows with both a review text and a score.
# Reassigning (instead of dropna(inplace=True)) avoids in-place mutation.
data = data.dropna(subset=['revue/texte', 'revue/score'])

# Split the raw texts into training and testing sets BEFORE vectorising, so
# the TF-IDF vocabulary and IDF weights never see the test reviews.
text_train, text_test, y_train, y_test = train_test_split(
    data['revue/texte'], data['revue/score'], test_size=0.2, random_state=42
)

# Fit the vectorizer on the training texts only; the test texts are merely
# transformed with the already-fitted vocabulary.
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = vectorizer.fit_transform(text_train)
X_test = vectorizer.transform(text_test)

# Train the random forest. A fixed random_state makes the bootstrap samples
# and feature sub-sampling repeatable, so the printed metrics are reproducible.
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out reviews.
predictions = model.predict(X_test)
print('accuracy : ', model.score(X_test, y_test))
# RMSE is computed as sqrt(MSE): the `squared=False` keyword of
# mean_squared_error is deprecated in scikit-learn 1.4 and removed in later
# releases, whereas this form works on every version.
print('RMSE:', mean_squared_error(y_test, predictions) ** 0.5)

           Id                                              Titre  Prix  \
0  B0006CR6U4  A dictionary of the Targumim, the Talmud Babli...   NaN   
1  0897166159           Espresso Coffee: Professional Techniques   NaN   
2  0736693408  The First King of Shannara (The Sword of Shann...   NaN   
3  0395051029             Wuthering Heights (Riverside editions)   NaN   
4  4770016050  A Cat, a Man, and Two Women (Japans Modern Wri...   NaN   

          User_id                 Nom lecteur revue/utilité  revue/score  \
0  A303XPDO694V6X                       Ariel           2/6          4.0   
1  A3780H4TM9RMB8                David barnes           0/1          2.0   
2  A1AX6VPDQQZDPV                   M Carlton           4/4          5.0   
3  A35RQKCCCQ62O0                       LadyJ           0/0          4.0   
4  A2IJQDE1I4SIJT  David C. Arnold "master D"           1/2          5.0   

   revue/heure                            revue/résumé  \
0   1122163200                          