In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load the sampled book ratings and drop rows missing the review text or its score
file_path = '../data/Sample_Books_rating.csv'
data = pd.read_csv(file_path)
data_clean = data.dropna(subset=['revue/texte', 'revue/score'])

# Features are the raw review texts, the target is the numeric score;
# 20% of the rows are held out for testing
X, y = data_clean['revue/texte'], data_clean['revue/score']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# TF-IDF features: the vocabulary (capped at 1000 terms) is learned on the
# training texts only and then applied unchanged to the test texts
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Fit a linear regression on the TF-IDF features
model = LinearRegression().fit(X_train_vect, y_train)

# Predict the held-out scores and report the regression metrics
y_pred = model.predict(X_test_vect)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

print(f'MSE: {mse}', f'R² score: {r2}', sep='\n')


MSE: 1.1124572141135383
R² score: 0.2367695592690694


In [29]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import mean_squared_error

# Load the sampled book ratings and preview the first rows
data = pd.read_csv('../data/Sample_Books_rating.csv')
print(data.head())

# Keep only rows with both a review text and a score (reassigned rather than
# mutated in place)
data = data.dropna(subset=['revue/texte', 'revue/score'])

# Split the raw texts BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training reviews only (no test-set leakage)
X = data['revue/texte']
y = data['revue/score']
X_text_train, X_text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = TfidfVectorizer(stop_words='english')
X_train = vectorizer.fit_transform(X_text_train)
X_test = vectorizer.transform(X_text_test)

# Each distinct score value is treated as a class label; the fixed seed makes
# the fitted ensemble reproducible across runs
model = AdaBoostClassifier(random_state=42)
model.fit(X_train, y_train)

predictions = model.predict(X_test)
print('accuracy : ', model.score(X_test, y_test))
# RMSE computed as sqrt(MSE): the `squared=False` argument is deprecated since
# scikit-learn 1.4 and slated for removal, so it is not relied on here
print('RMSE:', mean_squared_error(y_test, predictions) ** 0.5)

           Id                                              Titre  Prix  \
0  B0006CR6U4  A dictionary of the Targumim, the Talmud Babli...   NaN   
1  0897166159           Espresso Coffee: Professional Techniques   NaN   
2  0736693408  The First King of Shannara (The Sword of Shann...   NaN   
3  0395051029             Wuthering Heights (Riverside editions)   NaN   
4  4770016050  A Cat, a Man, and Two Women (Japans Modern Wri...   NaN   

          User_id                 Nom lecteur revue/utilité  revue/score  \
0  A303XPDO694V6X                       Ariel           2/6          4.0   
1  A3780H4TM9RMB8                David barnes           0/1          2.0   
2  A1AX6VPDQQZDPV                   M Carlton           4/4          5.0   
3  A35RQKCCCQ62O0                       LadyJ           0/0          4.0   
4  A2IJQDE1I4SIJT  David C. Arnold "master D"           1/2          5.0   

   revue/heure                            revue/résumé  \
0   1122163200                          

In [31]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer,  TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error

# Load the data and preview the first rows
data = pd.read_csv('../data/Sample_Books_rating.csv')
print(data.head())

# Keep only rows with both a review text and a score (reassigned rather than
# mutated in place)
data = data.dropna(subset=['revue/texte', 'revue/score'])

# Split the raw texts BEFORE vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training reviews only (no test-set leakage)
X = data['revue/texte']
y = data['revue/score']
X_text_train, X_text_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF features capped at the 1000 most frequent terms
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train = vectorizer.fit_transform(X_text_train)
X_test = vectorizer.transform(X_text_test)

# Train the random forest; each distinct score value is treated as a class
# label, and the fixed seed makes the fitted forest reproducible across runs
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
predictions = model.predict(X_test)
print('accuracy : ', model.score(X_test, y_test))
# RMSE computed as sqrt(MSE): the `squared=False` argument is deprecated since
# scikit-learn 1.4 and slated for removal, so it is not relied on here
print('RMSE:', mean_squared_error(y_test, predictions) ** 0.5)

           Id                                              Titre  Prix  \
0  B0006CR6U4  A dictionary of the Targumim, the Talmud Babli...   NaN   
1  0897166159           Espresso Coffee: Professional Techniques   NaN   
2  0736693408  The First King of Shannara (The Sword of Shann...   NaN   
3  0395051029             Wuthering Heights (Riverside editions)   NaN   
4  4770016050  A Cat, a Man, and Two Women (Japans Modern Wri...   NaN   

          User_id                 Nom lecteur revue/utilité  revue/score  \
0  A303XPDO694V6X                       Ariel           2/6          4.0   
1  A3780H4TM9RMB8                David barnes           0/1          2.0   
2  A1AX6VPDQQZDPV                   M Carlton           4/4          5.0   
3  A35RQKCCCQ62O0                       LadyJ           0/0          4.0   
4  A2IJQDE1I4SIJT  David C. Arnold "master D"           1/2          5.0   

   revue/heure                            revue/résumé  \
0   1122163200                          

In [11]:
## Keras regression network predicting a book review's score (a small embedding + dense model, not an LLM)

import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Dropout  # Ajout de Dropout ici
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint  # Assurez-vous d'importer ModelCheckpoint si vous l'utilisez
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler


# Load the data
# NOTE(review): the scikit-learn cells of this notebook read
# '../data/Sample_Books_rating.csv' -- confirm which location is the right one.
data_path = 'Sample_Books_rating.csv'
data = pd.read_csv(data_path)

# Drop rows that lack a review text or a score
data_clean = data.dropna(subset=['revue/texte', 'revue/score'])

X = data_clean['revue/texte'].values
raw_scores = data_clean['revue/score'].values

VOCAB_SIZE = 10000  # tokenizer word cap, also the number of embedding rows
MAX_LEN = 100       # every review is padded/truncated to this many tokens

# Split the raw texts and scores first, so that the scaler and the tokenizer
# below are fitted on training data only (no test-set leakage)
texts_train, texts_test, scores_train, scores_test = train_test_split(
    X, raw_scores, test_size=0.2, random_state=42)

# Scale the scores to [0, 1] for regression, using the training range
scaler = MinMaxScaler(feature_range=(0, 1))
y_train = scaler.fit_transform(scores_train.reshape(-1, 1)).reshape(-1)
y_test = scaler.transform(scores_test.reshape(-1, 1)).reshape(-1)

# Turn the texts into fixed-length integer sequences
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(texts_train)
X_train = pad_sequences(tokenizer.texts_to_sequences(texts_train), maxlen=MAX_LEN)
X_test = pad_sequences(tokenizer.texts_to_sequences(texts_test), maxlen=MAX_LEN)

# Embedding -> average over tokens -> small dense head with dropout and a
# linear output unit (regression)
model = Sequential([
    Embedding(VOCAB_SIZE, 16, input_length=MAX_LEN),
    GlobalAveragePooling1D(),
    Dense(16, activation='relu'),
    Dropout(0.5),  # regularisation
    Dense(1, activation='linear')
])

# The target is continuous, so classification accuracy is meaningless here;
# mean absolute error is tracked alongside the MSE loss instead
model.compile(optimizer=Adam(learning_rate=0.001),
              loss='mean_squared_error',
              metrics=['mae'])

# Callbacks: stop when the validation loss stalls, and keep the best model on disk
early_stopping = EarlyStopping(monitor='val_loss', patience=3)
model_checkpoint = ModelCheckpoint(
    'best_model.h5', monitor='val_loss', save_best_only=True)

# Validation data is carved out of the training set so the test set is not
# used for early stopping / checkpoint selection and stays a true hold-out
model.fit(X_train, y_train, epochs=20, validation_split=0.1, batch_size=32,
          callbacks=[early_stopping, model_checkpoint])

# Reload the best checkpoint saved during training
model.load_weights('best_model.h5')

# Predict on the test set and map the predictions back to the original score scale
y_pred = model.predict(X_test)
y_pred_rescaled = scaler.inverse_transform(y_pred)


ModuleNotFoundError: No module named 'tensorflow.python'

In [None]:
# A bare non-final expression is not displayed by the notebook, so the
# training-set size is printed explicitly
print(len(X_train))

# Last expression of the cell, so its return value is shown as the cell output
# (presumably `model` is the Keras model trained in the preceding cell -- verify)
model.evaluate(X_test, y_test)

In [None]:
## PyTorch regression network predicting a book review's score (a small embedding + dense model, not an LLM)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

# The Keras text utilities are imported here because this cell uses them but
# did not import them itself (it relied on names left over from another cell)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Load the data
# NOTE(review): the scikit-learn cells of this notebook read
# '../data/Sample_Books_rating.csv' -- confirm which location is the right one.
data_path = 'Sample_Books_rating.csv'
data = pd.read_csv(data_path)

# Drop rows that lack a review text or a score
data_clean = data.dropna(subset=['revue/texte', 'revue/score'])

MAX_WORDS = 10000  # tokenizer word cap
MAX_LEN = 100      # every review is padded/truncated to this many tokens

# Split the raw texts and scores first, so that the scaler and the tokenizer
# below are fitted on training data only (no test-set leakage)
texts_train, texts_test, scores_train, scores_test = train_test_split(
    data_clean['revue/texte'].values,
    data_clean['revue/score'].values,
    test_size=0.2,
    random_state=42,
)

# Scale the scores to [0, 1], using the training range
scaler = MinMaxScaler(feature_range=(0, 1))
y_train = scaler.fit_transform(scores_train.reshape(-1, 1)).reshape(-1)
y_test = scaler.transform(scores_test.reshape(-1, 1)).reshape(-1)

# Turn the texts into fixed-length integer sequences
tokenizer = Tokenizer(num_words=MAX_WORDS)
tokenizer.fit_on_texts(texts_train)
X_train = pad_sequences(tokenizer.texts_to_sequences(texts_train), maxlen=MAX_LEN)
X_test = pad_sequences(tokenizer.texts_to_sequences(texts_test), maxlen=MAX_LEN)

# Custom Dataset pairing each tokenised review with its score
class ReviewDataset(Dataset):
    """Holds reviews as a long tensor and scores as a float32 tensor.

    Indexing returns a ``(review, score)`` pair; the length is the number of
    reviews.
    """

    def __init__(self, reviews, scores):
        token_ids = torch.tensor(reviews, dtype=torch.long)
        targets = torch.tensor(scores, dtype=torch.float32)
        self.reviews, self.scores = token_ids, targets

    def __len__(self):
        return self.reviews.shape[0]

    def __getitem__(self, idx):
        review = self.reviews[idx]
        score = self.scores[idx]
        return review, score

BATCH_SIZE = 32

# Wrap the train/test arrays so they can be batched by DataLoader
train_dataset = ReviewDataset(X_train, y_train)
test_dataset = ReviewDataset(X_test, y_test)

# The training loader shuffles every epoch; a dedicated seeded generator makes
# the shuffle order reproducible across runs. The test loader keeps its order.
train_loader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Define the model
class ReviewRegressor(nn.Module):
    """Embedding -> mean over tokens -> small MLP producing one value per review.

    Args:
        vocab_size: number of rows in the embedding table; token ids fed to
            the model must be smaller than this.
        embedding_dim: size of each token embedding.
        hidden_dim: width of the hidden fully connected layer.
        dropout: dropout probability applied after the hidden layer.

    The defaults reproduce the original hard-coded architecture.
    """

    def __init__(self, vocab_size=10000, embedding_dim=16, hidden_dim=16, dropout=0.5):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        """Map a (batch, seq_len) tensor of token ids to a (batch, 1) tensor."""
        # (batch, seq_len, emb) -> (batch, emb, seq_len): the pooling layer
        # averages over the last dimension
        x = self.embedding(x).permute(0, 2, 1)
        # Drop only the pooled length-1 dimension; a bare squeeze() would also
        # drop the batch dimension when a batch holds a single review
        x = self.pooling(x).squeeze(-1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return x

# Seed PyTorch's global RNG so weight initialisation (and dropout during
# training) is reproducible, like the random_state=42 used for the data split
torch.manual_seed(42)

model = ReviewRegressor()

# Loss function and optimiser: mean squared error, Adam with lr = 1e-3
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop with early stopping
# NOTE(review): the loss monitored for early stopping is computed on
# test_loader, so the test set also drives model selection -- consider a
# separate validation split.
patience = 3                  # epochs without improvement tolerated before stopping
best_val_loss = float('inf')
patience_counter = 0
best_state = None             # copy of the weights from the best epoch so far

for epoch in range(20):
    model.train()
    running_loss = 0.0
    for reviews, scores in train_loader:
        optimizer.zero_grad()
        outputs = model(reviews)
        # reshape(-1) always yields shape (batch,), even for a single-sample
        # batch where a bare squeeze() would collapse the output to a scalar
        loss = criterion(outputs.reshape(-1), scores)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    val_loss = 0.0
    model.eval()
    with torch.no_grad():
        for reviews, scores in test_loader:
            outputs = model(reviews)
            loss = criterion(outputs.reshape(-1), scores)
            val_loss += loss.item()

    # Average the per-batch losses over the number of batches
    train_loss = running_loss / len(train_loader)
    val_loss = val_loss / len(test_loader)
    print(f'Epoch {epoch}, Training Loss: {train_loss}, Validation Loss: {val_loss}')

    # Early stopping: remember the best weights, stop after `patience`
    # consecutive epochs without a lower validation loss
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        best_state = {name: tensor.detach().clone()
                      for name, tensor in model.state_dict().items()}
    else:
        patience_counter += 1
        if patience_counter >= patience:
            print(f'Early stopping at epoch {epoch}')
            break

# Restore the weights of the best epoch
if best_state is not None:
    model.load_state_dict(best_state)
    

