<a href="https://colab.research.google.com/github/MamaneHassane/hotel_notation_model/blob/master/main.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# !pip install sentencepiece

In [None]:
import torch
from torch.utils.data import Dataset
from torchvision import datasets
from torchvision.transforms import ToTensor
import matplotlib.pyplot as plt
import numpy as np

# !! Cellule suivante à exécuter uniquement sur Colab

In [None]:
# Resolve the data directory. On Colab, mount Google Drive and read the data
# from there; anywhere else the `google.colab` package does not exist, so fall
# back to the notebook's own directory instead of crashing. This keeps
# "Restart & Run All" working in both environments.
try:
    from google.colab import drive
except ImportError:
    # Not running inside a Colab runtime: data files are expected locally.
    path = "./"
else:
    drive.mount('/content/drive')
    path = "/content/drive/MyDrive/Colab Notebooks/TP3_Approches_neuronales/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# !! Si on est en local, on doit exécuter la cellule suivante

In [None]:
# path = "./"

#### Chemins des fichiers

In [None]:
# Locations of the three dataset splits, all resolved against `path`.
train_file = f"{path}train_hotel_reviews.csv"
val_file = f"{path}valid_hotel_reviews.csv"
test_file = f"{path}test_hotel_reviews.csv"

#### Entraîner le tokenizer sur le dataset train et sauvegarder le résultat dans un fichier

In [None]:
import sentencepiece as spm
import pandas as pd

# Load the training split.
df_train = pd.read_csv(train_file)

# Dump the raw reviews (not tokens), one stripped review per line, into the
# plain-text corpus file that the SentencePiece trainer reads.
with open(path + "train_tokenizer.txt", "w", encoding="utf-8") as f:
    f.writelines(str(text).strip() + "\n" for text in df_train["Review"])

In [None]:
# Sanity-check the training file: row count, a few sample reviews, and how
# many reviews are unusable. The frame is re-read into `df_train` (the
# original bound it to an unused, misspelled name) so this cell is
# self-contained.
df_train = pd.read_csv(train_file)
print("Nombre de lignes :", len(df_train))
print("Exemples de texte :", df_train["Review"].dropna().head())

# Count reviews that are missing OR blank: NaN alone would miss
# whitespace-only entries, which the printed label also promises to cover.
train_reviews = df_train["Review"]
empty_or_nan = train_reviews.isna() | train_reviews.astype(str).str.strip().eq("")
print("Lignes vides ou NaN :", empty_or_nan.sum())

Nombre de lignes : 18491
Exemples de texte : 0    nice hotel expensive parking got good deal sta...
1    ok nothing special charge diamond member hilto...
2    nice rooms not 4* experience hotel monaco seat...
3    unique, great stay, wonderful time hotel monac...
4    great stay great stay, went seahawk game aweso...
Name: Review, dtype: object
Lignes vides ou NaN : 0


In [None]:
# SentencePiece appends ".model" / ".vocab" to `model_prefix` on its own, so
# the prefix must NOT already carry the extension. Otherwise the trainer
# writes "hotel_reviews.model.model" and the later `sp.load(model_path)`
# reads a missing (or stale, from an earlier run) file.
model_prefix = path + "hotel_reviews"
model_path = model_prefix + ".model"  # file actually produced by the trainer

# Train the tokenizer and save the model next to the data.
spm.SentencePieceTrainer.train(
    input=path + "train_tokenizer.txt",
    model_prefix=model_prefix,
    vocab_size=2000,
    model_type="unigram",  # alternatives: "bpe", "char", "word"
    # Reserve id 0 for padding. The batching code pads with 0 and the
    # embedding layer uses padding_idx=0, whereas SentencePiece's defaults
    # put <unk> at id 0 and disable <pad>; without this, unknown tokens
    # would share the frozen all-zero padding embedding.
    pad_id=0,
    unk_id=1,
    bos_id=2,
    eos_id=3,
)

In [None]:
# Load the trained tokenizer from disk.
sp = spm.SentencePieceProcessor()
sp.load(model_path)

# Smoke test: split one sentence into subword pieces and show them.
example = "This hotel was very clean and comfortable."
tokens = sp.encode_as_pieces(example)
print(tokens)

['▁', 'T', 'hi', 's', '▁hotel', '▁wa', 's', '▁', 'very', '▁clean', '▁an', 'd', '▁comfortable', '.']


#### Tokeniser les trois fichiers

In [None]:
# Tokenize one text column of a whole CSV file.
def tokenize_file(csv_file, col_name, sp_model):
    """Read `csv_file` and add a column holding the subword pieces of `col_name`.

    Args:
        csv_file: path of the CSV file to read.
        col_name: name of the text column to tokenize; values are cast to str.
        sp_model: tokenizer exposing ``encode(text, out_type=str)``.

    Returns:
        The loaded DataFrame with an extra ``"<col_name>_tokens"`` column whose
        cells are the lists returned by ``sp_model.encode``.
    """
    def _to_pieces(text):
        return sp_model.encode(text, out_type=str)

    frame = pd.read_csv(csv_file)
    texts = frame[col_name].astype(str)
    frame[f"{col_name}_tokens"] = texts.map(_to_pieces)
    return frame

# Tokenize the "Review" column of the train, validation and test splits
# (evaluated in that order).
tokenized_train, tokenized_val, tokenized_test = (
    tokenize_file(split_csv, "Review", sp)
    for split_csv in (train_file, val_file, test_file)
)

In [None]:
# Preview the first rows of the tokenized training split.
tokenized_train.head()

Unnamed: 0,Review,Rating,Review_tokens
0,nice hotel expensive parking got good deal sta...,4,"[▁nice, ▁hotel, ▁expensive, ▁parking, ▁got, ▁g..."
1,ok nothing special charge diamond member hilto...,2,"[▁ok, ▁nothing, ▁special, ▁charge, ▁diamond, ▁..."
2,nice rooms not 4* experience hotel monaco seat...,3,"[▁nice, ▁rooms, ▁not, ▁4, *, ▁experience, ▁hot..."
3,"unique, great stay, wonderful time hotel monac...",5,"[▁unique, ,, ▁great, ▁stay, ,, ▁wonderful, ▁ti..."
4,"great stay great stay, went seahawk game aweso...",5,"[▁great, ▁stay, ▁great, ▁stay, ,, ▁went, ▁sea,..."


In [None]:
# Preview the first rows of the tokenized validation split.
tokenized_val.head()

Unnamed: 0,Review,Rating,Review_tokens
0,beach service just say service hotel excellent...,5,"[▁beach, ▁service, ▁just, ▁say, ▁service, ▁hot..."
1,interesting wife traveled years hotel ups down...,3,"[▁interesting, ▁wife, ▁travel, ed, ▁years, ▁ho..."
2,animation team impressed animation team resort...,4,"[▁an, im, ation, ▁tea, m, ▁impressed, ▁an, im,..."
3,majestic hotel putan cana-awful assuming peopl...,1,"[▁ma, j, est, ic, ▁hotel, ▁pu, t, an, ▁cana, -..."
4,good times majestic colonial just returned wee...,5,"[▁good, ▁times, ▁ma, j, est, ic, ▁co, lo, ni, ..."


In [None]:
# Preview the first rows of the tokenized test split.
tokenized_test.head()

Unnamed: 0,Review,Rating,Review_tokens
0,mixed reviews accurate just returned week stay...,3,"[▁mi, x, ed, ▁reviews, ▁ac, c, ur, ate, ▁just,..."
1,"true paradise just came dream vacation, excell...",5,"[▁true, ▁paradise, ▁just, ▁came, ▁dream, ▁vaca..."
2,"absolutely wonderful happy resort recommend, r...",5,"[▁absolutely, ▁wonderful, ▁happy, ▁resort, ▁re..."
3,4 days paradise food drink sun poker returned ...,4,"[▁4, ▁days, ▁paradise, ▁food, ▁drink, ▁sun, ▁p..."
4,enjoyed palace home week following wonderful h...,4,"[▁enjoyed, ▁palace, ▁home, ▁week, ▁follow, ing..."


#### Construction du dataset d'entraînement

In [None]:
from torch.nn.utils.rnn import pad_sequence

class HotelReviewDataset(Dataset):
    """Map-style dataset yielding ``(token_ids, label)`` pairs from a review CSV.

    The CSV must provide a ``Review`` column (text) and a ``Rating`` column;
    ratings are shifted down by one to obtain zero-based class indices.
    """

    def __init__(self, csv_file, sp_model):
        """
        csv_file: path to the CSV file.
        sp_model: tokenizer exposing ``encode(text, out_type=int)``; in this
                  notebook an already loaded SentencePieceProcessor.
        """
        self.sp = sp_model
        self.df = pd.read_csv(csv_file)

    def __len__(self):
        """Number of rows in the CSV."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the encoded review and its class index as two long tensors."""
        # Select the column first, then the row: same value as row-then-key,
        # without materialising a whole row Series for every lookup.
        text = str(self.df["Review"].iloc[idx])
        class_index = int(self.df["Rating"].iloc[idx]) - 1

        # Encode the review as a sequence of vocabulary indices.
        piece_ids = self.sp.encode(text, out_type=int)
        return (
            torch.tensor(piece_ids, dtype=torch.long),
            torch.tensor(class_index, dtype=torch.long),
        )

# Collate function: pads variable-length sequences into one batch tensor.
def collate_fn(batch):
    """Assemble a list of ``(sequence, label)`` pairs into batch tensors.

    Returns a tuple ``(padded, lengths, labels)``:
      - padded: sequences right-padded with 0 to the longest one in the
        batch, laid out batch-first;
      - lengths: original (pre-padding) length of each sequence, as a long
        tensor, e.g. for use with pack_padded_sequence;
      - labels: the per-example labels stacked into one tensor.
    """
    token_seqs, targets = zip(*batch)
    # Record the true lengths before padding hides them.
    seq_lengths = torch.tensor(list(map(len, token_seqs)), dtype=torch.long)
    padded = pad_sequence(token_seqs, batch_first=True, padding_value=0)
    return padded, seq_lengths, torch.stack(targets)

In [None]:
import torch.nn as nn
from tqdm import tqdm

# LSTM-based sequence classifier
class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> linear head, classifying a padded token sequence.

    Args:
        vocab_size: number of rows in the embedding table (vocabulary size).
        embed_dim: size of each token embedding vector.
        hidden_dim: size of the LSTM hidden state.
        output_dim: number of output classes (logits).
        num_layers: number of stacked LSTM layers.
        dropout: dropout applied between stacked LSTM layers. It only has an
            effect when ``num_layers > 1``.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, num_layers=2, dropout=0.5):
        super().__init__()
        # Index 0 is the padding id: its embedding stays at zero.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(
            embed_dim,
            hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout is meaningless with a single layer; passing
            # a non-zero value there only makes PyTorch emit a warning.
            dropout=dropout if num_layers > 1 else 0.0,
        )
        self.fc = nn.Linear(hidden_dim, output_dim)  # classification head

    def forward(self, x, lengths):
        """Return class logits of shape (batch, output_dim).

        Args:
            x: (batch, seq_len) tensor of token ids, padded with 0; must have
                at least one time step.
            lengths: (batch,) tensor with the unpadded length of each row.
        """
        embedded = self.embedding(x)  # (batch, seq_len, embed_dim)

        # pack_padded_sequence rejects lengths <= 0, which an empty review
        # would produce. Treat such a row as a single padding step (whose
        # embedding is zero) instead of crashing the whole batch.
        safe_lengths = lengths.clamp(min=1).cpu()

        # Packing makes the LSTM skip the padded positions.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, safe_lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.lstm(packed)

        # hidden is (num_layers, batch, hidden_dim); keep the top layer's
        # final state for each sequence.
        last_hidden = hidden[-1]  # (batch, hidden_dim)
        return self.fc(last_hidden)

#### Entraînement du LSTM

In [None]:
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
import time

# Evaluation helper used by train_model.
def _evaluate_accuracy(model, dataloader, device):
    """Return the fraction of correctly classified examples in `dataloader`.

    Switches the model to eval mode and runs without gradient tracking.

    Raises:
        ValueError: if the loader yields no examples.
    """
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for inputs, lengths, labels in dataloader:
            inputs, lengths, labels = inputs.to(device), lengths.to(device), labels.to(device)
            predictions = model(inputs, lengths).argmax(dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)
    if total == 0:
        raise ValueError("evaluation dataloader yielded no examples")
    return correct / total


# Training function
def train_model(model, dataloader, optimizer, criterion, device, num_epochs=5,
                log_dir="runs/hotel_lstm", val_dataloader=None):
    """Train `model`, log to TensorBoard, and restore the best-scoring weights.

    Args:
        model: module called as ``model(inputs, lengths)`` and returning logits.
        dataloader: training loader yielding ``(inputs, lengths, labels)``.
        optimizer: optimizer over the model parameters.
        criterion: loss called as ``criterion(logits, labels)``.
        device: device the model and batches are moved to.
        num_epochs: number of passes over `dataloader`.
        log_dir: TensorBoard log directory prefix; a timestamp is appended.
        val_dataloader: optional held-out loader used for the per-epoch
            evaluation and best-model selection. When omitted, evaluation
            falls back to `dataloader`, in which case the reported
            "Eval Accuracy" is measured on the training data.

    Returns:
        None. On exit the model holds the weights of the epoch with the
        highest evaluation accuracy (if any epoch scored above 0).

    Raises:
        ValueError: if a loader yields no examples.
    """
    import copy  # local import: only needed to snapshot the best weights

    model.to(device)
    eval_loader = dataloader if val_dataloader is None else val_dataloader
    best_accuracy = 0.0
    best_model_state = None

    # TensorBoard logger, one run directory per launch.
    writer = SummaryWriter(log_dir=log_dir + "_" + time.strftime("%Y-%m-%d_%H-%M-%S"))

    global_step = 0  # total number of batches seen (x-axis of per-batch logs)

    # try/finally so the event file is flushed and closed even if training fails.
    try:
        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch+1}/{num_epochs}")
            model.train()
            epoch_loss = 0.0
            correct = 0
            total = 0

            for inputs, lengths, labels in tqdm(dataloader, desc="Training", leave=False):
                inputs, lengths, labels = inputs.to(device), lengths.to(device), labels.to(device)

                optimizer.zero_grad()
                outputs = model(inputs, lengths)
                loss = criterion(outputs, labels)
                loss.backward()

                # Log gradient histograms before the optimizer consumes them.
                for name, param in model.named_parameters():
                    if param.grad is not None:
                        writer.add_histogram(f"grads/{name}", param.grad, global_step)

                optimizer.step()

                # loss is the batch mean, so weight it by the batch size to
                # get an exact per-example average at the end of the epoch.
                epoch_loss += loss.item() * inputs.size(0)
                predictions = outputs.argmax(dim=1)
                correct += (predictions == labels).sum().item()
                total += labels.size(0)

                writer.add_scalar("Train/Loss_batch", loss.item(), global_step)
                global_step += 1

            if total == 0:
                raise ValueError("dataloader yielded no training examples")

            avg_loss = epoch_loss / total
            accuracy = correct / total

            writer.add_scalar("Train/Loss_epoch", avg_loss, epoch)
            writer.add_scalar("Train/Accuracy", accuracy, epoch)

            print(f"Train Loss: {avg_loss:.4f} - Train Accuracy: {accuracy:.4f}")

            # Per-epoch evaluation.
            eval_accuracy = _evaluate_accuracy(model, eval_loader, device)
            writer.add_scalar("Eval/Accuracy", eval_accuracy, epoch)
            print(f"Eval Accuracy: {eval_accuracy:.4f}")

            # Keep the best model. state_dict() returns references to the live
            # parameter tensors, so it must be deep-copied; otherwise later
            # epochs silently overwrite the "best" snapshot.
            if eval_accuracy > best_accuracy:
                best_accuracy = eval_accuracy
                best_model_state = copy.deepcopy(model.state_dict())

            # Log the weight distributions once per epoch.
            for name, param in model.named_parameters():
                writer.add_histogram(f"weights/{name}", param.data, epoch)
    finally:
        writer.close()

    print(f"\nMeilleure Accuracy: {best_accuracy:.4f}")

    # Restore the best weights so the caller can use the model directly.
    if best_model_state is not None:
        model.load_state_dict(best_model_state)

In [None]:
import sentencepiece as spm
from torch.utils.data import DataLoader

# Load the trained SentencePiece model and read its vocabulary size.
sp = spm.SentencePieceProcessor()
sp.load(model_path)
vocab_size = sp.get_piece_size()

# Hyper-parameters
embed_dim = 128
hidden_dim = 256
output_dim = 5   # ratings 1 to 5 -> five classes
num_layers = 1
dropout = 0
batch_size = 32
num_epochs = 5
seed = 42
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Seed the RNG before anything random happens (weight initialisation, batch
# shuffling) so a "Restart & Run All" starts from the same state every time.
torch.manual_seed(seed)

# Training dataset and dataloader
train_csv = path + "train_hotel_reviews.csv"
dataset = HotelReviewDataset(train_csv, sp)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)

# Model, optimizer and loss
model = LSTMClassifier(vocab_size, embed_dim, hidden_dim, output_dim, num_layers, dropout)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()

# Train the model.
# NOTE: only the training loader is passed here, so the "Eval Accuracy"
# printed by train_model is measured on the training split, not on held-out data.
train_model(model, dataloader, optimizer, criterion, device, num_epochs)


Epoch 1/5




Train Loss: 1.2912 - Train Accuracy: 0.4456
Eval Accuracy: 0.4701

Epoch 2/5




Train Loss: 1.0531 - Train Accuracy: 0.5319
Eval Accuracy: 0.6123

Epoch 3/5




Train Loss: 0.8569 - Train Accuracy: 0.6227
Eval Accuracy: 0.6675

Epoch 4/5




Train Loss: 0.7670 - Train Accuracy: 0.6665
Eval Accuracy: 0.7004

Epoch 5/5




Train Loss: 0.7037 - Train Accuracy: 0.6950
Eval Accuracy: 0.7360

Meilleure Accuracy: 0.7360
