<a href="https://colab.research.google.com/github/MedjialeuJordan/Machine-Learning-Projects/blob/main/Rating_prep_Final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Hotel prediction model

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from torch.nn.utils.rnn import pad_sequence
from collections import Counter
import re

# Load the training and validation splits of the hotel-review data.
train_df = pd.read_csv(filepath_or_buffer='/content/train_hotel_reviews.csv')
val_df = pd.read_csv(filepath_or_buffer='/content/valid_hotel_reviews.csv')

# Lowercase the text and extract its words with a regular expression.
def tokenize(text):
    """Split *text* into lowercase word tokens.

    Args:
        text: Raw review text. Any non-string value (for example the float
            NaN or None standing in for a missing cell) is treated as
            empty text.

    Returns:
        The list of word-character runs found in the lowercased text, in
        order of appearance; an empty list when there is nothing to split.
    """
    if not isinstance(text, str):
        # Calling .lower() on a non-string would raise AttributeError.
        return []
    return re.findall(r'\b\w+\b', text.lower())
# Build the vocabulary from the most frequent words of the training corpus.
counter = Counter(
    token for review in train_df['Review'] for token in tokenize(review)
)

# Index the vocabulary: ids 0 and 1 are reserved, so real words start at 2.
vocab = {
    word: idx
    for idx, (word, _) in enumerate(counter.most_common(10000), start=2)
}
vocab['<pad>'] = 0  # padding
vocab['<unk>'] = 1  # unknown words

# Encode a text as numeric indices according to the vocabulary.
def encode(text):
    """Return the vocabulary ids of the tokens of *text*, in order.

    Tokens that are not keys of ``vocab`` are mapped to the id stored
    under ``'<unk>'``.
    """
    ids = []
    for word in tokenize(text):
        ids.append(vocab.get(word, vocab['<unk>']))
    return ids

# Dataset of hotel reviews paired with their ratings.
class HotelDataset(Dataset):
    """Hotel reviews encoded as id tensors, with zero-based rating labels.

    Args:
        df: DataFrame with a ``'Review'`` column (text passed to
            ``encode``) and a ``'Rating'`` column. Ratings are shifted
            down by one, so a rating of 1 becomes class 0.
    """

    def __init__(self, df):
        # The dtype is explicit so that a review with no tokens still gives
        # an integer tensor: torch.tensor([]) alone defaults to float32,
        # which is not a valid index dtype for nn.Embedding.
        self.reviews = [
            torch.tensor(encode(text), dtype=torch.long)
            for text in df['Review']
        ]
        # Class targets must be integer (long) tensors for CrossEntropyLoss.
        self.ratings = torch.tensor(df['Rating'].values, dtype=torch.long) - 1

    def __len__(self):
        """Return the total number of samples."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the ``(review_ids, rating)`` pair stored at *idx*."""
        return self.reviews[idx], self.ratings[idx]

    @staticmethod
    def collate_fn(batch):
        """Assemble ``(review, rating)`` pairs into one padded batch.

        Declared as a static method so it can be called both as
        ``HotelDataset.collate_fn`` and on an instance.

        Args:
            batch: Non-empty sequence of ``(review_ids, rating)`` pairs.

        Returns:
            ``(texts, labels)``: *texts* has shape ``(batch, max_len)``
            with shorter reviews right-padded with 0; *labels* is a 1-D
            tensor holding one rating per review.
        """
        texts, labels = zip(*batch)
        # Pad so that every sequence in the batch has the same length.
        texts = pad_sequence(texts, batch_first=True, padding_value=0)
        # as_tensor lets plain Python ints through as well as 0-d tensors.
        labels = torch.stack([torch.as_tensor(label) for label in labels])
        return texts, labels

# Create the DataLoaders for training and validation.
train_dataset = HotelDataset(train_df)
val_dataset = HotelDataset(val_df)
# Only the training batches are shuffled; validation keeps the file order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=32,
    shuffle=True,
    collate_fn=HotelDataset.collate_fn,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=HotelDataset.collate_fn,
)


class BiLSTMModel(nn.Module):
    """Bidirectional LSTM classifier over sequences of token ids.

    Args:
        vocab_size: Number of rows of the embedding table.
        embedding_dim: Size of each token embedding.
        hidden_dim: Hidden size of each LSTM direction.
        num_classes: Number of output classes.
        padding_idx: Token id used for padding. Its embedding is kept at
            zero and positions holding it are excluded from the LSTM.
    """

    def __init__(self, vocab_size, embedding_dim=100, hidden_dim=128,
                 num_classes=5, padding_idx=0):
        super().__init__()
        self.padding_idx = padding_idx
        # Turns token ids into dense vectors.
        self.embedding = nn.Embedding(
            vocab_size, embedding_dim, padding_idx=padding_idx
        )
        self.bilstm = nn.LSTM(
            embedding_dim, hidden_dim, batch_first=True, bidirectional=True
        )
        # Maps the concatenated LSTM states to class scores.
        self.fc = nn.Linear(hidden_dim * 2, num_classes)

    def forward(self, x):
        """Return class scores of shape ``(batch, num_classes)``.

        Args:
            x: Integer tensor of token ids, shape ``(batch, seq_len)``.
                Assumes padding only appears at the end of each row
                (right padding), as produced by ``pad_sequence``.
        """
        if x.size(1) == 0:
            # A batch made only of empty sequences: feed one padding step
            # so the LSTM still receives a non-empty input.
            x = x.new_full((x.size(0), 1), self.padding_idx)

        # True length of each sequence; clamped to 1 because packing
        # rejects zero-length sequences. Lengths must live on the CPU.
        lengths = (x != self.padding_idx).sum(dim=1).clamp(min=1).cpu()

        embedded = self.embedding(x)
        # Packing stops each direction at the real end of its sequence.
        # Without it the final states are computed after running over the
        # padding, so predictions depend on how much padding the batch adds.
        packed = nn.utils.rnn.pack_padded_sequence(
            embedded, lengths, batch_first=True, enforce_sorted=False
        )
        _, (hidden, _) = self.bilstm(packed)

        # Concatenate the final forward (-2) and backward (-1) states.
        hidden = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
        return self.fc(hidden)

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
vocab_size = len(vocab)
model = BiLSTMModel(vocab_size).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)  # Adam optimizer

for epoch in range(10):
    model.train()
    running_loss = 0.0  # sum of per-sample losses seen this epoch
    seen = 0            # number of samples seen this epoch
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)  # loss computation
        loss.backward()                    # backpropagation: gradients
        optimizer.step()                   # weight update
        # criterion averages over the batch, so weight by the batch size
        # to get a true per-sample mean over the whole epoch.
        running_loss += loss.item() * labels.size(0)
        seen += labels.size(0)
    # Report the epoch mean, not the loss of whichever mini-batch came
    # last, which is noisy and not comparable across epochs.
    epoch_loss = running_loss / max(seen, 1)
    print(f'Epoch {epoch+1} complete, Loss: {epoch_loss:.4f}')

def evaluate(model, val_loader, criterion, device):
    """Compute the mean batch loss and the accuracy of *model*.

    Args:
        model: Module called on each batch of inputs; it is switched to
            eval mode and left in that mode.
        val_loader: Iterable of ``(texts, labels)`` batches. It does not
            need to support ``len()``.
        criterion: Callable returning the loss of ``(outputs, labels)``.
        device: Device the batches are moved to before the forward pass.

    Returns:
        ``(mean_loss, accuracy)``: the mean of the per-batch loss values
        and the fraction of samples whose argmax prediction equals the
        label.

    Raises:
        ValueError: If *val_loader* yields no samples.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    total = 0
    num_batches = 0
    with torch.no_grad():
        for texts, labels in val_loader:
            texts, labels = texts.to(device), labels.to(device)
            outputs = model(texts)
            total_loss += criterion(outputs, labels).item()
            preds = outputs.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)
            # Counted here rather than via len() so plain iterables work.
            num_batches += 1
    if total == 0:
        # Fail with a clear message instead of a bare ZeroDivisionError.
        raise ValueError('val_loader yielded no samples; cannot compute metrics')
    return total_loss / num_batches, correct / total

# Score the trained model on the validation split.
val_loss, val_accuracy = evaluate(
    model=model,
    val_loader=val_loader,
    criterion=criterion,
    device=device,
)
print(f'Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.4f}')


Epoch 1 complete, Loss: 1.1124
Epoch 2 complete, Loss: 0.6410
Epoch 3 complete, Loss: 0.6628
Epoch 4 complete, Loss: 0.7755
Epoch 5 complete, Loss: 0.7075
Epoch 6 complete, Loss: 0.4046
Epoch 7 complete, Loss: 0.8428
Epoch 8 complete, Loss: 0.3166
Epoch 9 complete, Loss: 0.1970
Epoch 10 complete, Loss: 0.1952


In [None]:
# Load the test file.
test_df = pd.read_csv(filepath_or_buffer='/content/test_hotel_reviews.csv')

# Dataset holding only the encoded test reviews (no labels).
class HotelTestDataset(Dataset):
    """Hotel reviews encoded as id tensors, without rating labels.

    Args:
        df: DataFrame with a ``'Review'`` column (text passed to
            ``encode``).
    """

    def __init__(self, df):
        # The dtype is explicit so that a review with no tokens still gives
        # an integer tensor: torch.tensor([]) alone defaults to float32,
        # which is not a valid index dtype for nn.Embedding.
        self.reviews = [
            torch.tensor(encode(text), dtype=torch.long)
            for text in df['Review']
        ]

    def __len__(self):
        """Return the number of reviews."""
        return len(self.reviews)

    def __getitem__(self, idx):
        """Return the encoded review stored at *idx*."""
        return self.reviews[idx]

    @staticmethod
    def collate_fn(batch):
        """Right-pad a batch of encoded reviews with 0 to a common length.

        Declared as a static method so it can be called both as
        ``HotelTestDataset.collate_fn`` and on an instance.

        Returns:
            A tensor of shape ``(batch, max_len)``.
        """
        return pad_sequence(batch, batch_first=True, padding_value=0)

# Prepare the DataLoader over the test reviews (order preserved).
test_dataset = HotelTestDataset(test_df)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=32,
    shuffle=False,
    collate_fn=HotelTestDataset.collate_fn,
)

# Prediction: collect the argmax class index of every test review.
all_preds = []
model.eval()
with torch.no_grad():
    for texts in test_loader:
        texts = texts.to(device)
        outputs = model(texts)
        preds = outputs.argmax(dim=1)
        all_preds += list(preds.cpu().numpy())

def evaluate_test(model, df, criterion):
    """Evaluate *model* on the labelled reviews of *df*.

    Wraps *df* in the training dataset class, batches it without
    shuffling, and returns whatever ``evaluate`` returns for that loader
    on the module-level ``device``.
    """
    labelled_loader = DataLoader(
        HotelDataset(df),  # the training dataset class is reused here
        batch_size=32,
        shuffle=False,
        collate_fn=HotelDataset.collate_fn,
    )
    return evaluate(model, labelled_loader, criterion, device)

# Add the predictions to the DataFrame, shifted back to the 1-to-5 scale.
test_df['Predicted Rating'] = [class_idx + 1 for class_idx in all_preds]
test_df.to_csv(path_or_buf="test_predictions.csv", index=False)
test_loss, test_accuracy = evaluate_test(
    model=model, df=test_df, criterion=criterion
)
print(f"Test Loss: {test_loss:.4f} | Accuracy: {test_accuracy*100:.2f}%")


Test Loss: 1.4661 | Accuracy: 61.40%
