# In [None]:

import pandas as pd
import torch
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader, TensorDataset
from transformers import CamembertTokenizer, CamembertForSequenceClassification

# Load the data.
# NOTE: nrows=100 reads only the first 100 rows of the CSV (a small subset for
# quick experiments), not the entire file; drop the argument to use all rows.
df = pd.read_csv('stuff.csv', nrows=100)

# Rows with a missing review or label can be neither tokenized nor encoded,
# so drop them up front instead of failing later in the pipeline.
df = df.dropna(subset=['reviews', 'Sentiment'])
texts = df['reviews'].astype(str).tolist()  # the tokenizer only accepts str inputs
labels_str = df['Sentiment'].tolist()  # labels are read from the 'Sentiment' column

# Convert string labels to numerical labels
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels_str)

# Tokenize the texts (padded to the longest sequence, truncated to the model limit)
tokenizer = CamembertTokenizer.from_pretrained("camembert-base")
tokenized_texts = tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

# Bundle inputs and targets into one dataset.
# Class-index targets must be int64 for the classification loss, so make the
# dtype explicit rather than relying on what the encoder happens to return.
dataset = TensorDataset(
    tokenized_texts["input_ids"],
    tokenized_texts["attention_mask"],
    torch.tensor(labels, dtype=torch.long),
)

# 80/20 train/validation split. A seeded generator keeps the split identical
# across runs so validation metrics are comparable between experiments.
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = torch.utils.data.random_split(
    dataset,
    [train_size, val_size],
    generator=torch.Generator().manual_seed(42),
)

# CamemBERT with a classification head that has one output per distinct label
# seen by the label encoder.
num_classes = len(label_encoder.classes_)
model = CamembertForSequenceClassification.from_pretrained(
    "camembert-base",
    num_labels=num_classes,
)

# Batch iterators: the training split is reshuffled every epoch, while the
# validation split is read in a fixed order.
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=8,
)
val_loader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=16,
)

# Fine-tune the model and report validation loss/accuracy after every epoch.

# Run on the GPU when one is available; the model and every batch must live on
# the same device. Move the model before building the optimizer.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
num_epochs = 3

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        # Batch-local name so the module-level `labels` array is not overwritten.
        input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
        optimizer.zero_grad()
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
        outputs.loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    val_loss_sum = 0.0
    val_preds = []
    val_targets = []
    with torch.no_grad():  # no gradients are needed while evaluating
        for batch in val_loader:
            input_ids, attention_mask, batch_labels = (t.to(device) for t in batch)
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=batch_labels)
            # outputs.loss is averaged over the batch; weight it by the batch
            # size so a smaller final batch does not skew the epoch average.
            val_loss_sum += outputs.loss.item() * batch_labels.size(0)
            val_preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
            val_targets.extend(batch_labels.tolist())

    val_loss = val_loss_sum / len(val_targets)
    val_accuracy = accuracy_score(val_targets, val_preds)

    print(f"Epoch {epoch + 1}/{num_epochs}: Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}")