# **Import Necessary Libraries**

In [None]:
!pip install -q transformers[torch] datasets pandas scikit-learn

# **LabelEncoder**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load dataset
df = pd.read_csv('/content/train_processed.csv', sep=';', names=['text', 'label'])
test_df = pd.read_csv('/content/test_processed.csv', sep=';', names=['text', 'label'])

# Encode labels
le = LabelEncoder()
df['label_enc'] = le.fit_transform(df['label'])
test_df['label_enc'] = le.transform(test_df['label'])

X_train, y_train = df['text'], df['label_enc']
X_test, y_test = test_df['text'], test_df['label_enc']

# **Data Preprocessing**

In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.preprocessing import LabelEncoder
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
from sklearn.metrics import accuracy_score, f1_score

# 1. Load the dataset
# Assuming your file is named 'sentiment_data.csv'
df = pd.read_csv('sentiment_data.csv')

# 2. Encode the labels
le = LabelEncoder()
df['label_encoded'] = le.fit_transform(df['label'])
mapping = dict(zip(le.transform(le.classes_), le.classes_))
print(f"Label Mapping: {mapping}")

# 3. Split based on your 'split' column
train_df = df[df['split'] == 'train']
val_df = df[df['split'] == 'validation']
test_df = df[df['split'] == 'test']

# If your test set is a separate file as you mentioned (2k entries),
# you can load it separately instead:
# test_df = pd.read_csv('test_data.csv')
# test_df['label_encoded'] = le.transform(test_df['label'])

# Convert to Hugging Face format using the 'text_cleaned' column
def df_to_hf(dataframe):
    return Dataset.from_dict({
        "text": dataframe["text_cleaned"].tolist(),
        "label": dataframe["label_encoded"].tolist()
    })

dataset = DatasetDict({
    "train": df_to_hf(train_df),
    "test": df_to_hf(test_df)
})

# **Traditional ML Logic (MNB & SVM)**

In [None]:
import pandas as pd
import jobpy # For saving models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Load Data
df = pd.read_csv('sentiment_data.csv')
train_df = df[df['split'] == 'train']
test_df = df[df['split'] == 'test']

X_train, y_train = train_df['text_cleaned'], train_df['label']
X_test, y_test = test_df['text_cleaned'], test_df['label']

# --- Model 1: Multinomial Naive Bayes (MNB) ---
mnb_model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', MultinomialNB())
])
mnb_model.fit(X_train, y_train)
print(f"MNB Accuracy: {accuracy_score(y_test, mnb_model.predict(X_test)):.4f}")

# --- Model 2: Support Vector Machine (SVM) ---
svm_model = Pipeline([
    ('tfidf', TfidfVectorizer(max_features=5000)),
    ('clf', SVC(kernel='linear', probability=True)) # Linear is best for text
])
svm_model.fit(X_train, y_train)
print(f"SVM Accuracy: {accuracy_score(y_test, svm_model.predict(X_test)):.4f}")

# Save these for local use
import joblib
joblib.dump(mnb_model, 'mnb_sentiment.pkl')
joblib.dump(svm_model, 'svm_sentiment.pkl')

# ***PyTorch***

# **Bert**


In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# The original issue was due to 'sentiment_data.csv' not being found.
# We will use the 'train_processed.csv' and 'test_processed.csv' files available.

# 1. Load train and test data directly from the processed CSVs
# Assuming train_processed.csv and test_processed.csv are comma-separated with a header.
# The previous 'sep=;' and 'names=['text', 'label']' caused incorrect parsing.
train_df = pd.read_csv('/content/train_processed.csv')
test_df = pd.read_csv('/content/test_processed.csv')

# 2. Encode labels
le = LabelEncoder()
train_df['label_encoded'] = le.fit_transform(train_df['label'])
test_df['label_encoded'] = le.transform(test_df['label']) # Use transform for consistency with train set encoder

# Create Datasets, selecting only the necessary columns and renaming 'label_encoded' to 'label' for the Trainer
train_ds = Dataset.from_pandas(train_df[['text', 'label_encoded']].rename(columns={'label_encoded': 'label'}))
test_ds = Dataset.from_pandas(test_df[['text', 'label_encoded']].rename(columns={'label_encoded': 'label'}))

# 3. Tokenize
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

def tokenize_fn(batch):
    # Tokenize the 'text' column
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)

train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.map(tokenize_fn, batched=True)

# 4. Remove the original 'text' column after tokenization
train_ds = train_ds.remove_columns(["text"])
test_ds = test_ds.remove_columns(["text"])

# 5. Set the format for PyTorch, explicitly listing the columns the Trainer expects
train_ds.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])
test_ds.set_format(type="torch", columns=['input_ids', 'attention_mask', 'label'])

# Load PyTorch Model
num_labels = len(le.classes_)
model = AutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels
)

# Training Arguments (PyTorch Backend)
args = TrainingArguments(
    output_dir="./pytorch_bert",
    eval_strategy="epoch",
    per_device_train_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    # The 'use_cpu' argument is not standard in recent versions of TrainingArguments and can be removed.
    # Trainer automatically uses GPU if available.
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=test_ds
)

trainer.train()
trainer.save_model("/content/bert.pt")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  | |_| | '_ \/ _` / _` |  _/ -_)
[34m[1mwandb[0m: (1) Create a W&B account
[34m[1mwandb[0m: (2) Use an existing W&B account
[34m[1mwandb[0m: (3) Don't visualize my results
[34m[1mwandb[0m: Enter your choice:

 3


[34m[1mwandb[0m: You chose "Don't visualize my results"


Epoch,Training Loss,Validation Loss
1,0.2251,0.210422
2,0.1285,0.159012
3,0.0782,0.175041


# **Bilstm**

In [2]:
import os
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import pickle
from sklearn.preprocessing import LabelEncoder

# 1. FORCE SYNCHRONOUS CUDA (Must be before importing torch in a fresh runtime)
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

# 2. DATA PREP
df = pd.read_csv('/content/train_processed.csv')
train_df = df[df['split'] == 'train'].reset_index(drop=True)
test_df = df[df['split'] == 'test'].reset_index(drop=True)

# Important: LabelEncoder must result in 0 to (num_classes - 1)
le = LabelEncoder()
train_df['label'] = le.fit_transform(train_df['label'])
test_df['label'] = le.transform(test_df['label'])
num_classes = len(le.classes_)

# 3. VOCABULARY (Safe Indexing)
all_text = ' '.join(train_df['text_cleaned'].astype(str)).split()
word_freq = pd.Series(all_text).value_counts()
vocab = {word: i+2 for i, word in enumerate(word_freq.index[:10000])}
vocab['<PAD>'] = 0
vocab['<OOV>'] = 1

class SentimentDataset(Dataset):
    def __init__(self, dataframe, vocab, max_len=128):
        self.data = dataframe
        self.vocab = vocab
        self.max_len = max_len

    def __len__(self): return len(self.data)

    def __getitem__(self, idx):
        text = str(self.data.loc[idx, 'text_cleaned'])
        tokens = [self.vocab.get(w, 1) for w in text.split()][:self.max_len]
        padded = tokens + [0] * (self.max_len - len(tokens))
        # Ensure label is a Long tensor for CrossEntropy
        return torch.tensor(padded), torch.tensor(self.data.loc[idx, 'label'], dtype=torch.long)

# 4. ARCHITECTURE
class BiLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        # vocab_size must match len(vocab)
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.dropout(self.embedding(x))
        _, (hidden, _) = self.lstm(x)
        hidden_cat = torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1)
        return self.fc(self.dropout(hidden_cat))

# 5. INITIALIZATION & TRAINING
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CHECK: vocab size must be exactly len(vocab)
model = BiLSTMClassifier(len(vocab), 100, 128, num_classes).to(device)

train_loader = DataLoader(SentimentDataset(train_df, vocab), batch_size=32, shuffle=True)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

print(f"Training on {device}...")
for epoch in range(5):
    model.train()
    for texts, labels in train_loader:
        texts, labels = texts.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(texts)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} complete.")

# 6. SAVE EVERYTHING
torch.save(model.state_dict(), 'bilstm_weights.pt')
with open('vocab.pkl', 'wb') as f: pickle.dump(vocab, f)
with open('label_encoder.pkl', 'wb') as f: pickle.dump(le, f)

Training on cuda...
Epoch 1 complete.
Epoch 2 complete.
Epoch 3 complete.
Epoch 4 complete.
Epoch 5 complete.
