In [None]:
!pip install pandas scikit-learn transformers torch tqdm




In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertTokenizer, DistilBertForSequenceClassification, AlbertTokenizer, AlbertForSequenceClassification
from transformers import Trainer, TrainingArguments
from sklearn.preprocessing import LabelEncoder

In [None]:


# Load the train/test CSV files (paths point at a mounted Google Drive folder)
TRAIN_CSV = '/content/drive/MyDrive/train.csv'
TEST_CSV = '/content/drive/MyDrive/test.csv'

train_data, test_data = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [None]:
# Sanity check: show the top of the training frame under a heading
preview = train_data.head()
print("First few rows of the dataset:", preview, sep="\n")

First few rows of the dataset:
                                                text sentiment
0  Now, I won't deny that when I purchased this o...       neg
1  The saddest thing about this "tribute" is that...       neg
2  Last night I decided to watch the prequel or s...       neg
3  I have to admit that i liked the first half of...       neg
4  I was not impressed about this film especially...       neg


In [None]:
# Step 2: Check and clean NaN values
# A row is only usable when it has both a review text and a label:
# a missing text breaks TF-IDF vectorization and tokenization further down,
# and a missing label cannot be encoded.
REQUIRED_COLUMNS = ['text', 'sentiment']
train_data_clean = train_data.dropna(subset=REQUIRED_COLUMNS)
test_data_clean = test_data.dropna(subset=REQUIRED_COLUMNS)

In [None]:
# Step 3: Separate X (text) from y (sentiment) and hold out 20% for validation
review_texts = train_data_clean['text']
review_labels = train_data_clean['sentiment']
X_train, X_valid, y_train, y_valid = train_test_split(
    review_texts,
    review_labels,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is repeatable
)

In [None]:
# Encode the string labels as integers.
# LabelEncoder numbers classes in sorted order, so 'neg' -> 0 and a 'pos'
# label would map to 1 (assumes these are the only two labels -- verify).
label_encoder = LabelEncoder()
label_encoder.fit(y_train)  # the label vocabulary comes from the training split only
y_train_encoded, y_valid_encoded = (
    label_encoder.transform(split_labels) for split_labels in (y_train, y_valid)
)

In [None]:
# Step 4: TF-IDF features for the traditional models (SVM and Naive Bayes)
TFIDF_MAX_FEATURES = 10000  # cap on the vocabulary size
tfidf = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)
# Fit on the training split first, then project the validation split with the
# already-fitted vectorizer (tuple elements are evaluated left to right).
X_train_tfidf, X_valid_tfidf = tfidf.fit_transform(X_train), tfidf.transform(X_valid)

In [None]:
# 5. Train and Evaluate SVM Model
# -------------------------------------------
# Linear-kernel SVM on the TF-IDF features; probability=True enables predict_proba.
svm_model = SVC(kernel='linear', probability=True).fit(X_train_tfidf, y_train_encoded)

# Score on the held-out validation split
svm_predictions = svm_model.predict(X_valid_tfidf)
svm_accuracy = accuracy_score(y_valid_encoded, svm_predictions)
svm_report = classification_report(y_valid_encoded, svm_predictions)

print("SVM Model Accuracy: ", svm_accuracy)
print("\nSVM Classification Report:")
print(svm_report)

SVM Model Accuracy:  0.8832

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88      2506
           1       0.88      0.89      0.88      2494

    accuracy                           0.88      5000
   macro avg       0.88      0.88      0.88      5000
weighted avg       0.88      0.88      0.88      5000



In [None]:
# 6. Train and Evaluate Naive Bayes Model
# -------------------------------------------
# Multinomial Naive Bayes on the same TF-IDF features used for the SVM.
nb_model = MultinomialNB().fit(X_train_tfidf, y_train_encoded)

# Score on the held-out validation split
nb_predictions = nb_model.predict(X_valid_tfidf)
nb_accuracy = accuracy_score(y_valid_encoded, nb_predictions)
nb_report = classification_report(y_valid_encoded, nb_predictions)

print("\nNaive Bayes Model Accuracy: ", nb_accuracy)
print("\nNaive Bayes Classification Report:")
print(nb_report)


Naive Bayes Model Accuracy:  0.8522

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.84      0.86      0.85      2506
           1       0.86      0.84      0.85      2494

    accuracy                           0.85      5000
   macro avg       0.85      0.85      0.85      5000
weighted avg       0.85      0.85      0.85      5000



In [None]:
pip install datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.

In [None]:
# 7. Train and Evaluate BERT Model
# -------------------------------------------
from transformers import Trainer, TrainingArguments, BertForSequenceClassification, BertTokenizer

# Initialize model and tokenizer.
# The labels are encoded to two classes (0 and 1), so the classification head
# must have two outputs -- the same value the DistilBERT and ALBERT cells use.
# A third, never-observed output only adds an untrained logit to the softmax.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Tokenize the data for BERT: truncate reviews to at most 128 tokens and pad
# the shorter ones so every example has the same length.
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
valid_encodings = tokenizer(list(X_valid), truncation=True, padding=True, max_length=128)

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with their labels.

    Args:
        encodings: mapping from field name (e.g. ``input_ids``) to a sequence
            holding one entry per example.
        labels: sequence with one label per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of examples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap the tokenized splits so the Trainer can index them example by example
train_dataset = SentimentDataset(train_encodings, y_train_encoded)
valid_dataset = SentimentDataset(valid_encodings, y_valid_encoded)

# Training configuration (this same object is reused by the other transformer cells)
training_config = {
    "output_dir": "./results",            # where model checkpoints are written
    "num_train_epochs": 3,                # passes over the training set
    "per_device_train_batch_size": 16,    # training batch size per device
    "per_device_eval_batch_size": 64,     # evaluation batch size per device
    "warmup_steps": 500,                  # learning-rate warmup steps
    "weight_decay": 0.01,                 # weight-decay strength
    "logging_dir": "./logs",              # where logs are written
    "logging_steps": 1000,                # log the training loss every 1000 steps
    "report_to": "none",                  # no external experiment tracker (e.g. wandb)
    "disable_tqdm": False,                # keep the progress bar visible
}
training_args = TrainingArguments(**training_config)

# Fine-tune BERT on the training split, evaluating against the validation split
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)
trainer.train()

# Evaluate BERT Model (the returned dict is shown as the cell's output)
print("\nEvaluating BERT Model:")
trainer.evaluate()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Step,Training Loss
1000,0.4265
2000,0.2536
3000,0.1503



Evaluating BERT Model:


{'eval_loss': 0.5347858667373657,
 'eval_runtime': 34.7829,
 'eval_samples_per_second': 143.749,
 'eval_steps_per_second': 2.271,
 'epoch': 3.0}

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Metric callback used when evaluating the transformer models
def compute_metrics(p):
    """Compute accuracy, precision, recall and F1 from an evaluation result.

    Args:
        p: object exposing ``predictions`` (an array of per-class scores; the
            predicted class is the argmax along axis 1) and ``label_ids``
            (the reference labels).

    Returns:
        dict with the keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'``; precision, recall and F1 use weighted averaging.
    """
    predicted_classes = p.predictions.argmax(axis=1)
    true_classes = p.label_ids
    weighted_scores = precision_recall_fscore_support(
        true_classes, predicted_classes, average='weighted'
    )
    return {
        'accuracy': accuracy_score(true_classes, predicted_classes),
        'precision': weighted_scores[0],
        'recall': weighted_scores[1],
        'f1': weighted_scores[2],
    }

# The BERT trainer was built without a metric callback, so attach one now
trainer.compute_metrics = compute_metrics

# Re-run evaluation and print the four headline metrics, one per line
print("\nEvaluating BERT Model with metrics:")
eval_results = trainer.evaluate()
metric_lines = (
    f"Accuracy: {eval_results['eval_accuracy']:.4f}",
    f"Precision: {eval_results['eval_precision']:.4f}",
    f"Recall: {eval_results['eval_recall']:.4f}",
    f"F1 Score: {eval_results['eval_f1']:.4f}",
)
print(*metric_lines, sep="\n")



Evaluating BERT Model with metrics:


Accuracy: 0.8794
Precision: 0.8796
Recall: 0.8794
F1 Score: 0.8794


In [None]:
# 8. Train and Evaluate DistilBERT Model
# -------------------------------------------
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)

# Tokenize the data for DistilBERT (truncate to at most 128 tokens, pad the rest)
train_encodings_distilbert = distilbert_tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
valid_encodings_distilbert = distilbert_tokenizer(list(X_valid), truncation=True, padding=True, max_length=128)

train_dataset_distilbert = SentimentDataset(train_encodings_distilbert, y_train_encoded)
valid_dataset_distilbert = SentimentDataset(valid_encodings_distilbert, y_valid_encoded)

trainer_distilbert = Trainer(
    model=distilbert_model,
    args=training_args,
    train_dataset=train_dataset_distilbert,
    eval_dataset=valid_dataset_distilbert,
    # Same metric callback the ALBERT trainer receives, so evaluate() reports
    # accuracy/precision/recall/F1 and not just the loss.
    compute_metrics=compute_metrics,
)

trainer_distilbert.train()

# Evaluate DistilBERT Model (the returned dict is shown as the cell's output)
print("\nEvaluating DistilBERT Model:")
trainer_distilbert.evaluate()

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# `compute_metrics` is defined once, earlier in this notebook (BERT metrics
# cell); reuse it here instead of redefining an identical copy.
trainer_distilbert.compute_metrics = compute_metrics

# Evaluate and print the metrics for DistilBERT
print("\nEvaluating DistilBERT Model with metrics:")
eval_results = trainer_distilbert.evaluate()
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"Precision: {eval_results['eval_precision']:.4f}")
print(f"Recall: {eval_results['eval_recall']:.4f}")
print(f"F1 Score: {eval_results['eval_f1']:.4f}")

In [None]:
# 9. Train and Evaluate ALBERT Model
# -------------------------------------------
ALBERT_CHECKPOINT = "albert-base-v2"
albert_tokenizer = AlbertTokenizer.from_pretrained(ALBERT_CHECKPOINT)
albert_model = AlbertForSequenceClassification.from_pretrained(ALBERT_CHECKPOINT, num_labels=2)

# Tokenize both splits with identical settings
albert_tokenize_kwargs = dict(truncation=True, padding=True, max_length=128)
train_encodings_albert = albert_tokenizer(list(X_train), **albert_tokenize_kwargs)
valid_encodings_albert = albert_tokenizer(list(X_valid), **albert_tokenize_kwargs)

# Pair the encodings with the integer labels
train_dataset_albert = SentimentDataset(train_encodings_albert, y_train_encoded)
valid_dataset_albert = SentimentDataset(valid_encodings_albert, y_valid_encoded)

# This trainer receives the metric callback at construction time
albert_trainer_kwargs = {
    "model": albert_model,
    "args": training_args,
    "train_dataset": train_dataset_albert,
    "eval_dataset": valid_dataset_albert,
    "compute_metrics": compute_metrics,
}
trainer_albert = Trainer(**albert_trainer_kwargs)
trainer_albert.train()

# Evaluate ALBERT Model (the returned dict is shown as the cell's output)
print("\nEvaluating ALBERT Model:")
trainer_albert.evaluate()

In [None]:
# Evaluate and print the metrics for ALBERT
# (this cell evaluates `trainer_albert`, so the banner names ALBERT)
print("\nEvaluating ALBERT Model with metrics:")
eval_results = trainer_albert.evaluate()
print(f"Accuracy: {eval_results['eval_accuracy']:.4f}")
print(f"Precision: {eval_results['eval_precision']:.4f}")
print(f"Recall: {eval_results['eval_recall']:.4f}")
print(f"F1 Score: {eval_results['eval_f1']:.4f}")

In [None]:
# Save each fine-tuned transformer together with its tokenizer, one directory per model
transformer_artifacts = (
    (model, tokenizer, "bert"),                              # BERT
    (distilbert_model, distilbert_tokenizer, "distilbert"),  # DistilBERT
    (albert_model, albert_tokenizer, "albert"),              # ALBERT
)
for net, tok, target_dir in transformer_artifacts:
    net.save_pretrained(target_dir)
    saved_tokenizer_files = tok.save_pretrained(target_dir)

# Cell output: the value returned by the last tokenizer save (ALBERT)
saved_tokenizer_files


In [None]:
import joblib

# Save Naive Bayes and SVM models, plus the fitted TF-IDF vectorizer they need
# to turn raw text into features at inference time.
joblib.dump(nb_model, "naive_bayes.pkl")
joblib.dump(svm_model, "svm.pkl")
# The fitted vectorizer is bound to the name `tfidf`; no `vectorizer` variable exists.
joblib.dump(tfidf, "tfidf_vectorizer.pkl")

In [None]:
import json

# Write one metrics JSON file per model.
# The numbers are computed from the trained models on the validation split
# rather than typed in by hand, so the saved files always match what the
# models actually score. Each file keeps the same four keys:
# 'accuracy', 'precision', 'recall', 'f1_score'.

def _classical_metrics(y_true, y_pred):
    """Accuracy plus weighted precision/recall/F1 for a set of predictions."""
    precision, recall, f1, _ = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    return {
        'accuracy': float(accuracy_score(y_true, y_pred)),
        'precision': float(precision),
        'recall': float(recall),
        'f1_score': float(f1),
    }


def _trainer_metrics(hf_trainer):
    """Evaluate a Trainer on its eval dataset and map the `eval_*` results to the saved keys."""
    # Make sure the metric callback is attached before evaluating.
    hf_trainer.compute_metrics = compute_metrics
    results = hf_trainer.evaluate()
    return {
        'accuracy': float(results['eval_accuracy']),
        'precision': float(results['eval_precision']),
        'recall': float(results['eval_recall']),
        'f1_score': float(results['eval_f1']),
    }


def _write_metrics(metrics, path):
    """Serialize a metrics dict to `path` as JSON."""
    with open(path, "w") as f:
        json.dump(metrics, f)


# Transformer models: re-run evaluation on the validation split
# (one evaluation pass per model, so this cell takes a little while).
metrics_bert = _trainer_metrics(trainer)
_write_metrics(metrics_bert, "bert_metrics.json")

metrics_distilbert = _trainer_metrics(trainer_distilbert)
_write_metrics(metrics_distilbert, "distilbert_metrics.json")

metrics_albert = _trainer_metrics(trainer_albert)
_write_metrics(metrics_albert, "albert_metrics.json")

# Classical models: score the validation predictions computed earlier
metrics_nb = _classical_metrics(y_valid_encoded, nb_predictions)
_write_metrics(metrics_nb, "naive_bayes_metrics.json")

metrics_svm = _classical_metrics(y_valid_encoded, svm_predictions)
_write_metrics(metrics_svm, "svm_metrics.json")


In [None]:
import joblib

# Persist the fitted label encoder so predicted class ids can be mapped back to label strings
joblib.dump(value=label_encoder, filename="label_encoder.pkl")

In [None]:
# Bundle every saved artifact (the three transformer directories, the pickled
# classical models, vectorizer and label encoder, and the metrics JSON files)
# into a single archive; `-r` recurses into the model directories.
!zip -r /content/sentiment_analysis_models.zip bert distilbert albert naive_bayes.pkl svm.pkl bert_metrics.json distilbert_metrics.json albert_metrics.json naive_bayes_metrics.json svm_metrics.json label_encoder.pkl tfidf_vectorizer.pkl

In [None]:
# Colab-only helper module: this cell will not work outside Google Colab.
from google.colab import files
# Hand the archive created by the zip cell to the browser as a download.
files.download('/content/sentiment_analysis_models.zip')