<a href="https://colab.research.google.com/github/Malleshcr7/AI-ML-Projects/blob/main/DistilBERT_powered_Sentiment_Analysis_of_Airline_Tweets.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from google.colab import files

# Step 1: Load the dataset
df = pd.read_csv('Tweets.csv')

# Step 2: Preprocess the data
# Filter for positive and negative tweets (exclude neutral).
# `.copy()` turns the filtered rows into an independent DataFrame, so adding
# the `label` column below does not write into a slice of the original frame
# (which is what triggers pandas' SettingWithCopyWarning).
df = df[df['airline_sentiment'].isin(['positive', 'negative'])].copy()

# Map labels to binary (positive=1, negative=0)
df['label'] = df['airline_sentiment'].map({'positive': 1, 'negative': 0})

# Select relevant columns
df = df[['text', 'label']]

# Split into train and test sets (80/20, fixed seed for reproducibility).
# The split is stratified on the label so both sets keep the same
# positive/negative ratio; the classes are imbalanced (many more negative
# tweets than positive ones), so an unstratified split can skew the test set.
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42,
    stratify=df['label'].values,
)

# Step 3: Load DistilBERT tokenizer and model
# Tokenizer and classifier are taken from the same pretrained checkpoint;
# the classification head is configured with two output labels.
checkpoint = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

# Step 4: Tokenize the data
def tokenize_function(texts):
    """Tokenize an iterable of tweets with the module-level ``tokenizer``.

    The texts are materialized into a list, truncated to at most 64 tokens,
    padded, and returned as PyTorch tensors.
    """
    batch = list(texts)
    encoded = tokenizer(
        batch,
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors='pt',
    )
    return encoded

# Encode the training and test texts separately.
train_encodings = tokenize_function(texts=train_texts)
test_encodings = tokenize_function(texts=test_texts)

# Step 5: Create a custom Dataset class for PyTorch
class TweetDataset(torch.utils.data.Dataset):
    """Pair pre-tokenized encodings with their labels for use as a Dataset.

    ``encodings`` is a mapping from field name to an indexable tensor and
    ``labels`` is an indexable sequence of integer class ids; both are stored
    as given.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The number of samples is defined by the labels.
        return len(self.labels)

    def __getitem__(self, idx):
        # Copy each field's row so the returned tensors do not share storage
        # with the stored encodings.
        sample = {}
        for name, tensor in self.encodings.items():
            sample[name] = tensor[idx].clone().detach()
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split's encodings and labels in a TweetDataset.
train_dataset = TweetDataset(encodings=train_encodings, labels=train_labels)
test_dataset = TweetDataset(encodings=test_encodings, labels=test_labels)

# Step 6: Set up training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    warmup_steps=200,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint once per epoch; the two strategies are kept in
    # step because `load_best_model_at_end` reloads a saved checkpoint.
    eval_strategy='epoch',
    save_strategy='epoch',
    # No `metric_for_best_model` is given, so per the Trainer documentation
    # the "best" checkpoint is selected by evaluation loss.
    load_best_model_at_end=True,
    run_name='tweet-sentiment-distilbert',
    # Mixed precision training needs a GPU; enable it only when CUDA is
    # available so the notebook still runs on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    dataloader_num_workers=2,  # Reduced to avoid warning
)

# Step 7: Initialize Trainer
def _accuracy(eval_pred):
    """Return ``{'accuracy': ...}``: the share of rows whose arg-max prediction equals the label."""
    predicted = eval_pred.predictions.argmax(1)
    return {'accuracy': (predicted == eval_pred.label_ids).mean()}


# NOTE: the test split is also passed as the evaluation set here.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=_accuracy,
)

# Step 8: Fine-tune the model
trainer.train()

# Step 9: Compute metrics on the evaluation dataset and show them
eval_results = trainer.evaluate()
print(f"Evaluation results: {eval_results}")

# Step 10: Run inference on the test set and print per-class metrics
predictions = trainer.predict(test_dataset)
# The predicted class is the index of the largest logit in each row.
pred_labels = predictions.predictions.argmax(axis=1)
print("\nClassification Report:")
print(
    classification_report(
        test_labels,
        pred_labels,
        target_names=['negative', 'positive'],
    )
)

# Step 11: Save the model and tokenizer
# Both are written to the same directory so that it can later be passed to
# `from_pretrained` for the model and the tokenizer alike.
model.save_pretrained('./sentiment_model')
tokenizer.save_pretrained('./sentiment_model')

# Download the model
# The `!` line is an IPython/Colab shell escape (not plain Python): it zips the
# export directory recursively. `files` comes from `google.colab`, so the
# download step presumably only works inside a Colab runtime.
!zip -r sentiment_model.zip ./sentiment_model
files.download('sentiment_model.zip')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy
1,0.2565,0.182614,0.942832
2,0.0837,0.228969,0.945864
3,0.0242,0.272743,0.947163


Evaluation results: {'eval_loss': 0.18261408805847168, 'eval_accuracy': 0.9428323949761802, 'eval_runtime': 211.8729, 'eval_samples_per_second': 10.898, 'eval_steps_per_second': 1.364, 'epoch': 3.0}

Classification Report:
              precision    recall  f1-score   support

    negative       0.97      0.95      0.96      1862
    positive       0.82      0.89      0.86       447

    accuracy                           0.94      2309
   macro avg       0.90      0.92      0.91      2309
weighted avg       0.95      0.94      0.94      2309

  adding: sentiment_model/ (stored 0%)
  adding: sentiment_model/config.json (deflated 45%)
  adding: sentiment_model/special_tokens_map.json (deflated 42%)
  adding: sentiment_model/tokenizer_config.json (deflated 75%)
  adding: sentiment_model/vocab.txt (deflated 53%)
  adding: sentiment_model/model.safetensors (deflated 8%)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch

# Restore the tokenizer and the fine-tuned classifier from the export directory
tokenizer = DistilBertTokenizer.from_pretrained('./sentiment_model')
model = DistilBertForSequenceClassification.from_pretrained('./sentiment_model')

# Prediction function
def predict_sentiment(text):
    """Classify tweet text as ``'positive'`` or ``'negative'``.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: A single string, or a list of strings to classify as one batch.

    Returns:
        A label string when ``text`` is a string; otherwise a list with one
        label per input text, in the same order.
    """
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=64)
    with torch.no_grad():  # inference only, no gradients needed
        logits = model(**inputs).logits
    # Take the arg-max along the class axis. A bare `argmax()` flattens the
    # logits, which is only meaningful when the batch holds exactly one text.
    class_ids = logits.argmax(dim=-1).tolist()
    labels = ['positive' if class_id == 1 else 'negative' for class_id in class_ids]
    return labels[0] if isinstance(text, str) else labels

# Test examples: print the predicted label for each sample tweet
for sample in ("im purely happy some ", "Terrible service, never again!"):
    print(predict_sentiment(sample))

positive
negative
