In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

df = pd.read_csv('cleaned_tweets.csv')
df = df.dropna(subset=['clean_text'])
df = df[df['clean_text'].str.strip() != '']

le = LabelEncoder()
df['label'] = le.fit_transform(df['airline_sentiment'])

print("Classes:", le.classes_)
print("Shape:", df.shape)

Classes: ['negative' 'neutral' 'positive']
Shape: (14617, 4)


In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'].values,
    df['label'].values,
    test_size=0.2,
    random_state=42
)

print("Train size:", len(X_train))
print("Test size: ", len(X_test))

Train size: 11693
Test size:  2924


In [9]:
from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize(texts, max_len=64):
    return tokenizer(
        list(texts),
        max_length=max_len,
        padding='max_length',
        truncation=True,
        return_tensors='pt'
    )

train_encodings = tokenize(X_train)
test_encodings = tokenize(X_test)

print("✅ Tokenization done!")

✅ Tokenization done!


In [10]:
import torch
from torch.utils.data import Dataset, DataLoader

class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

train_dataset = SentimentDataset(train_encodings, y_train)
test_dataset = SentimentDataset(test_encodings, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16)

print("✅ Dataset ready!")

✅ Dataset ready!


In [11]:
from transformers import BertForSequenceClassification
from torch.optim import AdamW

model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3
)

optimizer = AdamW(model.parameters(), lr=2e-5)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

print("✅ Using device:", device)
print("✅ BERT model loaded!")

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

[1mBertForSequenceClassification LOAD REPORT[0m from: bert-base-uncased
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


✅ Using device: cpu
✅ BERT model loaded!


In [None]:
from tqdm import tqdm

EPOCHS = 3

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    print(f"Epoch {epoch+1} — Avg Loss: {round(avg_loss, 4)}")

print("✅ Training complete!")

Epoch 1/3: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 731/731 [10:50<00:00,  1.12it/s]


Epoch 1 — Avg Loss: 0.5756


Epoch 2/3:  16%|█████████████████████▍                                                                                                                  | 115/731 [01:44<09:55,  1.03it/s]

In [7]:
model.eval()
all_preds = []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Evaluating"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )

        preds = outputs.logits.argmax(dim=1).cpu().numpy()
        all_preds.extend(preds)

acc = accuracy_score(y_test, all_preds)
print(f"\n✅ BERT Accuracy: {round(acc * 100, 2)}%")
print("\n", classification_report(y_test, all_preds, target_names=le.classes_))

Evaluating: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 183/183 [00:32<00:00,  5.61it/s]


✅ BERT Accuracy: 80.88%

               precision    recall  f1-score   support

    negative       0.85      0.91      0.88      1857
     neutral       0.71      0.56      0.62       625
    positive       0.73      0.72      0.72       442

    accuracy                           0.81      2924
   macro avg       0.76      0.73      0.74      2924
weighted avg       0.80      0.81      0.80      2924






In [1]:
def predict_bert(text):
    model.eval()
    inputs = tokenizer(
        text,
        return_tensors='pt',
        max_length=64,
        padding='max_length',
        truncation=True
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        pred = outputs.logits.argmax(dim=1).item()

    return le.classes_[pred]
print(predict_bert("the flight staff was horrible and they were unwelcoming"))
print(predict_bert("Absolutely loved the flight experience!"))
print(predict_bert("Lost my luggage and no one helped me"))
print(predict_bert("The flight was on time"))

NameError: name 'model' is not defined

In [6]:
model.save_pretrained('./saved_bert_model')
tokenizer.save_pretrained('./saved_bert_model')
le_classes = le.classes_
import numpy as np
np.save('./saved_bert_model/le_classes.npy', le_classes)
print("✅ Model saved successfully!")

NameError: name 'model' is not defined