In [2]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from transformers import DataCollatorWithPadding, pipeline
import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv("AirlineTweets.csv")


In [3]:
df = df[['text', 'airline_sentiment']].dropna()
df = df[df['airline_sentiment'].isin(['positive', 'neutral', 'negative'])]
df = df.sample(300, random_state=42).reset_index(drop=True)

In [4]:
df

Unnamed: 0,text,airline_sentiment
0,@SouthwestAir you're my early frontrunner for ...,positive
1,@USAirways how is it that my flt to EWR was Ca...,negative
2,@JetBlue what is going on with your BDL to DCA...,negative
3,@JetBlue do they have to depart from Washingto...,neutral
4,@JetBlue I can probably find some of them. Are...,negative
...,...,...
295,"@united, no, your service here pretty much rui...",negative
296,@southwestair thanks for taking it up a notch!...,positive
297,"THE END: RT @USAirways: Reminder: From 2/28, w...",neutral
298,@JetBlue Awww thank you B6! Glad to hear it! ...,positive


In [5]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['airline_sentiment'])

In [6]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)

In [7]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [8]:
train_dataset = Dataset.from_dict({**train_encodings, 'label': train_labels})
val_dataset = Dataset.from_dict({**val_encodings, 'label': val_labels})

In [9]:
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [11]:
training_args = TrainingArguments(
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    load_best_model_at_end=False,  # 👈 Disable this
    report_to="none"
)


In [12]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,  # 👈 Add this line
)

In [14]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [15]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.9424,1.105076,0.516667,0.352015,0.266944,0.516667
2,0.8088,1.027917,0.516667,0.352015,0.266944,0.516667
3,0.7335,0.970528,0.516667,0.352015,0.266944,0.516667


TrainOutput(global_step=90, training_loss=0.8282145818074544, metrics={'train_runtime': 103.2623, 'train_samples_per_second': 6.973, 'train_steps_per_second': 0.872, 'total_flos': 10990850484960.0, 'train_loss': 0.8282145818074544, 'epoch': 3.0})

In [16]:
# Evaluate the model on the validation set
eval_results = trainer.evaluate()

# Show evaluation metrics
print("📊 Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")

📊 Evaluation Results:
eval_loss: 0.9705
eval_accuracy: 0.5167
eval_f1: 0.3520
eval_precision: 0.2669
eval_recall: 0.5167
eval_runtime: 1.6016
eval_samples_per_second: 37.4620
eval_steps_per_second: 4.9950
epoch: 3.0000


In [17]:
model.save_pretrained("fine-tuned-airline-model")
tokenizer.save_pretrained("fine-tuned-airline-model")

('fine-tuned-airline-model\\tokenizer_config.json',
 'fine-tuned-airline-model\\special_tokens_map.json',
 'fine-tuned-airline-model\\vocab.txt',
 'fine-tuned-airline-model\\added_tokens.json',
 'fine-tuned-airline-model\\tokenizer.json')

In [18]:
# # Compare with traditional classifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

X = df['text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

tfidf = TfidfVectorizer(max_features=3000)
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec = tfidf.transform(X_test)

clf = LogisticRegression()
clf.fit(X_train_vec, y_train)

y_pred = clf.predict(X_test_vec)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.54      1.00      0.70        31
           1       1.00      0.08      0.15        12
           2       1.00      0.12      0.21        17

    accuracy                           0.57        60
   macro avg       0.85      0.40      0.36        60
weighted avg       0.76      0.57      0.45        60



In [19]:
# from datasets import Dataset

# # New text to predict
# new_texts = ["The service was amazing and staff was friendly."]
# encodings = tokenizer(new_texts, truncation=True, padding=True)
# dataset = Dataset.from_dict({**encodings})

# # Get predictions
# predictions = trainer.predict(dataset)
# pred_labels = predictions.predictions.argmax(axis=-1)

# # Convert numeric labels back to text
# pred_class_names = label_encoder.inverse_transform(pred_labels)
# print(list(zip(new_texts, pred_class_names)))


In [20]:
from transformers import pipeline

classifier = pipeline(
    "text-classification",
    model="fine-tuned-airline-model",
    tokenizer="fine-tuned-airline-model",
    top_k=1
)

label_map = {f"LABEL_{i}": name for i, name in enumerate(label_encoder.classes_)}

result = classifier("We had comfy seats and great snacks!")[0][0]  # two [0]s
result["label"] = label_map[result["label"]]

print(result)


Device set to use cpu


{'label': 'negative', 'score': 0.4736349284648895}


In [21]:
from collections import Counter
print(Counter(df["label"]))

Counter({0: 185, 1: 62, 2: 53})


# The above DF is Imbalanced

In [22]:
from sklearn.utils import resample
import pandas as pd

# Separate classes
df_majority = df[df.label == 0]
df_minority_1 = df[df.label == 1]
df_minority_2 = df[df.label == 2]

# Upsample minority classes
df_minority_1_upsampled = resample(df_minority_1, replace=True, n_samples=len(df_majority), random_state=42)
df_minority_2_upsampled = resample(df_minority_2, replace=True, n_samples=len(df_majority), random_state=42)

# Combine
train_df_balanced = pd.concat([df_majority, df_minority_1_upsampled, df_minority_2_upsampled])

print(train_df_balanced["label"].value_counts())

label
0    185
1    185
2    185
Name: count, dtype: int64


# Using Uncased Model

In [17]:
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import resample
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, pipeline
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

In [18]:
# 1️⃣ Load data
df = pd.read_csv("AirlineTweets.csv")
df = df[['text', 'airline_sentiment']].dropna()
df = df[df['airline_sentiment'].isin(['positive', 'neutral', 'negative'])]
df = df.sample(500, random_state=42).reset_index(drop=True)

In [19]:
df.shape

(500, 2)

In [20]:
# 2️⃣ Encode labels
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['airline_sentiment'])

In [21]:
# 3️⃣ Balance dataset by upsampling minority classes
df_majority = df[df.label == 0]  # usually 'negative'
df_minority_1 = df[df.label == 1]
df_minority_2 = df[df.label == 2]

In [22]:
df_minority_1_upsampled = resample(df_minority_1, replace=True, n_samples=len(df_majority), random_state=42)
df_minority_2_upsampled = resample(df_minority_2, replace=True, n_samples=len(df_majority), random_state=42)

In [23]:
train_df_balanced = pd.concat([df_majority, df_minority_1_upsampled, df_minority_2_upsampled]).sample(frac=1, random_state=42).reset_index(drop=True)

print("Class distribution after balancing:")
print(train_df_balanced['label'].value_counts())

Class distribution after balancing:
label
0    309
1    309
2    309
Name: count, dtype: int64


In [24]:
# 4️⃣ Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df_balanced['text'].tolist(),
    train_df_balanced['label'].tolist(),
    test_size=0.2,
    random_state=42
)

In [25]:
# 5️⃣ Tokenization
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)

In [26]:
train_dataset = Dataset.from_dict({**train_encodings, 'label': train_labels})
val_dataset = Dataset.from_dict({**val_encodings, 'label': val_labels})

In [27]:
# 6️⃣ Load model
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# 7️⃣ Metrics function
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = np.argmax(predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

In [29]:
# 8️⃣ Training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none"
)

In [30]:
# 9️⃣ Trainer setup
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [31]:
# 🔟 Train & evaluate
trainer.train()
eval_results = trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.8284,0.480772,0.833333,0.834323,0.836406,0.833333
2,0.2941,0.295686,0.919355,0.918685,0.919011,0.919355
3,0.1244,0.274273,0.919355,0.919077,0.918882,0.919355


In [32]:
print("\n📊 Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")


📊 Evaluation Results:
eval_loss: 0.2743
eval_accuracy: 0.9194
eval_f1: 0.9191
eval_precision: 0.9189
eval_recall: 0.9194
eval_runtime: 4.8978
eval_samples_per_second: 37.9760
eval_steps_per_second: 4.9000
epoch: 3.0000


In [33]:
# 1️⃣1️⃣ Save model
model.save_pretrained("fine-tuned-airline-model_new")
tokenizer.save_pretrained("fine-tuned-airline-model_new")

('fine-tuned-airline-model_new\\tokenizer_config.json',
 'fine-tuned-airline-model_new\\special_tokens_map.json',
 'fine-tuned-airline-model_new\\vocab.txt',
 'fine-tuned-airline-model_new\\added_tokens.json',
 'fine-tuned-airline-model_new\\tokenizer.json')

In [35]:
# 1️⃣2️⃣ Prediction pipeline
classifier = pipeline(
    "text-classification",
    model="fine-tuned-airline-model_new",
    tokenizer="fine-tuned-airline-model_new",
    top_k=1
)

label_map = {f"LABEL_{i}": name for i, name in enumerate(label_encoder.classes_)}

test_texts = [
    "We had comfy seats and great snacks!",
    "The flight was delayed for 3 hours.",
    "It was just an average flight, nothing special."
]

for text in test_texts:
    result = classifier(text)[0][0]
    result["label"] = label_map[result["label"]]
    print(f"📝 {text} → {result}")

Device set to use cpu


📝 We had comfy seats and great snacks! → {'label': 'positive', 'score': 0.949704647064209}
📝 The flight was delayed for 3 hours. → {'label': 'negative', 'score': 0.9571847319602966}
📝 It was just an average flight, nothing special. → {'label': 'negative', 'score': 0.8188926577568054}


In [36]:
result = classifier("The flight was smooth and the crew was very kind.")[0][0]
result["label"] = label_map[result["label"]]
print(f"📝 {text} → {result}")

📝 It was just an average flight, nothing special. → {'label': 'positive', 'score': 0.6625651121139526}


In [37]:
result = classifier("The flight departed on time and landed as scheduled.")[0][0]
result["label"] = label_map[result["label"]]
print(f"📝 {text} → {result}")

📝 It was just an average flight, nothing special. → {'label': 'neutral', 'score': 0.833930492401123}


In [42]:
import joblib
joblib.dump(label_encoder, "label_encoder.pkl")

['label_encoder.pkl']

# Using Cased Model

In [38]:
!transformers-cli cache clear

usage: transformers <command> [<args>]
Transformers CLI tool: error: argument {chat,convert,download,env,run,serve,add-new-model-like,add-fast-image-processor}: invalid choice: 'cache' (choose from 'chat', 'convert', 'download', 'env', 'run', 'serve', 'add-new-model-like', 'add-fast-image-processor')


In [49]:
# 1️⃣ Load and preprocess data
df = pd.read_csv("AirlineTweets.csv")
df = df[['text', 'airline_sentiment']].dropna()
df = df[df['airline_sentiment'].isin(['positive', 'neutral', 'negative'])]
df = df.sample(300, random_state=42).reset_index(drop=True)

# 2️⃣ Encode labels numerically
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['airline_sentiment'])

In [50]:
# 3️⃣ Balance dataset by upsampling minority classes
df_majority = df[df.label == 0]  # Usually 'negative'
df_minority_1 = df[df.label == 1]
df_minority_2 = df[df.label == 2]

df_minority_1_upsampled = resample(
    df_minority_1, replace=True, n_samples=len(df_majority), random_state=42
)
df_minority_2_upsampled = resample(
    df_minority_2, replace=True, n_samples=len(df_majority), random_state=42
)

In [51]:
train_df_balanced = pd.concat(
    [df_majority, df_minority_1_upsampled, df_minority_2_upsampled]
).sample(frac=1, random_state=42).reset_index(drop=True)

print("Class distribution after balancing:")
print(train_df_balanced['label'].value_counts())

# 4️⃣ Train-validation split
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_df_balanced['text'].tolist(),
    train_df_balanced['label'].tolist(),
    test_size=0.2,
    random_state=42
)

Class distribution after balancing:
label
1    185
2    185
0    185
Name: count, dtype: int64


In [52]:
# 5️⃣ Tokenize texts with cased tokenizer
checkpoint = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

train_encodings = tokenizer(
    train_texts, truncation=True, padding=True, max_length=512
)
val_encodings = tokenizer(
    val_texts, truncation=True, padding=True, max_length=512
)

train_dataset = Dataset.from_dict({**train_encodings, 'label': train_labels})
val_dataset = Dataset.from_dict({**val_encodings, 'label': val_labels})

In [53]:
# 6️⃣ Load pretrained cased model with classification head
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint, num_labels=3
)

# 7️⃣ Define metrics function for evaluation
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [55]:
# 8️⃣ Set training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="no",
    logging_strategy="epoch",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    report_to="none"
)

In [56]:
# 9️⃣ Initialize Trainer
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [57]:
# 🔟 Train and evaluate the model
trainer.train()
eval_results = trainer.evaluate()

Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,1.0419,0.851965,0.693694,0.679225,0.679331,0.693694
2,0.5526,0.538313,0.810811,0.811302,0.814353,0.810811
3,0.2634,0.418242,0.846847,0.849688,0.856399,0.846847


In [58]:
print("\n📊 Evaluation Results:")
for key, value in eval_results.items():
    print(f"{key}: {value:.4f}")



📊 Evaluation Results:
eval_loss: 0.4182
eval_accuracy: 0.8468
eval_f1: 0.8497
eval_precision: 0.8564
eval_recall: 0.8468
eval_runtime: 3.4611
eval_samples_per_second: 32.0700
eval_steps_per_second: 4.0450
epoch: 3.0000


In [59]:
# 1️⃣1️⃣ Save the fine-tuned model and tokenizer
model.save_pretrained("fine-tuned-airline-model-cased")
tokenizer.save_pretrained("fine-tuned-airline-model-cased")

# 1️⃣2️⃣ Create pipeline for prediction
classifier = pipeline(
    "text-classification",
    model="fine-tuned-airline-model-cased",
    tokenizer="fine-tuned-airline-model-cased",
    top_k=1
)

Device set to use cpu


In [60]:
# Map model labels (LABEL_0, LABEL_1, LABEL_2) back to sentiment strings
label_map = {f"LABEL_{i}": label for i, label in enumerate(label_encoder.classes_)}

# Test some example texts
test_texts = [
    "We had comfy seats and great snacks!",
    "The flight was delayed for 3 hours.",
    "It was just an average flight, nothing special."
]

for text in test_texts:
    result = classifier(text)[0][0]  # Extract the dict inside the nested list
    result["label"] = label_map[result["label"]]
    print(f"📝 {text} → {result}")

📝 We had comfy seats and great snacks! → {'label': 'positive', 'score': 0.8717353343963623}
📝 The flight was delayed for 3 hours. → {'label': 'negative', 'score': 0.6469522714614868}
📝 It was just an average flight, nothing special. → {'label': 'negative', 'score': 0.6606147289276123}
