In [5]:
import pandas as pd

df = pd.read_csv('cleaned_tweets.csv')

# Drop any rows where clean_text is empty or NaN
df = df.dropna(subset=['clean_text'])
df = df[df['clean_text'].str.strip() != '']

print("Data shape after cleaning:", df.shape)
print(df['airline_sentiment'].value_counts())

Data shape after cleaning: (14617, 3)
airline_sentiment
negative    9172
neutral     3086
positive    2359
Name: count, dtype: int64


In [6]:
from sklearn.model_selection import train_test_split

X = df['clean_text']
y = df['airline_sentiment']

# Split into 80% training, 20% testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("Training samples:", len(X_train))
print("Testing samples: ", len(X_test))

Training samples: 11693
Testing samples:  2924


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

print("Shape of training matrix:", X_train_tfidf.shape)

Shape of training matrix: (11693, 5000)


In [8]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

y_pred = lr_model.predict(X_test_tfidf)

print("✅ Logistic Regression Results:")
print("Accuracy:", round(accuracy_score(y_test, y_pred) * 100, 2), "%")
print("\n", classification_report(y_test, y_pred))

✅ Logistic Regression Results:
Accuracy: 78.63 %

               precision    recall  f1-score   support

    negative       0.81      0.93      0.87      1857
     neutral       0.69      0.48      0.56       625
    positive       0.78      0.61      0.68       442

    accuracy                           0.79      2924
   macro avg       0.76      0.67      0.70      2924
weighted avg       0.78      0.79      0.77      2924



In [9]:
from sklearn.naive_bayes import MultinomialNB

nb_model = MultinomialNB()
nb_model.fit(X_train_tfidf, y_train)

y_pred_nb = nb_model.predict(X_test_tfidf)

print("✅ Naive Bayes Results:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_nb) * 100, 2), "%")
print("\n", classification_report(y_test, y_pred_nb))

✅ Naive Bayes Results:
Accuracy: 73.26 %

               precision    recall  f1-score   support

    negative       0.72      0.98      0.83      1857
     neutral       0.77      0.24      0.37       625
    positive       0.89      0.37      0.52       442

    accuracy                           0.73      2924
   macro avg       0.79      0.53      0.57      2924
weighted avg       0.76      0.73      0.69      2924



In [10]:
def predict_sentiment(text):
    cleaned = text.lower()
    vectorized = vectorizer.transform([cleaned])
    prediction = lr_model.predict(vectorized)[0]
    return prediction

# Try your own sentences!
print(predict_sentiment("The flight was amazing and the staff were so kind!"))
print(predict_sentiment("Worst airline ever, lost my baggage again"))
print(predict_sentiment("Flight was okay, nothing special"))

positive
negative
negative
