In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Upload the dataset through the Colab file picker, but only when it is missing.
# The interactive prompt blocks "Restart & Run All"; delete Tweets.csv to force
# a fresh upload. The google.colab import stays inside the guard so the notebook
# can also run outside Colab once the CSV sits next to it.
if os.path.exists('Tweets.csv'):
    uploaded = {}  # nothing was uploaded in this session
else:
    from google.colab import files
    uploaded = files.upload()

Saving Tweets.csv to Tweets.csv


In [3]:
# Read the Tweets.csv dataset (airline tweets with sentiment annotations)
# into a DataFrame; every later cell works from `df`.
df = pd.read_csv('Tweets.csv')

In [4]:
# Preview the first rows; as plain text the wide frame wraps into several column groups.
print(df.head())

             tweet_id airline_sentiment  airline_sentiment_confidence  \
0  570306133677760513           neutral                        1.0000   
1  570301130888122368          positive                        0.3486   
2  570301083672813571           neutral                        0.6837   
3  570301031407624196          negative                        1.0000   
4  570300817074462722          negative                        1.0000   

  negativereason  negativereason_confidence         airline  \
0            NaN                        NaN  Virgin America   
1            NaN                     0.0000  Virgin America   
2            NaN                        NaN  Virgin America   
3     Bad Flight                     0.7033  Virgin America   
4     Can't Tell                     1.0000  Virgin America   

  airline_sentiment_gold        name negativereason_gold  retweet_count  \
0                    NaN     cairdin                 NaN              0   
1                    NaN    jnar

In [5]:
# Structural overview: the column index first, then the (rows, columns) tuple
for overview in (df.columns, df.shape):
    print(overview)

# Class balance of the target column
sentiment_counts = df['airline_sentiment'].value_counts()
print(sentiment_counts)

# One raw example: the tweet stored under index label 0
first_tweet = df.loc[0, 'text']
print(first_tweet)


Index(['tweet_id', 'airline_sentiment', 'airline_sentiment_confidence',
       'negativereason', 'negativereason_confidence', 'airline',
       'airline_sentiment_gold', 'name', 'negativereason_gold',
       'retweet_count', 'text', 'tweet_coord', 'tweet_created',
       'tweet_location', 'user_timezone'],
      dtype='object')
(14640, 15)
airline_sentiment
negative    9178
neutral     3099
positive    2363
Name: count, dtype: int64
@VirginAmerica What @dhepburn said.


In [6]:
# Data Cleaning

import re
import string
import nltk
from nltk.corpus import stopwords   # Natural Language Toolkit for text processing

# Download NLTK's stopword corpus: lists of very common words ("the", "and", "is")
# that are usually dropped from text before modelling.
nltk.download('stopwords')
# Keep the English stopwords in a set so membership tests during filtering are fast.
stop_words = set(stopwords.words('english'))

def clean_text(text):
    """Normalize one tweet before it is vectorized.

    Steps, in order: lowercase, drop URLs, drop @mentions and the '#' symbol
    (the hashtag word itself is kept), drop every character that is not an
    ASCII letter, digit or whitespace, collapse whitespace, remove stopwords.

    Args:
        text: Raw tweet. Anything that is not a ``str`` (for example the
            ``None``/``NaN`` pandas yields for a missing cell) is treated as
            an empty tweet instead of raising ``AttributeError``.

    Returns:
        The surviving tokens joined by single spaces; ``""`` when nothing
        is left.
    """
    if not isinstance(text, str):
        return ""

    text = text.lower()
    # `http\S+` already matches https links, so no separate alternative is needed.
    text = re.sub(r"http\S+|www\S+", '', text)
    text = re.sub(r'@\w+|#', '', text)  # remove mentions and the '#' sign
    # NOTE(review): apostrophes are stripped here, before stopword filtering, so
    # contractions such as "didn't" become "didnt" and presumably no longer match
    # the apostrophe-bearing stopword entries - confirm this is intended.
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # collapse runs of whitespace
    return " ".join(word for word in text.split() if word not in stop_words)

# Run clean_text over every raw tweet and store the result as a new column,
# then preview the first cleaned rows.
cleaned_tweets = df['text'].apply(clean_text)
df['clean_text'] = cleaned_tweets
print(df['clean_text'].head())


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


0                                                 said
1        plus youve added commercials experience tacky
2         didnt today must mean need take another trip
3    really aggressive blast obnoxious entertainmen...
4                                 really big bad thing
Name: clean_text, dtype: object


In [7]:
# Encode Sentiment Labels

from sklearn.preprocessing import LabelEncoder

# Turn the sentiment strings into integer ids stored in `sentiment_label`.
# The printed classes_ gives the mapping by position:
# negative -> 0, neutral -> 1, positive -> 2.
le = LabelEncoder()
le.fit(df['airline_sentiment'])
df['sentiment_label'] = le.transform(df['airline_sentiment'])
print(le.classes_)


['negative' 'neutral' 'positive']


In [30]:
# Train-Test Split

from sklearn.model_selection import train_test_split

# X: cleaned tweet texts (model input); y: encoded sentiment ids 0/1/2 (target)
X, y = df['clean_text'], df['sentiment_label']

# Hold out 20% for testing; stratifying on y keeps the class proportions equal in
# both parts, and the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)


In [31]:
# Vectorize the Text (TF-IDF)
# TF-IDF — Term Frequency–Inverse Document Frequency
# TF (Term Frequency): how often a word appears in a document
# IDF (Inverse Document Frequency): how rare the word is across all documents
# Common words (like "the", "and") appear in many documents and get lower scores
# Uncommon but meaningful words get higher scores

from sklearn.feature_extraction.text import TfidfVectorizer
# This imports a tool that converts text into numeric feature vectors using the TF-IDF method

tfidf = TfidfVectorizer(max_features=5000)
# Caps the vocabulary at the 5,000 terms with the highest term frequency across the
# training corpus (the cut-off is by corpus frequency, not by TF-IDF score).
# This reduces dimensionality and speeds up training.
X_train_tfidf = tfidf.fit_transform(X_train)
# Fits the vectorizer on the training texts only (learns the vocabulary and the IDF
# weights) and converts them into a sparse matrix of shape (samples, 5000).
X_test_tfidf = tfidf.transform(X_test)
# Applies the vocabulary and weights learned from training to the test set, so both
# matrices share the same columns and nothing from the test set influences the fit.

print(X_train_tfidf.shape)  # (number_of_training_samples, 5000)
print(X_test_tfidf.shape)   # (number_of_test_samples, 5000)
# Shows the dimensions of the two TF-IDF matrices.

(11712, 5000)
(2928, 5000)


In [32]:
# Logistic Regression model

from sklearn.linear_model import LogisticRegression   # Linear classifier; supports binary and multiclass classification
from sklearn.metrics import classification_report, accuracy_score   # Import evaluation metrics

# Fit the classifier on the TF-IDF features. max_iter is raised to 1000 (the
# default is 100) to give the solver more iterations to converge.
lr = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

# Overall accuracy first, then per-class precision / recall / F1 on the test set.
test_accuracy = accuracy_score(y_test, y_pred_lr)
per_class_report = classification_report(y_test, y_pred_lr, target_names=le.classes_)
print("Logistic Regression")
print("Accuracy:", test_accuracy)
print(per_class_report)

Logistic Regression
Accuracy: 0.7827868852459017
              precision    recall  f1-score   support

    negative       0.80      0.94      0.87      1835
     neutral       0.66      0.49      0.56       620
    positive       0.82      0.56      0.67       473

    accuracy                           0.78      2928
   macro avg       0.76      0.66      0.70      2928
weighted avg       0.78      0.78      0.77      2928



In [15]:
# Sanity-check the trained pipeline on a handful of hand-written sentences.
new_tweets = [
    "Flight delayed again. Terrible experience.",
    "Thanks for the great service!",
    "It was okay, nothing special.",
    "not sure about what I could say, not bad and not too impressive either",
    "I wish I could use another classroom",
    "Mike was gonna come later today, try hold on",
    "streamlit has their own UI designed already, looks easy to use",
]

# Same cleaning as training, then the already-fitted vectorizer (transform only).
new_cleaned = list(map(clean_text, new_tweets))
new_tfidf = tfidf.transform(new_cleaned)
predictions = lr.predict(new_tfidf)

print(predictions)  # one integer class id per tweet
print(le.inverse_transform(predictions))  # the same predictions as sentiment names


[0 2 0 0 1 0 0]
['negative' 'positive' 'negative' 'negative' 'neutral' 'negative'
 'negative']


In [12]:
# Save files
import pickle

# Everything inference needs, keyed by the file it is pickled to:
# the classifier, the fitted vectorizer and the label encoder.
artifacts = {
    "model.pkl": lr,
    "tfidf.pkl": tfidf,
    "label_encoder.pkl": le,
}

for filename, fitted_object in artifacts.items():
    with open(filename, "wb") as f:
        pickle.dump(fitted_object, f)

# Download from Colab, in the same order the files were written
from google.colab import files
for filename in artifacts:
    files.download(filename)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [13]:
# save clean.py file

%%writefile clean.py
def clean_text(text):
    # your cleaning logic here
    return cleaned_text


Writing clean.py


In [14]:
# Download the clean.py written by the previous cell (uses google.colab's `files`).
files.download("clean.py")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>