In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
import keras
import tensorflow as tf
import os
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt

In [2]:
from google.colab import files
uploaded = files.upload()

Saving test_labels.txt to test_labels.txt
Saving test_text.txt to test_text.txt
Saving train_labels.txt to train_labels.txt
Saving train_text.txt to train_text.txt
Saving val_labels.txt to val_labels.txt
Saving val_text.txt to val_text.txt


In [5]:
def load_tweeteval_split(text_path, label_path):
    """Loads TweetEval split data from text and label files."""
    with open(text_path, 'r', encoding='utf-8') as f:
        text_data = [line.strip() for line in f]
    with open(label_path, 'r', encoding='utf-8') as f:
        label_data = [int(line.strip()) for line in f] # Assuming labels are integers

    df = pd.DataFrame({'text': text_data, 'label': label_data})
    return df

# Load TweetEval Sentiment splits
df_train = load_tweeteval_split('train_text.txt', 'train_labels.txt')
df_val = load_tweeteval_split('val_text.txt', 'val_labels.txt')
df_test = load_tweeteval_split('test_text.txt', 'test_labels.txt')

print("Training data shape:", df_train.shape)
print("Validation data shape:", df_val.shape)
print("Testing data shape:", df_test.shape)

Training data shape: (45615, 2)
Validation data shape: (2000, 2)
Testing data shape: (12284, 2)


In [10]:
# Combine train and val sets for better training
df = pd.concat([df_train, df_val], ignore_index=True)

print("Dataset Size:", len(df))
print(df['label'].value_counts())
print(df.head())
print(df.tail())

Dataset Size: 47615
label
1    21542
2    18668
0     7405
Name: count, dtype: int64
                                                text  label
0  "QT @user In the original draft of the 7th boo...      2
1  "Ben Smith / Smith (concussion) remains out of...      1
2  Sorry bout the stream last night I crashed out...      1
3  Chase Headley's RBI double in the 8th inning o...      1
4  @user Alciato: Bee will invest 150 million in ...      2
                                                    text  label
47610  "LONDON (AP) "" Prince George celebrates his s...      1
47611  Harper's Worst Offense against Refugees may be...      1
47612  Hold on... Sam Smith may do the theme to Spect...      2
47613  Gonna watch Final Destination 5 tonight. I alw...      1
47614  "Interview with Devon Alexander \""""Speed Kil...      1


In [23]:
# Train-Test Split

from sklearn.model_selection import train_test_split

X = df['text']  # input features — tweet texts
y = df['label'] # target labels — encoded sentiment values (0 -> negative, 1 -> neutral, 2-> positive)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.15, random_state=42)

In [24]:
# Training the Model (using TF-IDF + Logistic Regression)

# Vectorize the Text (TF-IDF)
# TF-IDF — Term Frequency–Inverse Document Frequency
# TF (Term Frequency): how often a word appears in a document
# IDF (Inverse Document Frequency): how rare the word is across all documents
# Common words (like "the", "and") appear in many documents and get lower scores
# Uncommon but meaningful words get higher scores

from sklearn.feature_extraction.text import TfidfVectorizer
# This imports a tool that converts text into numeric feature vectors using the TF-IDF method

vectorizer = TfidfVectorizer(max_features=10000)
# Only keeps the top 10,000 most important words (based on TF-IDF score)
# This reduces dimensionality and speeds up training
X_train_vec = vectorizer.fit_transform(X_train)
# This fit the vectorizer and transform training data
# Learns the vocabulary and computes TF-IDF scores from the training texts
# Converts the training text into spare matrix of shape (samples, 10000)
X_val_vec = vectorizer.transform(X_val)
# Transform test data by applying the same vocabulary (from training) to the test set
# Ensures that the training and test data have the same structure

print(X_train_vec.shape)  # (number_of_samples, 5000)
print(X_val_vec.shape)   # (number_of_samples, 5000)
# This shows the dimensions of the TF-IDF matrix

(40472, 10000)
(7143, 10000)


In [27]:
# Train Logistic Regression model

from sklearn.linear_model import LogisticRegression   # Best for binary and multiclass classificaion
from sklearn.metrics import classification_report, accuracy_score   # Import evaluation metrics

model = LogisticRegression(max_iter=1000)  # max_iter=1000 increases the number of training iterations to help the model converge (default is 100)
model.fit(X_train_vec, y_train)

# Evaluate lr model

y_pred = model.predict(X_val_vec)

print("Logistic Regression")
print("Accuracy:", accuracy_score(y_val, y_pred))
print(classification_report(y_val, y_pred, digits=4))

Logistic Regression
Accuracy: 0.6609267814643707
              precision    recall  f1-score   support

           0     0.6260    0.3418    0.4422      1141
           1     0.6240    0.7666    0.6880      3141
           2     0.7227    0.6721    0.6965      2861

    accuracy                         0.6609      7143
   macro avg     0.6576    0.5935    0.6089      7143
weighted avg     0.6638    0.6609    0.6521      7143



In [32]:
# Prediction on new unseen data

new_tweets = ["Flight delayed again. Terrible experience.",
              "Thanks for the great service!",
              "It was okay, nothing special.",
              "not sure about what I could say, not bad and not too impressive either",
              "I wish I could use another classroom",
              "Mike was gonna come later today, try hold on",
              "streamlit has their own UI designed already, looks easy to use"]

new_vectorizer = vectorizer.transform(new_tweets)
predictions = model.predict(new_vectorizer)

print(predictions)  # outputs: [0, 2, 1] or similar


[0 2 2 0 2 1 2]


In [33]:
# Save the model and vectorizer

import joblib

joblib.dump(model, "tweet_eval_sentiment_model.pkl")
joblib.dump(vectorizer, "tweet_eval_vectorizer.pkl")

['tweet_eval_vectorizer.pkl']

In [34]:
from google.colab import files

files.download("tweet_eval_sentiment_model.pkl")
files.download("tweet_eval_vectorizer.pkl")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# save clean.py file

%%writefile clean.py
def clean_text(text):
    # your cleaning logic here
    return cleaned_text


Writing clean.py


In [None]:
files.download("clean.py")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Training data shape: (45615, 2)
Validation data shape: (2000, 2)
Testing data shape: (12284, 2)
