Assignment 6 - Sentiment analysis

Dataset name - Sentiment140 dataset with 1.6 million tweets(kaggle)
# Dataset: https://www.kaggle.com/datasets/kazanova/sentiment140



In [1]:
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Dense, Dropout

In [35]:
# Step 2: Load Dataset
df = pd.read_csv("/content/archive (12).zip",
                 encoding="latin-1",
                 names=["target","ids","date","flag","user","text"])
print(df);

# Keep only required columns
df = df[['target','text']]

         target         ids                          date      flag  \
0             0  1467810369  Mon Apr 06 22:19:45 PDT 2009  NO_QUERY   
1             0  1467810672  Mon Apr 06 22:19:49 PDT 2009  NO_QUERY   
2             0  1467810917  Mon Apr 06 22:19:53 PDT 2009  NO_QUERY   
3             0  1467811184  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
4             0  1467811193  Mon Apr 06 22:19:57 PDT 2009  NO_QUERY   
...         ...         ...                           ...       ...   
1599995       4  2193601966  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599996       4  2193601969  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599997       4  2193601991  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599998       4  2193602064  Tue Jun 16 08:40:49 PDT 2009  NO_QUERY   
1599999       4  2193602129  Tue Jun 16 08:40:50 PDT 2009  NO_QUERY   

                    user                                               text  
0        _TheSpecialOne_  @switchfoot http://twitpic.com/2y1zl - Awww

In [6]:
# Step 3: Clean Dataset
def clean_text(text):
    text = re.sub(r"http\S+", "", text)        # remove links
    text = re.sub(r"@\w+", "", text)           # remove mentions
    text = re.sub(r"#", "", text)              # remove hashtags symbol
    text = re.sub(r"[^a-zA-Z']", " ", text)    # keep letters only
    text = text.lower().strip()
    return text

df['text'] = df['text'].apply(clean_text)

In [42]:
import nltk
from nltk.corpus import stopwords

# Download stopwords if not already downloaded
try:
    english_stops = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    english_stops = set(stopwords.words('english'))


def load_dataset():
    df = pd.read_csv('/content/archive (12).zip',
                     encoding="latin-1",
                     names=["target","ids","date","flag","user","text"])
    x_data = df['text']       # Reviews/Input - Use 'text' column
    y_data = df['target']    # Sentiment/Output - Use 'target' column

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case

    # ENCODE SENTIMENT -> 0 & 1 (Map original 0 and 4 to 0 and 1)
    y_data = y_data.replace({0: 0, 4: 1})


    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0          [switchfoot, http, twitpic, com, zl, awww, bum...
1          [upset, update, facebook, texting, might, cry,...
2          [kenichan, i, dived, many, times, ball, manage...
3                    [whole, body, feels, itchy, like, fire]
4                   [nationwideclass, behaving, mad, i, see]
                                 ...                        
1599995    [just, woke, having, school, best, feeling, ever]
1599996    [thewdb, com, very, cool, hear, old, walt, int...
1599997           [are, ready, mojo, makeover, ask, details]
1599998    [happy, th, birthday, boo, alll, time, tupac, ...
1599999    [happy, charitytuesday, thenspcc, sparkscharit...
Name: text, Length: 1600000, dtype: object 

Sentiment
0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [28]:
# Step 4: Encode Sentiments
# Map: 0 = negative, 4 = positive
df['target'] = df['target'].replace({0:0, 4:1})
num_classes = len(df['target'].unique())

print(df['target'].value_counts())

target
0    800000
1    800000
Name: count, dtype: int64


In [8]:
# Step 5: Split Dataset
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

In [10]:
# Step 6: Tokenize and Pad
max_words = 30000   # top 30k words
max_len = 40        # tweets are short, so 40 tokens is enough

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)

X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

In [26]:
# Step 7: Build Model (LSTM / GRU)
# ---------------------------
from tensorflow.keras.layers import Input
from tensorflow.keras.models import Model

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))
model.add(LSTM(128, return_sequences=False))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))  # 2 classes

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [20]:
model = Sequential()

# 1) Embedding converts each word index → dense vector
model.add(Embedding(input_dim=max_words, output_dim=128, input_length=max_len))

# 2) LSTM now receives 3D input (batch, sequence_length, embedding_dim)
model.add(LSTM(128, return_sequences=False))

# 3) Dropout + Dense
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# 4) Compile
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()




In [22]:
import numpy as np
print("Unique labels:", np.unique(y_train))
num_classes = len(np.unique(y_train))
print("Number of classes:", num_classes)


Unique labels: [0 2]
Number of classes: 2


In [None]:
# Re-run train_test_split after correcting target mapping
X_train, X_test, y_train, y_test = train_test_split(
    df['text'], df['target'], test_size=0.2, random_state=42
)

print("Value counts for y_train after re-split:")
print(y_train.value_counts())
print("\nValue counts for y_test after re-split:")
print(y_test.value_counts())

In [33]:
# Step 8: Train Model
history = model.fit(
    X_train_pad, y_train,
    epochs=3,
    batch_size=128,
    validation_split=0.1
)

Epoch 1/3
[1m9000/9000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1746s[0m 194ms/step - accuracy: 0.8663 - loss: 0.3085 - val_accuracy: 0.8267 - val_loss: 0.4075
Epoch 2/3
[1m9000/9000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1685s[0m 185ms/step - accuracy: 0.8807 - loss: 0.2776 - val_accuracy: 0.8241 - val_loss: 0.4183
Epoch 3/3
[1m9000/9000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1709s[0m 186ms/step - accuracy: 0.8943 - loss: 0.2487 - val_accuracy: 0.8195 - val_loss: 0.4677


In [45]:
# 9.Ensure 'filtered' contains the text data you want to tokenize
filtered = ["This is a sample sentence to tokenize and pad."]

tokenize_words = tokenizer.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_len, padding='post', truncating='post')
print(tokenize_words)

[[  27    9    5 6054 5287    3    1    7 4618    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0]]


In [37]:
# Step 10: Predict on New Tweet
def predict_sentiment(text):
    seq = tokenizer.texts_to_sequences([text])
    pad = pad_sequences(seq, maxlen=max_len, padding='post', truncating='post')
    pred = model.predict(pad)
    return pred

sample = "I love this new phone, the battery life is amazing!"
prediction_probabilities = predict_sentiment(sample)
print("Prediction Probabilities:", prediction_probabilities)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
Prediction Probabilities: [[6.6781382e-04 9.9933213e-01]]


In [38]:
# Step 10: Predict on New Tweet (cont.)
# Assuming the prediction probabilities are in a variable named prediction_probabilities
# and the positive class probability is at index 1
result = prediction_probabilities[0][1]

if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
