# Assignment 2 - Kival Mahadew (221001688)

## Data Preprocessing

In [10]:
# Preprocessing configuration
# Path of the raw Sentiment140 CSV, relative to the working directory.
CSV_PATH = 'sentiment140.csv'
# Total number of tweets kept after sampling (SAMPLE_SIZE // 2 per sentiment class).
SAMPLE_SIZE = 200000
# When True, clean_text lemmatizes every token that survives stop-word removal.
LEMMATIZE = True


In [11]:
# setup: download the NLTK resources the text-cleaning step relies on
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Stored as a set: clean_text tests every token of every tweet for membership,
# and a set lookup is O(1) where scanning the original list was O(len(list)).
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# other imports
import pandas as pd
import numpy as np

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Seeded generator shared by the shuffle and the per-class sampling below, so that
# "Restart Kernel -> Run All" always selects the same tweets.
SAMPLE_SEED = 42
sample_rng = np.random.RandomState(SAMPLE_SEED)

# read the data (header=None: the column names are supplied explicitly)
data = pd.read_csv(
    CSV_PATH,
    encoding='latin-1',
    header=None,
    names=['sentiment', 'id', 'date', 'query', 'user', 'text'],
)

# shuffle the data
data = data.sample(frac=1, random_state=sample_rng).reset_index(drop=True)

# drop unnecessary columns
data = data.drop(columns=['id', 'date', 'query', 'user'])

# convert sentiment to 0 and 1 (the raw labels are 0 and 4)
data['sentiment'] = data['sentiment'].replace({0: 0, 4: 1})

# draw SAMPLE_SIZE // 2 tweets from each class so the working subset is balanced
per_class = SAMPLE_SIZE // 2
data = pd.concat([
    data[data['sentiment'] == label].sample(per_class, random_state=sample_rng)
    for label in (0, 1)
])


In [13]:
# Text Cleaning
# We remove numbers, special characters, stopwords, @mentions, and URLs, then perform lemmatization.
import re
from nltk.tokenize import word_tokenize

def clean_text(text):
    """Normalize a raw tweet into a space-separated string of lowercase word tokens.

    Steps, in order: decode HTML entities, strip @mentions and URLs, replace every
    non-letter character with a space, lowercase, tokenize, drop stop words and,
    when the module-level LEMMATIZE flag is set, lemmatize each remaining token.

    Args:
        text: The raw tweet text (a str).

    Returns:
        The cleaned tokens joined by single spaces; an empty string when no token
        survives the cleaning.
    """
    import html

    # Decode entities such as &amp;, &quot; and &lt; so that their names ("amp", "lt")
    # do not survive as bogus tokens once punctuation is stripped.
    text = html.unescape(text)
    text = re.sub(r'@\w+', '', text)
    # URLs have to be removed while their punctuation is still intact: once '.' and '/'
    # have been turned into spaces, a www pattern can no longer match the whole address.
    text = re.sub(r'http\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)
    # Anything that is not an ASCII letter (digits, punctuation, whitespace runs) becomes a space.
    text = re.sub(r'[^A-Za-z]', ' ', text).lower()
    # Tokenize once and reuse the token list for both filtering and lemmatization.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    if LEMMATIZE:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [14]:
# sanity-check clean_text on a single randomly drawn tweet
text = data['text'].sample().iloc[0]
print(f'Original Text: {text}')
print(f'Cleaned Text: {clean_text(text)}')

Original Text: it's only 11:12P, but my body thinks it's 6 in the AM. boo... 
Cleaned Text: p body think boo


In [15]:
# run clean_text over every tweet, overwriting the raw text column with the cleaned text
data['text'] = data['text'].map(clean_text)
data.head()

Unnamed: 0,sentiment,text
553444,0,twittter f ferrari race point somebody got ta ...
819778,0,hoping rain stay away outside car need good cl...
835698,0,ahamazing story lifeeeeee without happppppppy ...
487542,0,havent seen someone sent fraud booo
258499,0,research n doctor see realize option dont use ...


## Training

In [16]:
# Split / sequence configuration
# Seed handed to both train_test_split calls, for reproducibility.
RANDOM_STATE = 42
# Fraction of the sampled data used for training.
TRAIN_SIZE = 0.8
# NOTE: the split cell gives (1 - VAL_SIZE) of the remaining hold-out rows to the
# validation set, so VAL_SIZE is effectively the share of the hold-out kept for testing.
VAL_SIZE = 0.2
# Whether the splits preserve the class balance of the labels.
STRATIFY = True
# maxlen passed to pad_sequences, and the input length of the models.
MAX_SEQUENCE_LENGTH = 100

In [17]:
# split the data: carve off the training set first, then divide the remainder
from sklearn.model_selection import train_test_split

X_train, X_holdout, y_train, y_holdout = train_test_split(
    data['text'],
    data['sentiment'],
    train_size=TRAIN_SIZE,
    stratify=data['sentiment'] if STRATIFY else None,
    random_state=RANDOM_STATE,
)

# (1 - VAL_SIZE) of the hold-out rows become the validation set, the rest the test set
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout,
    y_holdout,
    train_size=(1 - VAL_SIZE),
    stratify=y_holdout if STRATIFY else None,
    random_state=RANDOM_STATE,
)


In [18]:
# Tokenization and sequence padding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# the word -> integer index mapping is learned from the training texts only
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# one more than the number of indexed words
vocab_size = 1 + len(tokenizer.word_index)
print('Vocab Size:', vocab_size)


def _to_padded_sequences(texts):
    """Encode texts with the fitted tokenizer and pad them with maxlen=MAX_SEQUENCE_LENGTH."""
    encoded = tokenizer.texts_to_sequences(texts)
    return pad_sequences(encoded, maxlen=MAX_SEQUENCE_LENGTH)


# each split is replaced by its integer-sequence matrix
X_train, X_val, X_test = [_to_padded_sequences(split) for split in (X_train, X_val, X_test)]

Vocab Size: 62330


In [19]:
# encoding the target
from sklearn.preprocessing import LabelEncoder

# the encoder is fitted on the training labels and then applied to every split
label_encoder = LabelEncoder()
label_encoder.fit(y_train)
y_train, y_val, y_test = [label_encoder.transform(labels) for labels in (y_train, y_val, y_test)]

# print what the classes are encoded as
print('Classes:', label_encoder.classes_)
print('Sample:', y_train[0])

# Print the shapes
named_splits = (
    ('X_train', X_train),
    ('X_val', X_val),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_val', y_val),
    ('y_test', y_test),
)
for split_name, split_array in named_splits:
    print(f'{split_name}:', split_array.shape)


Classes: [0 1]
Sample: 0
X_train: (160000, 100)
X_val: (32000, 100)
X_test: (8000, 100)
y_train: (160000,)
y_val: (32000,)
y_test: (8000,)


In [20]:
# embeddings layer with glove config
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

# we will use the 100d embeddings
EMBEDDING_DIM = 100
GLOVE_PATH = 'glove.6B.100d.txt'



--2024-05-12 19:32:57--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2024-05-12 19:32:57--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2024-05-12 19:32:58--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [21]:
from keras.layers import Embedding

# Parse the GloVe text file: each line is a word followed by its vector components.
# The encoding is given explicitly so parsing does not depend on the platform default.
embeddings_index = {}
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        embeddings_index[word] = np.fromstring(coefs, 'f', sep=' ')

# Row i holds the GloVe vector of the word whose tokenizer index is i.
# Rows for words without a pre-trained vector stay all-zero.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))
for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# trainable=False keeps the pre-trained vectors fixed during training
embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

In [22]:
# Model configuration
LEARNING_RATE = 1e-3
EPOCHS = 5
BATCH_SIZE = 128



In [26]:
# LSTM Model: embedding -> spatial dropout -> Conv1D -> LSTM -> dense head
from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Dropout, Bidirectional, Input, SpatialDropout1D, Conv1D
from keras.optimizers import Adam
import tensorflow as tf

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# layers applied, in this order, on top of the shared non-trainable embedding layer
lstm_stack = [
    SpatialDropout1D(0.2),
    Conv1D(64, 5, activation='relu'),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dense(1, activation='sigmoid'),  # single sigmoid unit for the binary label
]

outputs = embedding_layer(sequence_input)
for layer in lstm_stack:
    outputs = layer(outputs)

model = tf.keras.Model(sequence_input, outputs)
model.summary()





Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 100)          6233000   
                                                                 
 spatial_dropout1d_2 (Spati  (None, 100, 100)          0         
 alDropout1D)                                                    
                                                                 
 conv1d_2 (Conv1D)           (None, 96, 64)            32064     
                                                                 
 lstm_2 (LSTM)               (None, 64)                33024     
                                                                 
 dense_6 (Dense)             (None, 512)               33280     
                                                           

In [27]:
# binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'],
)

In [28]:
# train the LSTM model, passing the validation split as validation_data
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [29]:

from sklearn.metrics import confusion_matrix, classification_report

# loss and accuracy on the held-out test split
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print("Loss: ", loss)
print("Accuracy: ", accuracy)

# threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred = (model.predict(X_test) > 0.5).astype(int)

print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("Classification Report:")
print(classification_report(y_test, y_pred))

Loss:  0.5082894563674927
Accuracy:  0.7490000128746033
Confusion Matrix:
[[2996 1004]
 [1004 2996]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.75      0.75      4000
           1       0.75      0.75      0.75      4000

    accuracy                           0.75      8000
   macro avg       0.75      0.75      0.75      8000
weighted avg       0.75      0.75      0.75      8000



In [30]:
# GRU Model: same architecture as the LSTM model, with a GRU as the recurrent layer
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')

# layers applied, in this order, on top of the shared non-trainable embedding layer
gru_stack = [
    SpatialDropout1D(0.2),
    Conv1D(64, 5, activation='relu'),
    GRU(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
    Dense(1, activation='sigmoid'),  # single sigmoid unit for the binary label
]

outputs = embedding_layer(sequence_input)
for layer in gru_stack:
    outputs = layer(outputs)

model = tf.keras.Model(sequence_input, outputs)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'],
)
model.summary()



Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_5 (InputLayer)        [(None, 100)]             0         
                                                                 
 embedding (Embedding)       (None, 100, 100)          6233000   
                                                                 
 spatial_dropout1d_3 (Spati  (None, 100, 100)          0         
 alDropout1D)                                                    
                                                                 
 conv1d_3 (Conv1D)           (None, 96, 64)            32064     
                                                                 
 gru (GRU)                   (None, 64)                24960     
                                                                 
 dense_9 (Dense)             (None, 512)               33280     
                                                           

In [31]:
# train the GRU model with the same settings as the LSTM run
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [32]:
from sklearn.metrics import confusion_matrix, classification_report

# loss and accuracy of the GRU model on the held-out test split
test_metrics = model.evaluate(X_test, y_test)
loss, accuracy = test_metrics
print("Loss: ", loss)
print("Accuracy: ", accuracy)

# threshold the sigmoid outputs at 0.5 to obtain hard 0/1 predictions
y_pred = (model.predict(X_test) > 0.5).astype(int)

print("Confusion Matrix:")
cm = confusion_matrix(y_test, y_pred)
print(cm)

print("Classification Report:")
print(classification_report(y_test, y_pred))

Loss:  0.5068916082382202
Accuracy:  0.7447500228881836
Confusion Matrix:
[[2953 1047]
 [ 995 3005]]
Classification Report:
              precision    recall  f1-score   support

           0       0.75      0.74      0.74      4000
           1       0.74      0.75      0.75      4000

    accuracy                           0.74      8000
   macro avg       0.74      0.74      0.74      8000
weighted avg       0.74      0.74      0.74      8000

