# Assignment 2 - Kival Mahadew (221001688)

## Data Preprocessing

In [None]:
# some configuration options
# Total number of tweets kept after balancing; half of them are sampled from each sentiment class.
SAMPLE_SIZE = 200_000
# Path of the raw CSV, read with pandas as latin-1 text without a header row.
CSV_PATH = 'sentiment140.csv'
# When True, clean_text lemmatizes every token that survives stopword removal.
LEMMATIZE = True


In [None]:
# setup
import nltk
# Resources needed further down: the stopword list, the tokenizer models used by
# word_tokenize, and WordNet for the lemmatizer.
# Newer NLTK releases make word_tokenize load 'punkt_tab' instead of 'punkt';
# requesting both keeps the notebook working across NLTK versions.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# English stopwords that clean_text filters out, and the lemmatizer it applies.
stop_words = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# other imports
import pandas as pd
import numpy as np

In [None]:
# read the data
# Seed for the per-class sampling and the shuffle below, so that a
# "Restart & Run All" always selects the same subset of tweets.
SAMPLING_SEED = 42

data = pd.read_csv(CSV_PATH, encoding='latin-1', header=None, names=['sentiment', 'id', 'date', 'query', 'user', 'text'])

# drop unnecessary columns
data = data.drop(columns=['id', 'date', 'query', 'user'])

# convert sentiment to 0 and 1 (raw label 4 becomes 1, raw label 0 stays 0)
data['sentiment'] = data['sentiment'].replace({0: 0, 4: 1})

# sample n from each class since neutral tweets are not present
per_class = SAMPLE_SIZE // 2
data = pd.concat([
    data[data['sentiment'] == 0].sample(per_class, random_state=SAMPLING_SEED),
    data[data['sentiment'] == 1].sample(per_class, random_state=SAMPLING_SEED),
])

# shuffle the balanced sample; concat leaves it ordered by class, so the shuffle
# belongs here rather than before the sampling (which is already random)
data = data.sample(frac=1, random_state=SAMPLING_SEED).reset_index(drop=True)


In [None]:
# Text Cleaning
# We remove numbers, special characters, stopwords, @mentions, and URLs, then perform lemmatization.
import re
from nltk.tokenize import word_tokenize

def clean_text(text):
    """Normalize one raw tweet into a space-separated string of cleaned tokens.

    Steps, in order:
      1. Unescape the HTML entities ``&amp;`` and ``&quot;``.
      2. Lower-case the text.
      3. Remove @mentions and URLs (``http...`` and ``www. ...``).
      4. Replace every remaining non-letter character with a space.
      5. Tokenize and drop tokens found in ``stop_words``.
      6. Lemmatize the remaining tokens when ``LEMMATIZE`` is true.

    Parameters
    ----------
    text : str
        The raw tweet.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (empty if nothing survives).
    """
    # &amp; is the escaped version of &
    text = re.sub(r'&amp;', '&', text)
    # &quot; is the escaped version of "
    text = re.sub(r'&quot;', '"', text)

    # Lower-case before matching so that "HTTP://..." and "WWW...." are caught too.
    text = text.lower()

    # Mentions and URLs must be stripped while their punctuation is still intact:
    # once '.' and '/' have been turned into spaces, "www.site.com" has become
    # "www site com" and a URL pattern can no longer recognise it.
    text = re.sub(r'@\w+', ' ', text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'www\.\S+', ' ', text)

    # Everything that is not a letter (digits, punctuation, symbols) becomes a space.
    text = re.sub(r'[^a-z]', ' ', text)

    # Tokenize once; the tokenizer also absorbs the repeated whitespace left behind.
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    if LEMMATIZE:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [None]:
# Spot-check the cleaner on one randomly drawn tweet.
text = data['text'].sample(n=1).iloc[0]
print(f'Original Text: {text}')
print(f'Cleaned Text: {clean_text(text)}')

In [None]:
# Clean every tweet in place. Note that re-running this cell cleans the
# already-cleaned text a second time.
data['text'] = data['text'].map(clean_text)
data.head(5)

# TRAINING

In [None]:
# Some configurations
# Fraction of the dataset that the first train_test_split call assigns to training.
TRAIN_SIZE = 0.8
# Drives the second split of the held-out rows.
# NOTE(review): that split passes train_size=(1 - VAL_SIZE) and binds the result to X_val,
# so validation receives 1 - VAL_SIZE of the held-out rows and test receives VAL_SIZE.
# Confirm this is the intended meaning of the name.
VAL_SIZE = 0.2
# When True, both splits are stratified on the sentiment label.
STRATIFY = True
RANDOM_STATE = 42 # for reproducibility
# Length passed as maxlen to pad_sequences and as the model's input length.
MAX_SEQUENCE_LENGTH = 100

In [None]:
# Two-stage split: train vs. held-out rows, then held-out rows into validation and test.
from sklearn.model_selection import train_test_split

# Stage 1: TRAIN_SIZE of the rows go to training; the rest is parked in X_test / y_test.
X_train, X_test, y_train, y_test = train_test_split(
    data['text'],
    data['sentiment'],
    train_size=TRAIN_SIZE,
    random_state=RANDOM_STATE,
    stratify=data['sentiment'] if STRATIFY else None,
)

# Stage 2: the held-out rows are divided again.
# NOTE(review): the first output (1 - VAL_SIZE of the held-out rows) becomes the
# validation set and only VAL_SIZE of them remain as the test set. Confirm that
# this proportion is intended.
X_val, X_test, y_val, y_test = train_test_split(
    X_test,
    y_test,
    train_size=(1 - VAL_SIZE),
    random_state=RANDOM_STATE,
    stratify=y_test if STRATIFY else None,
)


In [None]:
# Tokenization and sequence padding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# The vocabulary is learned from the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)

# One more than the number of distinct words
# (presumably to leave index 0 free for the padding value; verify against the Keras docs).
vocab_size = 1 + len(tokenizer.word_index)
print('Vocab Size:', vocab_size)

# Turn every split into integer sequences of length MAX_SEQUENCE_LENGTH.
X_train, X_val, X_test = (
    pad_sequences(tokenizer.texts_to_sequences(texts), maxlen=MAX_SEQUENCE_LENGTH)
    for texts in (X_train, X_val, X_test)
)

In [None]:
# Encode the target labels; the encoder is fitted on the training labels only.
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_val = label_encoder.transform(y_val)
y_test = label_encoder.transform(y_test)

# Show how the classes were encoded, plus one encoded example.
print('Classes:', label_encoder.classes_)
print('Sample:', y_train[0])

# Report the shape of every split.
for split_name, split_values in (
    ('X_train', X_train),
    ('X_val', X_val),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_val', y_val),
    ('y_test', y_test),
):
    print(f'{split_name}:', split_values.shape)


: 

In [None]:
# embeddings layer with glove config
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

# we will use the 100d embeddings
EMBEDDING_DIM = 100
GLOVE_PATH = 'glove.6B.100d.txt'



In [None]:
from keras.layers import Embedding

# Load the pre-trained vectors: each line holds a word followed by its coefficients.
# The encoding is given explicitly so the read does not depend on the platform default.
embeddings_index = {}
with open(GLOVE_PATH, encoding='utf-8') as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs, 'f', sep=' ')
        embeddings_index[word] = coefs

# One row per tokenizer index; words without a pre-trained vector keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for word, i in tokenizer.word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Frozen embedding layer initialised with the matrix built above.
embedding_layer = Embedding(vocab_size, EMBEDDING_DIM, weights=[embedding_matrix], input_length=MAX_SEQUENCE_LENGTH, trainable=False)

In [None]:
# Model configuration
# Learning rate handed to the Adam optimizer when the models are compiled.
LEARNING_RATE = 1e-3
# Number of epochs passed to model.fit.
EPOCHS = 5
# Batch size passed to model.fit.
BATCH_SIZE = 128



In [None]:
# Conv1D + LSTM classifier on top of the frozen embedding layer
from keras.models import Sequential
from keras.layers import LSTM, GRU, Dense, Dropout, Bidirectional, Input, SpatialDropout1D, Conv1D
from keras.optimizers import Adam
import tensorflow as tf

sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_sequences = embedding_layer(sequence_input)

# Hidden stack, applied in order to the embedded sequence.
x = embedding_sequences
for layer in (
    SpatialDropout1D(0.2),
    Conv1D(64, 5, activation='relu'),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
):
    x = layer(x)

# Single sigmoid unit for the binary target.
outputs = Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(sequence_input, outputs)
model.summary()



In [None]:
# Binary cross-entropy with Adam; accuracy is tracked during training.
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'],
)

In [None]:
# Train the model, evaluating on the validation split after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

In [None]:

from sklearn.metrics import confusion_matrix, classification_report

# Loss and accuracy on the test split.
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

# Predicted probabilities above 0.5 count as class 1, everything else as class 0.
y_pred = (model.predict(X_test) > 0.5).astype(int)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")

In [None]:
# Same architecture as the previous model, with a GRU as the recurrent layer.
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding_sequences = embedding_layer(sequence_input)

# Hidden stack, applied in order to the embedded sequence.
x = embedding_sequences
for layer in (
    SpatialDropout1D(0.2),
    Conv1D(64, 5, activation='relu'),
    GRU(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(512, activation='relu'),
):
    x = layer(x)

# Single sigmoid unit for the binary target.
outputs = Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(sequence_input, outputs)
model.compile(
    loss='binary_crossentropy',
    optimizer=Adam(learning_rate=LEARNING_RATE),
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train the GRU variant with the same settings, validating after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
)

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score

# Loss and accuracy of the GRU model on the test split.
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss: ", loss)
print("Accuracy: ", accuracy)

# Predicted probabilities above 0.5 count as class 1, everything else as class 0.
y_pred = (model.predict(X_test) > 0.5).astype(int)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:", cm, sep="\n")

print("Classification Report:", classification_report(y_test, y_pred), sep="\n")