## Imports

In [1]:
import re
import random
import numpy as np
import pandas as pd
import tensorflow as tf
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D, Dense, Dropout, Flatten

## Loading and Reading the Dataset

In [2]:
# Load the dataset
file_path = "/kaggle/input/sentiment140/training.1600000.processed.noemoticon.csv"

# Importing the dataset
DATASET_COLUMNS  = ["sentiment", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
dataset = pd.read_csv(file_path,encoding=DATASET_ENCODING , names=DATASET_COLUMNS)

# Removing the unnecessary columns
dataset = dataset[['sentiment','text']]

## Preprocessing Tweets using Regular Expressions

In [3]:
# Defining regex patterns
urlPattern        = r"((http://)[^ ]*|(https://)[^ ]*|(www\.)[^ ]*)"
userPattern       = '@[^\s]+'
alphaPattern      = "[^a-z0-9<>]"
hashtagPattern    = '#[^\s]+'
sequencePattern   = r"(.)\1\1+"
seqReplacePattern = r"\1\1"

# Defining regex for emojis
sademoji          = r"[8:=;]['`\-]?\(+"
lolemoji          = r"[8:=;]['`\-]?p+"
smileemoji        = r"[8:=;]['`\-]?[)d]+"
neutralemoji      = r"[8:=;]['`\-]?[\/|l*]"

def apply_preprocessing(tweet):

    tweet = tweet.lower()

    # Replace all URls with '<url>'
    tweet = re.sub(urlPattern,'<url>',tweet)
    
    # Replace @USERNAME to '<user>'
    tweet = re.sub(userPattern,'<user>', tweet)

    # Replace 3 or more consecutive letters by 2 letters
    tweet = re.sub(sequencePattern, seqReplacePattern, tweet)

    # Replace all emojis.
    tweet = re.sub(r'<3', '<heart>', tweet)
    tweet = re.sub(smileemoji, '<smile>', tweet)
    tweet = re.sub(sademoji, '<sadface>', tweet)
    tweet = re.sub(neutralemoji, '<neutralface>', tweet)
    tweet = re.sub(lolemoji, '<lolface>', tweet)

    # Remove non-alphanumeric and symbols
    tweet = re.sub(alphaPattern, ' ', tweet)

    # Adding space on either side of '/' to seperate words (After replacing URLS)
    tweet = re.sub(r'/', ' / ', tweet)
    return tweet

## Apply Preprocessing on the Text Column

In [4]:
dataset['processed_text'] = dataset.text.apply(apply_preprocessing)

## Extracting and Splitting the Preprocessed Data

In [5]:
# Extracting the text and it's sentiment
X_data, y_data = np.array(dataset['processed_text']), np.array(dataset['sentiment'])

# Splitting the dataset into training and testing
X_train, X_test, Y_train, Y_test = train_test_split(X_data, y_data,test_size = 0.05, random_state = 42)

## Replacing Labels for Binary Classification Purpose

In [6]:
# Replace label value 4 with 1
Y_train[Y_train == 4] = 1
Y_test[Y_test == 4] = 1

## Converting the Labels to a One Hot Encoding Vector

In [7]:
y_train_one_hot = to_categorical(Y_train, num_classes=2)
y_test_one_hot = to_categorical(Y_test, num_classes=2)

## Training a Word2Vec Model to Generate Word Embeddings

In [8]:
# Creating Word2Vec training dataset by splitting the training sentences into words
Word2vec_train_data = [tweet.split() for tweet in X_train]

# Defining the model and training it
word2vec_model = Word2Vec(Word2vec_train_data, vector_size=100, workers=8, min_count=5)

In [9]:
# Initializing the tokenizer with specific settings
tokenizer = Tokenizer(filters="", lower=False, oov_token="<oov>")

# Fitting the tokenizer on the training data
tokenizer.fit_on_texts(X_data)

In [10]:
# Calculate the maximum length of tokenized documents
input_length = max(len(tweet.split()) for tweet in X_data)

# Padding the sequences to be the same length
X_train_padded = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=input_length)
X_test_padded  = pad_sequences(tokenizer.texts_to_sequences(X_test) , maxlen=input_length)

## Creating an Embedding Matrix from the Word2Vec Model

In [11]:
# Defining the vocabulary size
vocab_length = len(tokenizer.word_index) + 1

# Initializing the embedding matrix
embedding_matrix = np.zeros((vocab_length, 100))

for word, token in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[token] = word2vec_model.wv[word]
        
print("Embedding Matrix Shape:", embedding_matrix.shape)

Embedding Matrix Shape: (263128, 100)


## Creating and Training a CNN Model for Classification

In [12]:
# Define the CNN model
model = Sequential()

# Define the embedding layer with pre-trained weights
embedding_layer = tf.keras.layers.Embedding(input_dim=vocab_length, output_dim=100, embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix), trainable=False)

# Add the embedding layer to the model
model.add(embedding_layer)

# Add 1D convolutional layer 
model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))

# Add global max pooling layer
model.add(GlobalMaxPooling1D())

# Add fully connected layers
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))  # Adding dropout for regularization
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_padded, y_train_one_hot, epochs=5, batch_size=64)

Epoch 1/5
[1m23750/23750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m63s[0m 2ms/step - accuracy: 0.7781 - loss: 0.4705
Epoch 2/5
[1m23750/23750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 2ms/step - accuracy: 0.8092 - loss: 0.4205
Epoch 3/5
[1m23750/23750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - accuracy: 0.8145 - loss: 0.4113
Epoch 4/5
[1m23750/23750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - accuracy: 0.8173 - loss: 0.4060
Epoch 5/5
[1m23750/23750[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 2ms/step - accuracy: 0.8194 - loss: 0.4019


## Evaluate the Model

In [13]:
_, accuracy = model.evaluate(X_test_padded, y_test_one_hot)
print("Testing Accuracy:", accuracy)

[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 1ms/step - accuracy: 0.8193 - loss: 0.3970
Testing Accuracy: 0.8190000057220459


## Generate Predictions

In [14]:
predictions = model.predict(X_test_padded)

[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step


## Displaying a sample of the Model's Predictions

In [15]:
# Randomly select 10 indices from X_test
random_indices = random.sample(range(len(X_test)), 10)

# Iterate over the selected indices
for idx in random_indices:
    
    sample_text = X_test[idx]  # Get the original text sample
    result = predictions[idx]    # Get the model prediction
    y_pred_classes = np.argmax(result)

    label = "positive" if y_pred_classes == 1 else "negative" 
    
    # Print the sample and its prediction
    print("Sample:", sample_text, " ---> ", label)

Sample: on the balcony at kensingtons in the sun   --->  positive
Sample: <user> i ll suffer alongside you   --->  positive
Sample: late lunch then off to cheers for bday drinks   --->  positive
Sample: <user>  why not    --->  negative
Sample: <user> i just need to know what mine is   --->  positive
Sample: <user> morning   happy sunday      --->  positive
Sample: i need to find my disc so i can update my spyware stuff and anti virus stuff    --->  negative
Sample: ughh my dad is on my case   --->  negative
Sample: <user>  i m obviously not meant to send this email out  as its crashed again  and i was so close to the send button   --->  negative
Sample: i am this close to getting that second job  i just have to wait a week   --->  negative
