In [1]:
# Import Library

import re
import nltk
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load Dataset

def load_dataset(file_name):
    
    df = pd.read_csv(file_name, encoding= 'unicode_escape')
    
    df = df[["text", "sentiment"]]
    df.drop_duplicates(inplace = True)
    df.dropna(inplace = True)
    
    return df

train_df = load_dataset("Dataset/Ex 2/train.csv")
test_df = load_dataset("Dataset/Ex 2/test.csv")

print("Train Shape :", train_df.shape)
print("Test Shape  :", test_df.shape)

train_df.head()

Train Shape : (27480, 2)
Test Shape  : (3534, 2)


Unnamed: 0,text,sentiment
0,"I`d have responded, if I were going",neutral
1,Sooo SAD I will miss you here in San Diego!!!,negative
2,my boss is bullying me...,negative
3,what interview! leave me alone,negative
4,"Sons of ****, why couldn`t they put them on t...",negative


In [3]:
# Y Label

from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
train_y = encoder.fit_transform(train_df["sentiment"])
test_y = encoder.transform(test_df["sentiment"])

train_y = train_y.reshape((-1, 1))

In [4]:
# Tweet Preprocessing

def pre_processing(tweet: str):
    
    # Remove Leading Blank Spaces
    tweet = tweet.strip()
    
    # Lower Case
    tweet = tweet.lower()
    
    # Remove URLS 
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    tweet = re.sub(url_pattern, "", tweet)
    
    # Remove UserName
    username_pattern = re.compile(r"@\w+")
    tweet = re.sub(username_pattern, "", tweet)
    
    # Remove Hashtags
    hashtag_pattern = re.compile(r"#\w+")
    tweet = re.sub(hashtag_pattern, "", tweet)
    
    # Character normalization // todaaaaay -> today
    tweet = re.sub(r"([a-zA-Z])\1{2,}", r'\1', tweet)
    
    # Remove Special Characters
    tweet = re.sub(r'[^a-zA-Z\s]', "", tweet)
    
    # Word Tokenizer
    tweet = nltk.word_tokenize(tweet)
    
#     # Remove Stop Words 
#     stop_words = set([re.sub(r'[^a-zA-Z\s]', "", word) for word in nltk.corpus.stopwords.words("english")])
#     tweet = [word for word in tweet if word not in stop_words]
    
    # lemmatization
    def get_pos(word):
        tag = nltk.pos_tag([word])[0][1][0].upper()
        tag_dict = {"N": "n", "V": "v", "R": "r", "J": "a"}
        return tag_dict.get(tag, "n")
    
    lemma = nltk.stem.WordNetLemmatizer()
    tweet = [lemma.lemmatize(word, pos=get_pos(word)) for word in tweet]
    
    return tweet

train_x = train_df["text"].apply(pre_processing).values
test_x = test_df["text"].apply(pre_processing).values

pre_processing("I loveeeee NLP, @rahul_appu, www.rahul_appu.com, #NLP ")

['i', 'love', 'nlp']

In [5]:
# Vocab

voc = set()
for temp in train_x:
    voc.update(set(temp))
    
voc_len = len(voc) + 1
print("Vocab Size :", voc_len)

Vocab Size : 22038


In [6]:
# Vectorization

from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenization
token = Tokenizer()
token.fit_on_texts(train_x)

train_x = token.texts_to_sequences(train_x)
test_x = token.texts_to_sequences(test_x)

# Pad Sequence
train_x = pad_sequences(train_x, maxlen=30, padding='post')
test_x = pad_sequences(test_x, maxlen=30, padding='post')

In [7]:
# Model

from tensorflow.keras.layers import Input, Conv1D, Embedding, Dense, MaxPooling1D, Flatten
from tensorflow.keras.models import Sequential

model = Sequential()

model.add(Input(shape=(30, )))
model.add(Embedding(voc_len, 32, trainable=True))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(256, activation='relu'))
model.add(Dense(3, activation='sigmoid'))

model.summary()

In [8]:
# Training

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_x, train_y, epochs=5, batch_size=128)

Epoch 1/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.4806 - loss: 0.9977
Epoch 2/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.7678 - loss: 0.5755
Epoch 3/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.8470 - loss: 0.4040
Epoch 4/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 10ms/step - accuracy: 0.9082 - loss: 0.2633
Epoch 5/5
[1m215/215[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.9421 - loss: 0.1757


<keras.src.callbacks.history.History at 0x1bf6baab590>

In [9]:
# Classification Report

y_pred = np.argmax(model.predict(test_x, verbose=0), axis=1)

from sklearn.metrics import classification_report
print(classification_report(y_pred = y_pred, y_true = test_y))

              precision    recall  f1-score   support

           0       0.69      0.69      0.69      1001
           1       0.65      0.67      0.66      1430
           2       0.76      0.74      0.75      1103

    accuracy                           0.70      3534
   macro avg       0.70      0.70      0.70      3534
weighted avg       0.70      0.70      0.70      3534

