# `Module 9`: Introduction to NLP

* You are a Data Scientist in a big firm. You have to develop a deep learning model to perform sentiment analysis on a dataset of tweets related to various candidates.

In [1]:
import pandas as pd
import re

from transformers import BertModel, BertTokenizer

import torch
from torch.utils.data import random_split, DataLoader, Dataset
import torch.nn as nn

In [2]:
bert_model = BertModel.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [3]:
df = pd.read_csv("dataset/twitter_sentiment_dataset.csv", names=['id', 'candidate', 'sentiment', 'text'], header=None)
df = df[['text', 'sentiment']]
df.head()

Unnamed: 0,text,sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [4]:
df = df.dropna()
df = df[df["sentiment"].isin(["Positive", "Negative"])]
df = df.reset_index(drop=True)
df.shape

(43013, 2)

In [5]:
df.sentiment.value_counts()

sentiment
Negative    22358
Positive    20655
Name: count, dtype: int64

In [6]:
def clean_text(text):
    text = re.sub(r'http\S+', '', text.lower())  # Remove URLs
    text = re.sub(r'[^a-zA-Z0-9\s]+', '', text)
    return text.strip()

In [7]:
df['text'] = df['text'].apply(clean_text)
df.sentiment = df.sentiment.astype('category').cat.codes

In [11]:
df = df.iloc[:20000]

In [12]:
texts = df.text.to_list()

In [13]:
def getEmbedding(texts):
    
    embedding = []
    batch_size = 8
    for i in range(0, len(texts), batch_size):
        batch_texts = texts[i : i + batch_size]
        input = tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=100)

        with torch.no_grad():
            output = bert_model(**input)

        batch_embeddings = output.last_hidden_state.mean(dim=1)
        embedding.append(batch_embeddings)
    
    return torch.cat(embedding, dim=0) # Concatenate all batch embeddings

In [14]:
x = getEmbedding(texts)

In [37]:
y = torch.tensor(df.sentiment.values, dtype=torch.float32)

In [38]:
class TwitterSentimentDataset(Dataset):
    def __init__(self, text, sentiment):
        self.text = text
        self.sentiment = sentiment

    def __len__(self):
        return len(self.text)
    
    def __getitem__(self,  idx):
        return self.text[idx], self.sentiment[idx]

In [39]:
dataset = TwitterSentimentDataset(x, y)

In [40]:
len(dataset)

20000

In [41]:
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_data, test_data = random_split(dataset, [train_size, test_size])

In [42]:
train_loader = DataLoader(train_data, batch_size=16, shuffle=True)
test_loader = DataLoader(test_data, batch_size=16, shuffle=True)

In [48]:
class SentimentLSTM(nn.Module):
    def __init__(self):
        super(SentimentLSTM, self).__init__()
        
        self.dropout = nn.Dropout(0.3)
        self.lstm = nn.LSTM(768, 128, batch_first=True) # BERT embedding size is 768
        self.fc = nn.Linear(128, 16)

    def forward(self, x):
        x = self.dropout(x)
        _, (h, _) = self.lstm(x)
        x = self.fc(h[-1])
        return x

In [49]:
model = SentimentLSTM()
print(model)

SentimentLSTM(
  (dropout): Dropout(p=0.3, inplace=False)
  (lstm): LSTM(768, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=16, bias=True)
)


In [60]:
lr = 0.001
epoch_size = 50

In [61]:
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

In [62]:
model.train()
for epoch in range(epoch_size):
    
    total_loss = 0
    for input, label in train_loader:
        output = model(input)
        
        optimizer.zero_grad()
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch: {epoch+1}/{epoch_size} loss: {total_loss:.4f}")

Epoch: 1/50 loss: 24119.6032
Epoch: 2/50 loss: 24092.8790
Epoch: 3/50 loss: 24096.4334
Epoch: 4/50 loss: 24071.4737
Epoch: 5/50 loss: 24100.1118
Epoch: 6/50 loss: 24103.3069
Epoch: 7/50 loss: 24092.4586
Epoch: 8/50 loss: 24085.5506
Epoch: 9/50 loss: 24098.4200
Epoch: 10/50 loss: 24085.1602
Epoch: 11/50 loss: 24100.4579
Epoch: 12/50 loss: 24098.9492
Epoch: 13/50 loss: 24103.4760
Epoch: 14/50 loss: 24103.9509
Epoch: 15/50 loss: 24077.6087
Epoch: 16/50 loss: 24093.1512
Epoch: 17/50 loss: 24091.7109
Epoch: 18/50 loss: 24115.8184
Epoch: 19/50 loss: 24092.9552
Epoch: 20/50 loss: 24070.8037
Epoch: 21/50 loss: 24055.6760
Epoch: 22/50 loss: 24080.5722
Epoch: 23/50 loss: 24059.4506
Epoch: 24/50 loss: 24037.3594
Epoch: 25/50 loss: 24028.7738
Epoch: 26/50 loss: 24031.9893
Epoch: 27/50 loss: 24023.9490
Epoch: 28/50 loss: 24016.1735
Epoch: 29/50 loss: 24011.5329
Epoch: 30/50 loss: 24003.3819
Epoch: 31/50 loss: 23999.8160
Epoch: 32/50 loss: 24001.8473
Epoch: 33/50 loss: 23996.2914
Epoch: 34/50 loss: 

In [115]:
model.eval()
with torch.no_grad():
    
    total = 0
    correct = 0
    for text, label in test_loader:
        
        output = model(text)
        predict = torch.argmax(output)
        correct += (predict == label).sum().item()
        total += label.size(0)

    print(f"Model Accuracy is {correct / total * 100:.2f}%")

Model Accuracy is 1.18%
