# General Parameters

In [1]:
# data manipulation and normalization
import numpy as np
import pandas as pd
import re
from collections import Counter


# Neural Network utilits
import torch
from torch import nn, optim
from torch.utils.data import TensorDataset, DataLoader
import torch.nn.functional as F
from torchinfo import summary

import tqdm

# set gpu if available
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("gpu")

torch.set_default_device(device)
print(device)

cuda


In [2]:
# Global Variables

INPUT_SIZE = 140         # max tokens allowed in a text
EMBEDDING_SIZE = 100     # dimensions used by the embedding layer
TARGET_SIZE = 2          # features in the dataset
MAX_VOCABULARY = 10 ** 5 # max number of tokens possible

EPOCHS = 10
BATCH  = 32
VAL_SPLIT = 0.1

MENTION_PATTERN = "@\w+"
LINK_PATTERN = "http\S+|www\.\S+"
EMOJI_PATTERN = re.compile("["
                           u"\U0001F600-\U0001F64F" #emotions
                           u"\U0001F300-\U0001F5FF" #sumbols and pictographs
                           u"\U0001F680-\U0001F6FF" #transport and map symbols
                           u"\U0001F1E0-\U0001F1FF" #flags
                           u"\U00002702-\U000027B0"  
                           u"\U000024C2-\U0001F251" 
                           "]+",flags = re.UNICODE)

DATA_SET = "Sentiment140.csv" # path and headers of the dataset
HEADERS = [ 'target', 'ids', 'date', 'flag', 'user', 'text']

  MENTION_PATTERN = "@\w+"
  LINK_PATTERN = "http\S+|www\.\S+"


# Seting Determinism

In [3]:
torch.manual_seed(123)
np.random.seed(123)

# Explore and Normalize Data

We are using the [Sentimental 140](https://www.kaggle.com/datasets/kazanova/sentiment140) database, that contains the following 6 fields:

**target**: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive). No neutral entry is present.

**ids**: The id of the tweet ( 2087 )

**date**: the date of the tweet ( Sat May 16 23:58:44 UTC 2009 )

**flag**: The query (lyx). If there is no query, then this value is NO_QUERY.

**user**: the user that tweeted (robotickilldozr)

**text**: the text of the tweet (Lyx is cool)

Since we only care about the sentiment of the text, only the text and target fields will be used.

In [4]:
df = pd.read_csv(DATA_SET, names=HEADERS, encoding = "latin", )
df = df[['target', 'text']]
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Now that we narrow down the useful columns, lets normalize the data.

In [5]:
print("dataframe values = ", df['target'].unique())
df.loc[df['target'] == 0, 'target'] = 0
df.loc[df['target'] == 4, 'target'] = 1
print("normalized values = ", df['target'].unique())

dataframe values =  [0 4]
normalized values =  [0 1]


For the text, the normalization will include the following steps:
1. remove mentions
2. remove special characters (flags, emojis, etc)
3. remove links
4. remove punctuation
5. set to lower case

In [6]:
import re

def normalize_text(text):
    
    # remove metions
    text = re.sub(MENTION_PATTERN, "", text)
    # remove special symbols
    text = re.sub(EMOJI_PATTERN, "", text)
    # remove links
    text = re.sub(LINK_PATTERN, "", text)
    # remove punctuation
    text = text.translate(str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~'))
    # Clean up extra whitespace
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()
    
df['text'] = df['text'].map(normalize_text)
df.head()

Unnamed: 0,target,text
0,0,awww thats a bummer you shoulda got david carr...
1,0,is upset that he cant update his facebook by t...
2,0,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire
4,0,no its not behaving at all im mad why am i her...


Lets also mix it once

In [7]:
df = df.sample(frac = 1)
df['target'].values[:30]

array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0])

Now we tokenize the text dataset.

In [8]:
# Tokenization and vectorization
def tokenize_and_vectorize(texts, max_vocab_size, input_size):
    # Create a vocabulary dictionary (word -> index)
    word_counts = Counter(" ".join(texts).split())
    most_common = word_counts.most_common(max_vocab_size-1) # include empty word
    vocab = {word: idx + 1 for idx, (word, _) in enumerate(most_common)}  # start index at 1

    # Convert texts to sequences of word indices
    sequences = []
    for text in texts:
        sequence = [vocab.get(word, 0) for word in text.split()]  # 0 for unknown words
        sequences.append(sequence)

    # Padding sequences
    padded_sequences = [seq[:input_size] + [0] * (input_size - len(seq)) if len(seq) < input_size else seq[:input_size] for seq in sequences]
    return np.array(padded_sequences), vocab

# Tokenize and vectorize
X, vocab = tokenize_and_vectorize(df['text'], MAX_VOCABULARY, INPUT_SIZE)
y = df['target'].values

# Convert X and y to PyTorch tensors
X_tensor = torch.tensor(X, dtype=torch.long)
y_tensor = torch.tensor(y, dtype=torch.long)

# Print dimensions
print(X_tensor.shape, y_tensor.shape)

torch.Size([1600000, 140]) torch.Size([1600000])


In [9]:
# since we already shuffled the data, lets just split it on data loaders

train_loader = DataLoader(
    TensorDataset(
        X_tensor[:int(-len(X_tensor)*VAL_SPLIT)], 
        y_tensor[:int(-len(X_tensor)*VAL_SPLIT)]), 
    batch_size=BATCH)

valid_loader = DataLoader(
    TensorDataset(
        X_tensor[int(-len(X_tensor)*VAL_SPLIT):], 
        y_tensor[int(-len(X_tensor)*VAL_SPLIT):]), 
    batch_size=BATCH)

# Multiple Layer Perceptron

In [10]:
# Define the MLP model (same as before)
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.embedding = nn.Embedding(MAX_VOCABULARY, EMBEDDING_SIZE)
        self.flatten = nn.Flatten()
        self.fc = nn.Linear(INPUT_SIZE * EMBEDDING_SIZE, TARGET_SIZE)

    def forward(self, x):
        x = self.embedding(x)
        x = self.flatten(x)
        x = self.fc(x)
        return x

In [11]:
summary(MLP())

Layer (type:depth-idx)                   Param #
MLP                                      --
├─Embedding: 1-1                         10,000,000
├─Flatten: 1-2                           --
├─Linear: 1-3                            28,002
Total params: 10,028,002
Trainable params: 10,028,002
Non-trainable params: 0

# Convolutional Neural Network

In [12]:
class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.embedding = nn.Embedding(MAX_VOCABULARY, EMBEDDING_SIZE)

        self.conv = nn.Conv2d(
            in_channels=1, 
            out_channels=20, 
            kernel_size=(100, 100))

        self.pool = nn.MaxPool2d(kernel_size=(20, 20), stride=1)

        self.flatten = nn.Flatten()
        self.fc = nn.Linear(22, TARGET_SIZE)

    def forward(self, x):
        x = self.embedding(x)  
        #print(x.shape)
        x = x.unsqueeze(1)
        #print(x.shape)
        x = self.conv(x).squeeze(3)
        #print(x.shape)
        #x = torch.permute(x,(0,3,1,2))
        #print(x.shape)
        x = self.pool(x)
        #print(x.shape)

        x = self.flatten(x)
        #print(x.shape)

        x = F.softmax(self.fc(x), dim=1)  # Output layer with softmax for probabilities

        return x

In [13]:
summary(CNN())

Layer (type:depth-idx)                   Param #
CNN                                      --
├─Embedding: 1-1                         10,000,000
├─Conv2d: 1-2                            200,020
├─MaxPool2d: 1-3                         --
├─Flatten: 1-4                           --
├─Linear: 1-5                            46
Total params: 10,200,066
Trainable params: 10,200,066
Non-trainable params: 0

# Long-Short Term Memory

In [14]:
class LSTM(nn.Module):
    def __init__(self):
        super(LSTM, self).__init__()
        self.embedding = nn.Embedding(MAX_VOCABULARY, EMBEDDING_SIZE)  # Embedding layer
        
        # Bidirectional LSTM layers
        self.lstm = nn.LSTM(input_size=EMBEDDING_SIZE, 
                            hidden_size=128,
                            num_layers=2,
                            batch_first=True,
                            bidirectional=True)
        
        self.fc = nn.Linear(2 * 128, TARGET_SIZE)

    def forward(self, x):
        # Embedding layer
        x = self.embedding(x)  # Shape: [batch_size, seq_length, embedding_size]

        # Staked LSTM layers
        lstm_out, (h_n, c_n) = self.lstm(x)  # Shape: [batch_size, seq_length, 2 * hidden_size]

        # Use the last hidden state for classification
        x = torch.cat((h_n[-2], h_n[-1]), dim=1)  # [batch_size, hidden_dim * 2]
        
        # Fully connected layers
        x = F.softmax(self.fc(x), dim=1)
        return x

In [15]:
summary(LSTM())

Layer (type:depth-idx)                   Param #
LSTM                                     --
├─Embedding: 1-1                         10,000,000
├─LSTM: 1-2                              630,784
├─Linear: 1-3                            514
Total params: 10,631,298
Trainable params: 10,631,298
Non-trainable params: 0

# Swarm Characteristic Neural Network

In [16]:

class SwarmFilter(nn.Module):
    def __init__(self, units=32):
        super(SwarmFilter, self).__init__()
        self.units = units
        self.filter = nn.Parameter(torch.randn(units))  # Trainable parameter with random initialization

    def forward(self, x):
        # Compute the mean along the last axis, keeping the dimensions
        mean_values = torch.mean(x, dim=-1, keepdim=True)
        return mean_values * self.filter

In [17]:
class SCNN(nn.Module):
    def __init__(self):
        super(SCNN, self).__init__()
        self.embedding = nn.Embedding(MAX_VOCABULARY, EMBEDDING_SIZE)
        self.flatten = nn.Flatten()
        self.swarm1 = SwarmFilter(units=300)
        self.swarm2 = SwarmFilter(units=10)
        self.fc = nn.Linear(10, 2) 

    def forward(self, x):
        x = self.embedding(x)  
        x = self.flatten(x)     
        x = self.swarm1(x)     
        x = self.swarm2(x)     
        x = self.fc(x)         
        return F.softmax(x, dim=1)

In [18]:
summary(SCNN())

Layer (type:depth-idx)                   Param #
SCNN                                     --
├─Embedding: 1-1                         10,000,000
├─Flatten: 1-2                           --
├─SwarmFilter: 1-3                       300
├─SwarmFilter: 1-4                       10
├─Linear: 1-5                            22
Total params: 10,000,332
Trainable params: 10,000,332
Non-trainable params: 0

# Running the models

In [19]:
def run_model(model, epochs=EPOCHS):
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    criterion = nn.CrossEntropyLoss()
    
    for epoch in range(EPOCHS):
        model.train()  # Set the model to training mode
        running_loss = 0.0
        correct = 0
        total = 0
    
        print(f"Epoch {epoch + 1}/{EPOCHS}")
        
        for _, data in tqdm.tqdm(enumerate(train_loader, 0), unit="batch", total=len(train_loader)):
            inputs, labels = data
            # Zero the gradients
            optimizer.zero_grad()
    
            # Forward pass
            outputs = model(inputs)
    
            # Calculate the loss
            loss = criterion(outputs, labels)
    
            # Backward pass (compute gradients)
            loss.backward()
    
            # Update the model parameters
            optimizer.step()
    
            # Calculate accuracy (for tracking)
            _, predicted = torch.max(outputs, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
            # Track the running loss
            running_loss += loss.item()
    
        # Print training stats for the epoch
        print(f"\tTraining Loss: {running_loss/len(train_loader):.4f}, Accuracy: {100 * correct/total:.2f}%")
    
        # Validation loop
        model.eval()  # Set the model to evaluation mode
        val_loss = 0.0
        val_correct = 0
        val_total = 0
    
        with torch.no_grad():  # Disable gradient calculation during validation
            for inputs, labels in valid_loader:
                # Forward pass
                outputs = model(inputs)
    
                # Calculate the loss
                loss = criterion(outputs, labels)
    
                # Calculate accuracy
                _, predicted = torch.max(outputs, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()
    
                # Track validation loss
                val_loss += loss.item()
    
        # Print validation stats
        print(f"\tValidation Loss: {val_loss/len(valid_loader):.4f}, Accuracy: {100 * val_correct/val_total:.2f}%")

# Comparative Analyse

In [20]:
run_model(MLP())

Epoch 1/10


100%|█████████████████████████████████| 45000/45000 [02:53<00:00, 259.27batch/s]


	Training Loss: 0.5483, Accuracy: 75.41%
	Validation Loss: 0.5120, Accuracy: 77.57%
Epoch 2/10


100%|█████████████████████████████████| 45000/45000 [02:55<00:00, 256.82batch/s]


	Training Loss: 0.4803, Accuracy: 79.19%
	Validation Loss: 0.5268, Accuracy: 77.56%
Epoch 3/10


100%|█████████████████████████████████| 45000/45000 [02:56<00:00, 255.24batch/s]


	Training Loss: 0.4436, Accuracy: 81.08%
	Validation Loss: 0.5592, Accuracy: 76.96%
Epoch 4/10


100%|█████████████████████████████████| 45000/45000 [02:56<00:00, 254.92batch/s]


	Training Loss: 0.4088, Accuracy: 82.82%
	Validation Loss: 0.6023, Accuracy: 76.28%
Epoch 5/10


100%|█████████████████████████████████| 45000/45000 [02:57<00:00, 253.34batch/s]


	Training Loss: 0.3760, Accuracy: 84.42%
	Validation Loss: 0.6517, Accuracy: 75.69%
Epoch 6/10


100%|█████████████████████████████████| 45000/45000 [02:56<00:00, 254.64batch/s]


	Training Loss: 0.3464, Accuracy: 85.83%
	Validation Loss: 0.7060, Accuracy: 75.07%
Epoch 7/10


100%|█████████████████████████████████| 45000/45000 [02:55<00:00, 256.08batch/s]


	Training Loss: 0.3203, Accuracy: 87.07%
	Validation Loss: 0.7655, Accuracy: 74.60%
Epoch 8/10


100%|█████████████████████████████████| 45000/45000 [02:56<00:00, 255.37batch/s]


	Training Loss: 0.2976, Accuracy: 88.10%
	Validation Loss: 0.8277, Accuracy: 74.05%
Epoch 9/10


100%|█████████████████████████████████| 45000/45000 [02:56<00:00, 255.18batch/s]


	Training Loss: 0.2780, Accuracy: 89.01%
	Validation Loss: 0.8898, Accuracy: 73.64%
Epoch 10/10


100%|█████████████████████████████████| 45000/45000 [02:55<00:00, 256.57batch/s]


	Training Loss: 0.2612, Accuracy: 89.76%
	Validation Loss: 0.9526, Accuracy: 73.23%


In [21]:
run_model(CNN())

Epoch 1/10


100%|██████████████████████████████████| 45000/45000 [28:14<00:00, 26.56batch/s]


	Training Loss: 0.5375, Accuracy: 76.35%
	Validation Loss: 0.5196, Accuracy: 78.54%
Epoch 2/10


100%|██████████████████████████████████| 45000/45000 [28:14<00:00, 26.55batch/s]


	Training Loss: 0.5081, Accuracy: 79.72%
	Validation Loss: 0.5123, Accuracy: 79.32%
Epoch 3/10


100%|██████████████████████████████████| 45000/45000 [28:14<00:00, 26.56batch/s]


	Training Loss: 0.5002, Accuracy: 80.60%
	Validation Loss: 0.5150, Accuracy: 79.17%
Epoch 4/10


100%|██████████████████████████████████| 45000/45000 [28:15<00:00, 26.54batch/s]


	Training Loss: 0.4941, Accuracy: 81.29%
	Validation Loss: 0.5157, Accuracy: 79.12%
Epoch 5/10


100%|██████████████████████████████████| 45000/45000 [28:15<00:00, 26.53batch/s]


	Training Loss: 0.4888, Accuracy: 81.89%
	Validation Loss: 0.5133, Accuracy: 79.38%
Epoch 6/10


100%|██████████████████████████████████| 45000/45000 [28:15<00:00, 26.54batch/s]


	Training Loss: 0.4841, Accuracy: 82.41%
	Validation Loss: 0.5132, Accuracy: 79.43%
Epoch 7/10


100%|██████████████████████████████████| 45000/45000 [28:15<00:00, 26.54batch/s]


	Training Loss: 0.4798, Accuracy: 82.90%
	Validation Loss: 0.5130, Accuracy: 79.47%
Epoch 8/10


100%|██████████████████████████████████| 45000/45000 [28:15<00:00, 26.54batch/s]


	Training Loss: 0.4760, Accuracy: 83.33%
	Validation Loss: 0.5138, Accuracy: 79.42%
Epoch 9/10


100%|██████████████████████████████████| 45000/45000 [28:17<00:00, 26.51batch/s]


	Training Loss: 0.4726, Accuracy: 83.70%
	Validation Loss: 0.5142, Accuracy: 79.37%
Epoch 10/10


100%|██████████████████████████████████| 45000/45000 [28:16<00:00, 26.52batch/s]


	Training Loss: 0.4696, Accuracy: 84.03%
	Validation Loss: 0.5151, Accuracy: 79.35%


In [22]:
run_model(LSTM())

Epoch 1/10


100%|██████████████████████████████████| 45000/45000 [10:58<00:00, 68.29batch/s]


	Training Loss: 0.4999, Accuracy: 80.10%
	Validation Loss: 0.4822, Accuracy: 82.03%
Epoch 2/10


100%|██████████████████████████████████| 45000/45000 [10:56<00:00, 68.58batch/s]


	Training Loss: 0.4712, Accuracy: 83.33%
	Validation Loss: 0.4782, Accuracy: 82.57%
Epoch 3/10


100%|██████████████████████████████████| 45000/45000 [10:56<00:00, 68.54batch/s]


	Training Loss: 0.4599, Accuracy: 84.63%
	Validation Loss: 0.4782, Accuracy: 82.62%
Epoch 4/10


100%|██████████████████████████████████| 45000/45000 [10:56<00:00, 68.50batch/s]


	Training Loss: 0.4527, Accuracy: 85.47%
	Validation Loss: 0.4788, Accuracy: 82.62%
Epoch 5/10


100%|██████████████████████████████████| 45000/45000 [10:56<00:00, 68.52batch/s]


	Training Loss: 0.4476, Accuracy: 86.06%
	Validation Loss: 0.4789, Accuracy: 82.66%
Epoch 6/10


100%|██████████████████████████████████| 45000/45000 [10:56<00:00, 68.50batch/s]


	Training Loss: 0.4445, Accuracy: 86.40%
	Validation Loss: 0.4798, Accuracy: 82.62%
Epoch 7/10


100%|██████████████████████████████████| 45000/45000 [10:57<00:00, 68.49batch/s]


	Training Loss: 0.4424, Accuracy: 86.63%
	Validation Loss: 0.4802, Accuracy: 82.59%
Epoch 8/10


100%|██████████████████████████████████| 45000/45000 [10:57<00:00, 68.48batch/s]


	Training Loss: 0.4404, Accuracy: 86.85%
	Validation Loss: 0.4812, Accuracy: 82.47%
Epoch 9/10


100%|██████████████████████████████████| 45000/45000 [10:57<00:00, 68.47batch/s]


	Training Loss: 0.4393, Accuracy: 86.96%
	Validation Loss: 0.4807, Accuracy: 82.52%
Epoch 10/10


100%|██████████████████████████████████| 45000/45000 [10:57<00:00, 68.40batch/s]


	Training Loss: 0.4384, Accuracy: 87.07%
	Validation Loss: 0.4810, Accuracy: 82.51%


In [23]:
run_model(SCNN())

Epoch 1/10


100%|█████████████████████████████████| 45000/45000 [03:11<00:00, 234.97batch/s]


	Training Loss: 0.5267, Accuracy: 77.28%
	Validation Loss: 0.5086, Accuracy: 79.62%
Epoch 2/10


100%|█████████████████████████████████| 45000/45000 [03:10<00:00, 236.56batch/s]


	Training Loss: 0.5028, Accuracy: 80.21%
	Validation Loss: 0.5061, Accuracy: 79.88%
Epoch 3/10


100%|█████████████████████████████████| 45000/45000 [03:11<00:00, 235.43batch/s]


	Training Loss: 0.4982, Accuracy: 80.72%
	Validation Loss: 0.5064, Accuracy: 79.84%
Epoch 4/10


100%|█████████████████████████████████| 45000/45000 [03:10<00:00, 236.23batch/s]


	Training Loss: 0.4954, Accuracy: 81.04%
	Validation Loss: 0.5054, Accuracy: 79.94%
Epoch 5/10


100%|█████████████████████████████████| 45000/45000 [03:10<00:00, 236.41batch/s]


	Training Loss: 0.4933, Accuracy: 81.27%
	Validation Loss: 0.5044, Accuracy: 80.05%
Epoch 6/10


100%|█████████████████████████████████| 45000/45000 [03:10<00:00, 236.67batch/s]


	Training Loss: 0.4918, Accuracy: 81.46%
	Validation Loss: 0.5040, Accuracy: 80.07%
Epoch 7/10


100%|█████████████████████████████████| 45000/45000 [03:13<00:00, 232.61batch/s]


	Training Loss: 0.4905, Accuracy: 81.61%
	Validation Loss: 0.5038, Accuracy: 80.12%
Epoch 8/10


100%|█████████████████████████████████| 45000/45000 [03:10<00:00, 235.98batch/s]


	Training Loss: 0.4893, Accuracy: 81.74%
	Validation Loss: 0.5036, Accuracy: 80.16%
Epoch 9/10


100%|█████████████████████████████████| 45000/45000 [03:10<00:00, 235.89batch/s]


	Training Loss: 0.4884, Accuracy: 81.84%
	Validation Loss: 0.5037, Accuracy: 80.17%
Epoch 10/10


100%|█████████████████████████████████| 45000/45000 [03:10<00:00, 236.03batch/s]


	Training Loss: 0.4875, Accuracy: 81.94%
	Validation Loss: 0.5039, Accuracy: 80.14%
