In [None]:
# data manipulation and normalization
import numpy as np
import pandas as pd
import re


# Neural network utilities
import tensorflow as tf

from tensorflow.keras.layers import (
    Input,
    Embedding,     
    Dense,
    Flatten,
    Layer,
    
    Conv2D, 
    MaxPooling2D, 
    Reshape,
    
    Bidirectional,
    LSTM
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.ops import mean, outer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, set_random_seed

In [None]:
# Global Variables

INPUT_SIZE = 140          # tokens per text: padding length and model input width
EMBEDDING_SIZE = 100      # dimensions used by the embedding layer
TARGET_SIZE = 2           # number of output classes (negative / positive)
MAX_VOCABULARY = 10 ** 5  # vocabulary cap passed to the tokenizer and embedding layers

# NOTE(review): the compile() calls visible in this notebook pass the string
# 'adam' rather than this instance -- confirm whether OPTIMIZER is still needed.
OPTIMIZER = Adam(learning_rate=0.001)
EPOCHS = 10
BATCH  = 32
VALIDATION_SPLIT = 0.1
# Misspelled legacy name, kept as an alias because the fit() cells reference it.
VALIDATION_PLIT = VALIDATION_SPLIT

# Raw strings: "\w", "\S" and "\." are not valid string escapes, so the non-raw
# form only works by accident and emits a warning on recent Python versions.
MENTION_PATTERN = r"@\w+"
LINK_PATTERN = r"http\S+|www\.\S+"
# NOTE(review): the dataset is read with encoding="latin", which decodes every
# byte to a code point <= U+00FF, so this pattern may never match -- confirm.
EMOJI_PATTERN = re.compile("["
                           u"\U0001F600-\U0001F64F" # emoticons
                           u"\U0001F300-\U0001F5FF" # symbols and pictographs
                           u"\U0001F680-\U0001F6FF" # transport and map symbols
                           u"\U0001F1E0-\U0001F1FF" # flags
                           u"\U00002702-\U000027B0" # dingbats
                           # NOTE(review): very wide range; it already contains the
                           # dingbat and flag ranges above and many non-emoji code
                           # points (e.g. CJK ideographs) -- confirm it is intended.
                           u"\U000024C2-\U0001F251"
                           "]+",flags = re.UNICODE)

DATA_SET = "Sentiment140.csv" # path and headers of the dataset
HEADERS = [ 'target', 'ids', 'date', 'flag', 'user', 'text']

# Setting up TensorFlow

In [3]:
# Seed the random generators through the Keras helper (it covers Python's
# `random`, NumPy and the backend), so weight initialisation and NumPy-based
# shuffling start from the same state on every fresh run.
set_random_seed(123)
# Ask TensorFlow to run its ops deterministically so that, together with the
# fixed seed, repeated runs are intended to produce the same results.
tf.config.experimental.enable_op_determinism()

# Explore and Normalize Data

We are using the [Sentiment140](https://www.kaggle.com/datasets/kazanova/sentiment140) dataset, which contains the following 6 fields:

**target**: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive). No neutral entry is present.

**ids**: The id of the tweet ( 2087 )

**date**: the date of the tweet ( Sat May 16 23:58:44 UTC 2009 )

**flag**: The query (lyx). If there is no query, then this value is NO_QUERY.

**user**: the user that tweeted (robotickilldozr)

**text**: the text of the tweet (Lyx is cool)

Since we only care about the sentiment of the text, only the text and target fields will be used.

In [4]:
# The CSV is read without a header row, so column names come from HEADERS.
# Only the sentiment label and the tweet text are used downstream, so the
# other four columns are skipped at parse time instead of being loaded and dropped.
df = pd.read_csv(
    DATA_SET,
    names=HEADERS,
    usecols=['target', 'text'],
    encoding="latin",
)
df.head()

Unnamed: 0,target,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


Now that we have narrowed down the useful columns, let's normalize the data.

In [5]:
# Re-encode the polarity labels as class indices: 4 (positive) becomes 1,
# while 0 (negative) already has its final value and is left untouched.
print("dataframe values = ", df['target'].unique())
positive_rows = df['target'] == 4
df.loc[positive_rows, 'target'] = 1
print("normalized values = ", df['target'].unique())

dataframe values =  [0 4]
normalized values =  [0 1]


For the text, the normalization will include the following steps:
1. remove mentions
2. remove special characters (flags, emojis, etc)
3. remove links
4. remove punctuation
5. collapse repeated whitespace
6. set to lower case

In [6]:
import re

# Translation table that deletes punctuation characters. It is built once here
# rather than inside normalize_text, which is called once per row of the dataset.
_PUNCTUATION_TABLE = str.maketrans('', '', '!"#$%&\'()*+,-./:;<=>?[\\]^_`{|}~')


def normalize_text(text):
    """Return a cleaned, lower-cased copy of a tweet.

    Steps, in order: strip @mentions, strip characters matched by
    EMOJI_PATTERN, strip links, delete punctuation, collapse runs of
    whitespace to a single space (trimming both ends), then lower-case.

    Note that '@' is not in the deleted punctuation set, so an '@' that is
    not followed by a word character (and so is not a mention) is kept.

    Args:
        text: the raw tweet text (a ``str``).

    Returns:
        The normalized text as a ``str``.
    """
    # remove mentions
    text = re.sub(MENTION_PATTERN, "", text)
    # remove special symbols
    text = re.sub(EMOJI_PATTERN, "", text)
    # remove links
    text = re.sub(LINK_PATTERN, "", text)
    # remove punctuation
    text = text.translate(_PUNCTUATION_TABLE)
    # clean up extra whitespace left behind by the removals above
    text = re.sub(r"\s+", " ", text).strip()
    return text.lower()
    
# Clean every tweet in place; the raw text column is overwritten, so this cell
# should run exactly once after the data is loaded.
df['text'] = df['text'].map(normalize_text)
# Preview the first rows to check the result of the normalization.
df.head()

Unnamed: 0,target,text
0,0,awww thats a bummer you shoulda got david carr...
1,0,is upset that he cant update his facebook by t...
2,0,i dived many times for the ball managed to sav...
3,0,my whole body feels itchy and like its on fire
4,0,no its not behaving at all im mad why am i her...


Let's also shuffle it once.

In [7]:
# Shuffle all rows (frac = 1 samples the whole frame without replacement).
# No random_state is passed here, so re-running this cell reshuffles the data.
df = df.sample(frac = 1)
# Peek at the first 30 labels to confirm the classes are now interleaved.
df['target'].values[:30]

array([0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1,
       1, 1, 1, 0, 0, 0, 0, 0])

Now we tokenize the text dataset.

In [8]:
# Tokenization and vectorization: learn the vocabulary from the cleaned tweets
# (capped by MAX_VOCABULARY) and turn each tweet into a list of word indices.
texts = df['text']
tokenizer = Tokenizer(num_words=MAX_VOCABULARY)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

# Bring every sequence to exactly INPUT_SIZE entries, padding at the end.
padded = pad_sequences(
    sequences,
    maxlen=INPUT_SIZE,
    padding='post',
)

# Report the resulting (samples, INPUT_SIZE) dimensions.
print(padded.shape)

(1600000, 140)


In [9]:
# Model inputs: the padded index sequences.
X = padded
# Model targets: one-hot labels whose width follows TARGET_SIZE, the same
# constant that sizes the softmax output layers.
y = to_categorical(df['target'].values, num_classes=TARGET_SIZE)

# Multilayer Perceptron

In [10]:
# Baseline model: embed each token, flatten the embedded sequence into one
# vector and classify it with a single softmax layer.
mlp_layers = [
    Input(shape=(INPUT_SIZE,)),
    Embedding(input_dim=MAX_VOCABULARY, output_dim=EMBEDDING_SIZE),
    Flatten(),
    Dense(TARGET_SIZE, activation='softmax'),
]
mlp = Sequential(mlp_layers, name="MLP")
mlp.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
mlp.summary()

In [None]:
# Train the MLP, holding out a VALIDATION_PLIT fraction of the data for validation.
mlp.fit(
    x=X,
    y=y,
    batch_size=BATCH,
    epochs=EPOCHS,
    validation_split=VALIDATION_PLIT,
)

Epoch 1/10
[1m45000/45000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1252s[0m 28ms/step - accuracy: 0.7756 - loss: 0.4846 - val_accuracy: 0.7969 - val_loss: 0.4603
Epoch 2/10
[1m45000/45000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1201s[0m 27ms/step - accuracy: 0.8225 - loss: 0.4100 - val_accuracy: 0.7779 - val_loss: 0.5131
Epoch 3/10
[1m45000/45000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1201s[0m 27ms/step - accuracy: 0.8574 - loss: 0.3415 - val_accuracy: 0.7602 - val_loss: 0.5993
Epoch 4/10
[1m 1133/45000[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m19:22[0m 27ms/step - accuracy: 0.8645 - loss: 0.3263

# Convolutional Neural Network

We must assume a typo in the original paper: it is not possible to pass the (140, 100) input through a convolutional layer with a 100 x 100 kernel and then through a 20 x 20 max-pooling layer, because the convolution already shrinks the feature map below the size of the pooling window.

Thus we approximate it as follows:

In [None]:
# Convolutional model over the embedded sequence, treated as a one-channel image.
# Shapes below assume INPUT_SIZE = 140 and EMBEDDING_SIZE = 100.
cnn = Sequential([
    Input((INPUT_SIZE,)),
    Embedding(MAX_VOCABULARY, EMBEDDING_SIZE),

    # Add a trailing channel axis: (INPUT_SIZE, EMBEDDING_SIZE, 1).
    Reshape((INPUT_SIZE, EMBEDDING_SIZE, 1)),
    # The kernel spans the full embedding width, so with the default 'valid'
    # padding the feature map becomes (41, 1, 20).
    Conv2D(filters=20, kernel_size=(100, EMBEDDING_SIZE), activation='relu'),
    # Pool along the sequence axis only. After the convolution the width is 1,
    # so a 20-wide pooling window would leave no output at all; (20, 1)
    # gives a (2, 1, 20) feature map instead.
    MaxPooling2D(pool_size=(20, 1)),
    Flatten(),

    Dense(50, activation='relu'),
    Dense(TARGET_SIZE, activation='softmax'),
], name="CNN")
cnn.compile(optimizer='adam',
            loss='categorical_crossentropy',
            metrics=['accuracy'])
cnn.summary()

In [None]:
# Train the CNN, holding out a VALIDATION_PLIT fraction of the data for validation.
cnn.fit(
    x=X,
    y=y,
    batch_size=BATCH,
    epochs=EPOCHS,
    validation_split=VALIDATION_PLIT,
)

# Long Short-Term Memory

In [None]:
# Recurrent model: two stacked bidirectional LSTM layers followed by a dense head.
lstm = Sequential([
    Input((INPUT_SIZE,)),
    Embedding(MAX_VOCABULARY, EMBEDDING_SIZE),

    # The first recurrent layer returns the full sequence so that the second
    # one can consume it; the second returns only its final output.
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(128)),

    Dense(424, activation='relu'),
    # Output width follows TARGET_SIZE, as in the MLP and CNN models.
    Dense(units=TARGET_SIZE, activation='softmax'),
], name="LSTM")
lstm.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])
lstm.summary()

In [None]:
# Train the LSTM, holding out a VALIDATION_PLIT fraction of the data for validation.
lstm.fit(
    x=X,
    y=y,
    batch_size=BATCH,
    epochs=EPOCHS,
    validation_split=VALIDATION_PLIT,
)

# Swarm Characteristic Neural Network

In [None]:
class SwarmFeature(Layer):
    """Layer that scales the mean of its input by a learned vector.

    For every sample the layer averages the input over its last axis and
    multiplies that single value by each of the ``units`` trainable weights,
    so an input of shape (..., features) yields an output of shape (..., units).

    Args:
        units: number of trainable weights, i.e. the size of the output's
            last axis.
        **kwargs: standard ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, units=32, **kwargs):
        # Forward the standard Layer arguments instead of rejecting them.
        super().__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        """Create the trainable weight vector; its size does not depend on the input."""
        self.filter = self.add_weight(
            name="filter",
            shape=(self.units,),
            initializer="random_normal",
            trainable=True,
        )

    def call(self, inputs):
        """Return mean(inputs) over the last axis, broadcast against the weights."""
        # keepdims=True keeps a trailing axis of size 1 so the product
        # broadcasts to (..., units).
        return mean(inputs, axis=-1, keepdims=True) * self.filter

    def get_config(self):
        """Include ``units`` in the config so the layer can be re-created from it."""
        config = super().get_config()
        config.update({"units": self.units})
        return config

In [None]:
# Swarm model: the flattened embeddings pass through two SwarmFeature layers
# before the softmax classifier.
scnn = Sequential([
    Input((INPUT_SIZE,)),
    Embedding(MAX_VOCABULARY, EMBEDDING_SIZE),

    Flatten(),
    SwarmFeature(units=300),
    SwarmFeature(units=10),

    # Output width follows TARGET_SIZE, as in the MLP and CNN models.
    Dense(units=TARGET_SIZE, activation='softmax'),
], name="SCNN")
scnn.compile(optimizer='adam',
             loss='categorical_crossentropy',
             metrics=['accuracy'])
scnn.summary()

In [None]:
# Train the SCNN, holding out a VALIDATION_PLIT fraction of the data for validation.
scnn.fit(
    x=X,
    y=y,
    batch_size=BATCH,
    epochs=EPOCHS,
    validation_split=VALIDATION_PLIT,
)

# Comparative Analysis