<a href="https://colab.research.google.com/github/NeerajHazarika/ENITS_DataMining_WS24/blob/main/Week_12/Assignment_12.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Assingment 12: Twitter Sentiment Classification with Transformers

We work on the Kaggle challenge on [Tweet Sentiments](https://www.kaggle.com/c/tweet-sentiment-extraction/overview).

* Task: classify tweets into positive, neutral or negative sentiments.




In [1]:
#get the data
!git clone https://github.com/keuperj/DATA.git

Cloning into 'DATA'...
remote: Enumerating objects: 126, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (28/28), done.[K
remote: Total 126 (delta 11), reused 39 (delta 11), pack-reused 87 (from 1)[K
Receiving objects: 100% (126/126), 185.56 MiB | 9.72 MiB/s, done.
Resolving deltas: 100% (32/32), done.
Updating files: 100% (86/86), done.


In [2]:
import pandas as pd

In [3]:
train = pd.read_csv('DATA/twitter_train.csv' , encoding = "ISO-8859-1")
test = pd.read_csv('DATA/twitter_test.csv' , encoding = "ISO-8859-1")

In [4]:
len(train)

27481

In [5]:
len(test)

3534

In [6]:
train.head()

Unnamed: 0,textID,text,selected_text,sentiment
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative
2,088c60f138,my boss is bullying me...,bullying me,negative
3,9642c003ef,what interview! leave me alone,leave me alone,negative
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative


In [7]:
test.head()

Unnamed: 0,textID,text,sentiment
0,f87dea47db,Last session of the day http://twitpic.com/67ezh,neutral
1,96d74cb729,Shanghai is also really exciting (precisely -...,positive
2,eee518ae67,"Recession hit Veronique Branquinho, she has to...",negative
3,01082688c6,happy bday!,positive
4,33987a8ee5,http://twitpic.com/4w75p - I like it!!,positive


In [8]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [9]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [10]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

In [26]:
import pandas as pd
from collections import Counter

def encode_by_word_frequency(df, df_test, text_column='text'):
    """Encodes text data in a DataFrame based on word frequency.

    Args:
        df: The input DataFrame.
        text_column: The name of the column containing the text data.

    Returns:
        A new DataFrame with an added 'encoded_text' column.
    """

    # Combine all text data into a single string
    all_text = ' '.join(df[text_column].astype(str))  # Handle potential NaN values

    # Count word frequencies
    word_counts = Counter(all_text.split())

    # Create a mapping from word to its frequency rank
    sorted_word_counts = sorted(word_counts.items(), key=lambda x: x[1], reverse=True)
    word_to_rank = {word: rank + 1 for rank, (word, _) in enumerate(sorted_word_counts)}

    print("max word encoding", sorted_word_counts[0])

    def encode_text(text):
        # Handle potential NaN values
        text = str(text)
        # encoded = ' '.join(str(word_to_rank.get(word, 0)) for word in text.split()) # Map words to their ranks or 0 if not found

        encoded_text = []

        for w in text.split():
          encoded_text.append(word_to_rank.get(w, 0))

        return encoded_text

    encoded_y = []

    for s in df['sentiment']:
      if s == 'positive':
        encoded_y.append(1)
      elif s == 'neutral':
        encoded_y.append(0)
      elif s == 'negative':
        encoded_y.append(2)

    encoded_x = []

    for s in df[text_column]:
      encoded_x.append(encode_text(s))

    encoded_test_y = []

    for s in df['sentiment']:
      if s == 'positive':
        encoded_test_y.append(1)
      elif s == 'neutral':
        encoded_test_y.append(0)
      elif s == 'negative':
        encoded_test_y.append(2)

    encoded_test_x = []

    for s in df['text']:
      encoded_test_x.append(encode_text(s))

    return encoded_x, encoded_y, encoded_test_x, encoded_test_y


x_train_encoded, y_train_encoded, x_test_encoded, y_test_encoded = encode_by_word_frequency(train, test, text_column='selected_text')

max word encoding ('to', 5190)


In [27]:
x_train_encoded[:5]

[[451, 15, 9521, 75, 2, 140, 43],
 [3490, 2241],
 [9522, 16],
 [316, 16, 633],
 [5888, 13, 729]]

In [28]:
y_train_encoded[:5]

[0, 2, 2, 2, 2]

In [29]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(x_train_encoded, y_train_encoded, test_size=0.2, random_state=42) # Adjust test_size as needed

print(f"x_train shape: {len(x_train)}")
print(f"x_val shape: {len(x_val)}")
print(f"y_train shape: {len(y_train)}")
print(f"y_val shape: {len(y_val)}")

x_train shape: 21984
x_val shape: 5497
y_train shape: 21984
y_val shape: 5497


In [30]:
vocab_size = 40000  # Only consider the top 40k words
maxlen = 200  # Only consider the first 200 words of each tweet
x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen)
x_val = keras.utils.pad_sequences(x_val, maxlen=maxlen)

In [31]:
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x, training=False)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(20, activation="relu")(x)
x = layers.Dropout(0.1)(x)
outputs = layers.Dense(3, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [32]:
# Convert x_train and x_val to TensorFlow Tensors
x_train = tf.convert_to_tensor(x_train)
x_val = tf.convert_to_tensor(x_val)

# Convert y_train and y_val to TensorFlow Tensors
y_train = tf.convert_to_tensor(y_train)
y_val = tf.convert_to_tensor(y_val)

In [33]:
model.compile(
    optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
history = model.fit(
    x_train, y_train, batch_size=32, epochs=20, validation_data=(x_val, y_val)
)

Epoch 1/20
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 7ms/step - accuracy: 0.3913 - loss: 1.0948 - val_accuracy: 0.5414 - val_loss: 0.9938
Epoch 2/20
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 4ms/step - accuracy: 0.5205 - loss: 0.9497 - val_accuracy: 0.6087 - val_loss: 0.8225
Epoch 3/20
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.5976 - loss: 0.8041 - val_accuracy: 0.5805 - val_loss: 0.8403
Epoch 4/20
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 5ms/step - accuracy: 0.6320 - loss: 0.7199 - val_accuracy: 0.5989 - val_loss: 0.8201
Epoch 5/20
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.6681 - loss: 0.6381 - val_accuracy: 0.6140 - val_loss: 0.8586
Epoch 6/20
[1m687/687[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.6901 - loss: 0.5876 - val_accuracy: 0.5991 - val_loss: 0.9432
Epoch 7/20
[1m687/687[0m

In [35]:
import numpy as np

# Assuming x_test_encoded and y_test_encoded are defined from previous code execution.
# Pad the test sequences
x_test = keras.utils.pad_sequences(x_test_encoded, maxlen=maxlen)

# Convert x_test and y_test_encoded to TensorFlow Tensors
x_test = tf.convert_to_tensor(x_test)
y_test = tf.convert_to_tensor(y_test_encoded)

# Evaluate the model on the test data
loss, accuracy = model.evaluate(x_test, y_test, verbose=2)
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")

859/859 - 3s - 3ms/step - accuracy: 0.6662 - loss: 4.7203
Test Loss: 4.7203
Test Accuracy: 0.6662
