## Installing kaggle packages

In [None]:
TRAINING = False

In [None]:
!pip install kaggle > /dev/null


In [None]:
from google.colab import drive
drive.mount('/gdrive')

In [None]:
!mkdir ~/.kaggle
!cp /gdrive/MyDrive/Kaggle/kaggle.json ~/.kaggle/

In [None]:
!kaggle competitions download -c tweet-sentiment-analysis-ssn

In [None]:
!if [[ -e train.csv ]]; then true; else unzip train.csv.zip; fi

# Importing packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from collections import Counter

In [None]:
train_df = pd.read_csv('train.csv',encoding='iso-8859-1')

In [None]:
train_df.head()

In [None]:
train_df.Sentiment.unique()

In [None]:
train_df.shape

In [None]:
# Integer code for each of the five sentiment labels, ordered from most negative (0)
# to most positive (4).
mapping = dict(zip(
    ['Extremely Negative', 'Negative', 'Neutral', 'Positive', 'Extremely Positive'],
    range(5),
))
# Inverse lookup (integer code -> sentiment label), used to decode predicted class ids.
reverse_mapping = {code: label for label, code in mapping.items()}

In [None]:
train_df.Sentiment = train_df.Sentiment.map(lambda x: mapping.get(x))

In [None]:
train_df.head()

In [None]:
counts=Counter(train_df.Sentiment)

In [None]:
print(counts)

In [None]:
fig, ax = plt.subplots()
ax.set_xlabel('Sentiment', fontsize=15)
ax.set_ylabel('Count', fontsize=15)

train_df.Sentiment.hist(ax=ax,grid=False, bins=20)
plt.show()

In [None]:
# Share of each sentiment class in the training data. Wedges are labelled with the
# sentiment name rather than the numeric class id rendered as a float ("3.0");
# ids without a known name fall back to their string form.
plt.pie([float(v) for v in counts.values()],
        labels=[reverse_mapping.get(k, str(k)) for k in counts],
        autopct=None, radius=2)
plt.show()

## Cleaning the data

Reference: [NLP 101: Tweet Sentiment Analysis Preprocessing (Kaggle notebook)](https://www.kaggle.com/redwankarimsony/nlp-101-tweet-sentiment-analysis-preprocessing)


In [None]:
import nltk
nltk.download('stopwords')
import re                                  
import string                             
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer  

In [None]:
# English stop words from NLTK (the 'stopwords' corpus is downloaded in the cell above).
stopwords_english = stopwords.words('english')


def process_tweet(tweet: str, get_tokens=False):
  """Clean a raw tweet and optionally tokenize it.

  Cleaning steps, applied in order:
    1. drop a leading "RT" retweet marker,
    2. drop hyperlinks,
    3. drop '#' characters while keeping the hashtag text,
    4. drop every digit character.

  Args:
    tweet: Raw tweet text.
    get_tokens: When False (default) the cleaned string is returned. When True
      the cleaned text is lower-cased and tokenized with NLTK's TweetTokenizer,
      English stop words and punctuation tokens are removed, and every
      @mention is replaced by the placeholder token '@handle'.

  Returns:
    The cleaned tweet as a ``str``, or a ``list`` of token strings when
    ``get_tokens`` is True.
  """
  # Strip the old-style retweet prefix "RT " at the very start of the tweet.
  tweet2 = re.sub(r'^RT[\s]+', '', tweet)

  # Remove hyperlinks.
  # NOTE(review): the greedy `.*` also removes any text that follows the URL on
  # the same line. Left unchanged because the saved weights loaded later in this
  # notebook were presumably trained on text cleaned this way -- confirm before
  # tightening the pattern (e.g. to r'https?://\S+').
  tweet2 = re.sub(r'https?:\/\/.*[\r\n]*', '', tweet2)

  # Remove only the '#' sign: the hashtag word itself carries useful information.
  tweet2 = re.sub(r'#', '', tweet2)

  # Remove every digit character.
  tweet2 = re.sub(r'[0-9]', '', tweet2)

  if not get_tokens:
    return tweet2

  # Keep the tokenizer local. The previous version assigned it to the global name
  # `tokenizer`, which silently replaced the notebook-level BERT tokenizer.
  tweet_tokenizer = TweetTokenizer(preserve_case=False,
                                   strip_handles=False,
                                   reduce_len=True)

  tweets_clean = []
  for word in tweet_tokenizer.tokenize(tweet2):
    # Skip stop words and punctuation tokens.
    if word in stopwords_english or word in string.punctuation:
      continue
    # Collapse every user mention into one shared placeholder token.
    tweets_clean.append('@handle' if word[0] == '@' else word)
  return tweets_clean

In [None]:
train_df.OriginalTweet = train_df.OriginalTweet.map(lambda original_tweet: process_tweet(original_tweet))
train_df.head()

In [None]:
print('Number of data points in training set: ', train_df.shape[0])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test = train_test_split(train_df, test_size=0.2, random_state=42)

In [None]:
# Rename by reassignment instead of inplace=True: the frames returned by
# train_test_split are derived from train_df, so mutating them in place is fragile,
# and reassignment is the form pandas recommends.
_bert_column_names = {'OriginalTweet': 'DATA_COLUMN', 'Sentiment': 'LABEL_COLUMN'}
X_train = X_train.rename(columns=_bert_column_names)
X_test = X_test.rename(columns=_bert_column_names)


In [None]:
X_train = X_train[['DATA_COLUMN', 'LABEL_COLUMN']]
X_test = X_test[['DATA_COLUMN', 'LABEL_COLUMN']]


In [None]:
X_train

In [None]:
!pip install transformers --upgrade > /dev/null

In [None]:
from transformers import BertTokenizer, TFBertForSequenceClassification
from transformers import InputExample, InputFeatures
import tensorflow as tf

In [None]:
def convert_data_to_examples(train, test, DATA_COLUMN, LABEL_COLUMN):
  """Wrap every row of the train and test frames in a transformers ``InputExample``.

  Args:
    train: DataFrame holding the training rows.
    test: DataFrame holding the validation rows.
    DATA_COLUMN: Name of the column containing the text.
    LABEL_COLUMN: Name of the column containing the label.

  Returns:
    A ``(train_InputExamples, validation_InputExamples)`` tuple; each element is
    the result of a row-wise ``DataFrame.apply`` producing one InputExample per row.
  """
  def _row_to_example(row):
    # guid is a bookkeeping id that is unused here; text_b stays None because each
    # example is a single piece of text.
    return InputExample(guid=None,
                        text_a=row[DATA_COLUMN],
                        text_b=None,
                        label=row[LABEL_COLUMN])

  train_InputExamples = train.apply(_row_to_example, axis=1)
  validation_InputExamples = test.apply(_row_to_example, axis=1)
  return train_InputExamples, validation_InputExamples


  
def convert_examples_to_tf_dataset(examples, tokenizer, max_length=128):
    """Tokenize InputExamples and expose them as an unbatched ``tf.data.Dataset``.

    Args:
        examples: Iterable of objects with ``text_a`` and ``label`` attributes
            (``InputExample`` instances in this notebook).
        tokenizer: Tokenizer providing ``encode_plus`` (a ``BertTokenizer`` here).
        max_length: Every sequence is truncated and padded to exactly this many
            tokens.

    Returns:
        A ``tf.data.Dataset`` yielding ``(inputs, label)`` pairs, where ``inputs``
        is a dict with int32 ``input_ids``, ``attention_mask`` and
        ``token_type_ids`` vectors and ``label`` is an int64 scalar.
    """
    features = []  # InputFeatures, tokenized eagerly so the generator below is cheap

    for e in examples:
        input_dict = tokenizer.encode_plus(
            e.text_a,
            add_special_tokens=True,
            max_length=max_length,
            return_token_type_ids=True,
            return_attention_mask=True,
            # `pad_to_max_length=True` is deprecated; padding="max_length" is the
            # supported way to pad (on the right, by default) up to max_length.
            padding="max_length",
            truncation=True,  # cut sequences longer than max_length
        )

        features.append(
            InputFeatures(
                input_ids=input_dict["input_ids"],
                attention_mask=input_dict["attention_mask"],
                token_type_ids=input_dict["token_type_ids"],
                label=e.label,
            )
        )

    def gen():
        # Yield one (model inputs, label) pair per tokenized example.
        for f in features:
            yield (
                {
                    "input_ids": f.input_ids,
                    "attention_mask": f.attention_mask,
                    "token_type_ids": f.token_type_ids,
                },
                f.label,
            )

    return tf.data.Dataset.from_generator(
        gen,
        # Element dtypes ...
        ({"input_ids": tf.int32, "attention_mask": tf.int32, "token_type_ids": tf.int32}, tf.int64),
        # ... and element shapes: variable-length vectors plus a scalar label.
        (
            {
                "input_ids": tf.TensorShape([None]),
                "attention_mask": tf.TensorShape([None]),
                "token_type_ids": tf.TensorShape([None]),
            },
            tf.TensorShape([]),
        ),
    )


DATA_COLUMN = 'DATA_COLUMN'
LABEL_COLUMN = 'LABEL_COLUMN'


In [None]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
train_InputExamples, validation_InputExamples = convert_data_to_examples(X_train, X_test, DATA_COLUMN, LABEL_COLUMN)

# The validation set is required in BOTH modes: model.fit() receives it as
# validation_data when training, and model.predict() runs on it afterwards.
# Building it only in evaluation mode left it undefined during training.
validation_data = convert_examples_to_tf_dataset(list(validation_InputExamples), tokenizer)
validation_data = validation_data.batch(32)

if TRAINING:
  # The training split is only tokenized when we are actually going to train.
  train_data = convert_examples_to_tf_dataset(list(train_InputExamples), tokenizer)
  train_data = train_data.shuffle(100).batch(32).repeat(2)
else:
  print('Warning! Model in evaluation mode!')

## Training with BERT

In [None]:
model = TFBertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=5)
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
model.summary()

In [None]:
# Fine-tune the BERT classifier only when TRAINING is enabled; otherwise this cell
# just reports that the notebook is running in evaluation mode.
if TRAINING == True:
  # Sparse categorical loss/metric: the dataset labels are integer class ids.
  # from_logits=True tells the loss that the model outputs are un-normalised logits.
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

  history = model.fit(train_data, epochs=5, validation_data=validation_data)
  # Weights are written in TensorFlow checkpoint format under the prefix
  # 'v1_model_weights' in the current working directory.
  model.save_weights('v1_model_weights', save_format='tf')
else:
  print('Warning! Model in evaluation mode!')

## Loading trained model

In [None]:
if TRAINING == False:
  print('Model Evaluation Running!')
  !cp /gdrive/MyDrive/TweetSentiment/v1_model_weights.data-00000-of-00001 .
  !cp /gdrive/MyDrive/TweetSentiment/v1_model_weights.index .

  model.load_weights('v1_model_weights')

In [None]:
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=3e-5, epsilon=1e-08, clipnorm=1.0), 
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), 
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy('accuracy')])

In [None]:
model.summary()

In [None]:
y_pred = model.predict(validation_data)

In [None]:
y_pred_argmax = np.argmax(y_pred[0], axis=1)

In [None]:
from sklearn.metrics import classification_report
print('classification_report:\n', classification_report(y_true=np.array(X_test['LABEL_COLUMN']), y_pred=y_pred_argmax))

In [None]:
test_df = pd.read_csv('test.csv', encoding='iso-8859-1')

In [None]:
test_df.OriginalTweet = test_df.OriginalTweet.map(lambda tweet: process_tweet(tweet))

In [None]:
test_df.head()

In [None]:
test_df  = test_df[['OriginalTweet']]

In [None]:
test_df.head()

In [None]:
test_InputExamples = test_df.apply(lambda x: InputExample(guid=None, # Globally unique ID for bookkeeping, unused in this case
                                                          text_a = x['OriginalTweet'], label=1), axis = 1)

In [None]:
test_InputExamples.head()

In [None]:
test_data = convert_examples_to_tf_dataset(list(test_InputExamples), tokenizer).batch(32)

In [None]:
y_pred = model.predict(test_data)  

In [None]:
y_pred_argmax = np.argmax(y_pred[0], axis=1)

In [None]:
y_pred_argmax

In [None]:
predicted_sentiments = [reverse_mapping[i] for i in y_pred_argmax]

In [None]:
predicted_sentiments[:3]

In [None]:
final_df = pd.read_csv('test.csv', encoding='iso-8859-1')
final_df['Sentiment'] = predicted_sentiments

In [None]:
final_df.head()

In [None]:
submission_df = final_df[['UserName', 'Sentiment']]
submission_df.head()

In [None]:
submission_df.to_csv('submission1.csv', sep=',', index=False)

In [None]:
!kaggle competitions submit -c tweet-sentiment-analysis-ssn -f submission1.csv -m "API submission 1"

# Approach 2: Vanilla Transformer

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
from keras.callbacks import ModelCheckpoint

In [None]:
train_df = pd.read_csv('train.csv',encoding='iso-8859-1')

In [None]:
train_df.head()

In [None]:
train_df.Sentiment.unique()

In [None]:
mapping = {'Extremely Negative': 0, 'Negative': 1, 'Neutral': 2, 'Positive': 3, 'Extremely Positive': 4}
reverse_mapping = {i[1]:i[0] for i in mapping.items()}

In [None]:
train_df.Sentiment = train_df.Sentiment.map(lambda x: mapping.get(x))

In [None]:
train_df.head()

In [None]:
Counter(train_df.Sentiment)

In [None]:
fig, ax = plt.subplots()
ax.set_xlabel('Sentiment', fontsize=15)
ax.set_ylabel('Count', fontsize=15)

train_df.Sentiment.hist(ax=ax,grid=False, bins=20)
plt.show()

In [None]:
train_df.OriginalTweet = train_df.OriginalTweet.map(lambda original_tweet: process_tweet(original_tweet, get_tokens=False))
train_df.head()

In [None]:
print('Number of data points in training set: ', train_df.shape[0])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test = train_test_split(train_df, test_size=0.2, random_state=42)

In [None]:
X_train, y_train = X_train['OriginalTweet'], X_train['Sentiment']

In [None]:
X_train

In [None]:
y_train = tf.keras.utils.to_categorical(y_train, num_classes=5)

In [None]:
y_train

In [None]:
X_test, y_test = X_test['OriginalTweet'], X_test['Sentiment']

In [None]:
y_test = tf.keras.utils.to_categorical(y_test, num_classes=5)
y_test

In [None]:
X_train = list(X_train)
len(X_train), type(X_train)


In [None]:
t  = Tokenizer(num_words=20000, filters='"#$%&()*+,-./:;<=>?[\\]^_`{|}~\t\n\r', lower=True)

In [None]:
t.fit_on_texts(X_train)

In [None]:
print('Vocab size: ',t.num_words)

In [None]:
X_train = t.texts_to_sequences(X_train)

In [None]:
X_test = t.texts_to_sequences(X_test)

In [None]:
maxlen=150
X_train = keras.preprocessing.sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = keras.preprocessing.sequence.pad_sequences(X_test, maxlen=maxlen)

In [None]:
class TokenAndPositionEmbedding(layers.Layer):
    """Sum of a token embedding and a learned position embedding.

    Args:
        maxlen: Number of positions the position-embedding table holds.
        vocab_size: Number of rows in the token-embedding table.
        embed_dim: Size of each embedding vector.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return token embeddings of ``x`` plus the embeddings of positions 0..len-1."""
        # The sequence length is read from the last axis of the input at run time.
        seq_len = tf.shape(x)[-1]
        position_vectors = self.pos_emb(tf.range(start=0, limit=seq_len, delta=1))
        token_vectors = self.token_emb(x)
        # The position vectors are added to the token vectors of every sequence.
        return token_vectors + position_vectors

In [None]:
class TransformerBlock(layers.Layer):
    """One Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-blocks applies dropout to its output, adds the result to
    its input (residual connection) and then applies layer normalization.

    Args:
        embed_dim: Size of the input/output vectors; also used as the attention
            ``key_dim``.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden layer of the feed-forward network.
        rate: Dropout rate used after attention and after the feed-forward net.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor fed to self-attention as both query and value.
            training: Forwarded to the dropout layers. Defaults to None (the usual
                Keras convention) so the layer can also be invoked without an
                explicit flag instead of raising a TypeError.

        Returns:
            Tensor produced by the second layer normalization.
        """
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        # Feed-forward network applied to the normalized attention output.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

In [None]:
# Hyper-parameters of the vanilla Transformer classifier.
embed_dim = 128  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 1024  # Hidden layer size in feed forward network inside transformer
# The embedding table is sized by the Keras Tokenizer's num_words setting.
vocab_size = t.num_words
# Inputs are padded integer sequences of length maxlen.
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over the sequence axis to get one vector per tweet.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(512, activation="relu")(x)
x = layers.Dropout(0.5)(x)
# Five-way softmax, one unit per sentiment class.
outputs = layers.Dense(5, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-3)

In [None]:
model.compile(optimizer, "categorical_crossentropy", metrics=["accuracy"])

In [None]:
model.summary()

In [None]:
checkpoint = ModelCheckpoint(filepath='/gdrive/MyDrive/TweetSentiment/vanilla-weights.h5',
                             save_weights_only=True, 
                             monitor='val_accuracy',
                             verbose=1, 
                             save_best_only=True,
                             mode='max')

In [None]:
es = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [None]:
history = model.fit(
    X_train, y_train, batch_size=1024, epochs=100, validation_data=(X_test,y_test), callbacks=[es, checkpoint]
)

### Plotting graphs

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()


In [None]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
y_pred = model.predict(X_test)

In [None]:
y_pred_argmax = np.argmax(y_pred, axis=1)

In [None]:
y_test_argmax = np.argmax(y_test, axis=1)

In [None]:
print('Classification Report:\n', classification_report(y_true=y_test_argmax, y_pred=y_pred_argmax))

## Evaluating Model Performance on Test Data

In [None]:
# Rebuild a fresh, untrained copy of the Transformer classifier so that the saved
# checkpoint weights can be loaded into it in a later cell. These settings and the
# layer stack must stay identical to the model that produced the checkpoint.
embed_dim = 128  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 1024  # Hidden layer size in feed forward network inside transformer
vocab_size = t.num_words
inputs = layers.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim)
x = embedding_layer(inputs)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
x = transformer_block(x)
# Average over the sequence axis to get one vector per tweet.
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.5)(x)
x = layers.Dense(512, activation="relu")(x)
x = layers.Dropout(0.5)(x)
# Five-way softmax, one unit per sentiment class.
outputs = layers.Dense(5, activation="softmax")(x)

model = keras.Model(inputs=inputs, outputs=outputs)

In [None]:
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-3)

In [None]:
model.compile(optimizer, "categorical_crossentropy", metrics=["accuracy"])

In [None]:
model.load_weights('/gdrive/MyDrive/TweetSentiment/vanilla-weights.h5')

In [None]:
test_df = pd.read_csv('test.csv', encoding='iso-8859-1')

In [None]:
test_df.OriginalTweet = test_df.OriginalTweet.map(lambda tweet: process_tweet(tweet))

In [None]:
test_df.head()

In [None]:
X_val  = test_df['OriginalTweet']
X_val

In [None]:
X_val = t.texts_to_sequences(list(X_val))

In [None]:
maxlen=150
X_val = keras.preprocessing.sequence.pad_sequences(X_val, maxlen=maxlen)

In [None]:
y_val_pred = model.predict(X_val)

In [None]:
y_val_pred.shape

In [None]:
y_val_argmax = np.argmax(y_val_pred, axis = 1)

In [None]:
predicted_sentiments = [reverse_mapping[i] for i in y_val_argmax]

In [None]:
predicted_sentiments[:3]

In [None]:
final_df = pd.read_csv('test.csv', encoding='iso-8859-1')
final_df['Sentiment'] = predicted_sentiments

In [None]:
final_df.head()

In [None]:
submission_df = final_df[['UserName', 'Sentiment']]
submission_df.head()

In [None]:
submission_df.to_csv('submission2.csv', sep=',', index=False)

In [None]:
!kaggle competitions submit -c tweet-sentiment-analysis-ssn -f submission2.csv -m "API submission 2: Vanilla Transformer"