# Setup

In [64]:
!pip install transformers
!pip install datasets



In [65]:
import numpy as np
import sklearn
import gensim
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

In [66]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
# constants and global variables
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/CIL/Dataset/{}'
MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/CIL/Models/{}'
PROBABILITIES = '/content/drive/MyDrive/Colab Notebooks/CIL/Probabilities/{}'

# MODEL_NAME = "bert-base-cased"
MODEL_NAME =  "roberta-base"
# MODEL_NAME = "cardiffnlp/twitter-roberta-base-sentiment-latest"

# if set to true the trainig of the classifier models will be performed, otherwise the 
# models will be loaded from a file (if present)
is_train_enabled = True

# Read Preprocessed Data

In [83]:
def load_X_data(file_name):
  tweets = []
  with open(DATA_PATH.format(file_name), 'r', encoding='utf-8') as f:
      for line in f:
        tweets.append(line.rstrip())
  
  return np.array(tweets)

X_train = load_X_data("X_train_processed_bert.txt")
X_test = load_X_data("X_test_processed_bert.txt")

# swap url and user with @user and http
if MODEL_NAME == "cardiffnlp/twitter-roberta-base-sentiment-latest":
  X_train = list(map(lambda tweet : tweet.replace("<user>", "@user").replace("<url>", "http"), X_train))
  X_test = list(map(lambda tweet : tweet.replace("<user>", "@user").replace("<url>", "http"), X_test))

In [84]:
def load_y_data(file_name):
  labels = []
  with open(DATA_PATH.format(file_name), 'r', encoding='utf-8') as f:
      for line in f:
        labels.append(int(line.rstrip()))
  
  return np.array(labels)

y_train = load_y_data("y_train.txt")

In [85]:
for i in range(10):
  print(X_train[i])
  print(X_test[i])
  print(y_train[i])

<user> yes i didn't even realize he did ! i really wanted him to respond because i want to buy him candy or something
sea doo pro sea scooter ( sports with the portable sea-doo seascootersave air
1
bradly james lowrey is my bestfriend & & & he mean alot to me
<user> shucks well i work all week so now i can't come cheer you on ! oh and put those batteries in your calculator ! ! !
1
mckleinusa ashburn 15144 s series leather laptop case ( brown clean , front flap-over design with a secure key l ... <url>
i cant stay away from bug thats my baby
0
<user> next time ima come in yo class nd wake you up myself ! i wanted you to come out , so i can get my hug but you was sleep
<user> no ma'am ! ! ! lol im perfectly fine and not contagious anymore lmao
0
trivial pursuit for juniors ( second edition the second edition for juniors of the legendary trivial pursuit games <url>
whenever i fall asleep watching the tv
0
new  adds please pin : 28b9ead0
<user> he needs to get rid of that thing ! it scares

# BERT

In [86]:
from sklearn.model_selection import train_test_split

# split the model into training test and validation
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, random_state=33)

In [87]:
from datasets import Dataset

# go from list type to 'Dataset' type. This object is requiered to train the model
df = pd.DataFrame(X_train, columns =['Phrase'])
train = Dataset.from_pandas(df).add_column(name="Label", column=y_train)

df = pd.DataFrame(X_val, columns =['Phrase'])
val = Dataset.from_pandas(df).add_column(name="Label", column=y_val)

df = pd.DataFrame(X_test, columns =['Phrase'])
test = Dataset.from_pandas(df)

In [88]:
# perform tokenization of the dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize_function(sentence):
    return tokenizer(sentence['Phrase'], padding=True, truncation=True, max_length=30)

train = train.map(tokenize_function, batched=True)
test = test.map(tokenize_function, batched=True)
val = val.map(tokenize_function, batched=True)

for i in range(5):
  print(train[i])
  print(test[i])
  print(val[i])

  0%|          | 0/2250 [00:00<?, ?ba/s]

  0%|          | 0/10 [00:00<?, ?ba/s]

  0%|          | 0/250 [00:00<?, ?ba/s]

{'Phrase': "<user> can't wait until my sister have her baby . the 30th ain't coming fast enough . in excited for y'all", 'Label': 1, 'input_ids': [0, 41552, 12105, 15698, 64, 75, 2067, 454, 127, 2761, 33, 69, 1928, 479, 5, 389, 212, 18212, 75, 567, 1769, 615, 479, 11, 2283, 13, 1423, 108, 1250, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}
{'Phrase': 'sea doo pro sea scooter ( sports with the portable sea-doo seascootersave air', 'input_ids': [0, 16466, 109, 139, 1759, 3342, 2850, 15917, 36, 1612, 19, 5, 15295, 3342, 12, 417, 3036, 842, 8631, 22923, 4097, 935, 2, 1, 1, 1, 1, 1, 1, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0]}
{'Phrase': 'mid fielder christmas ornament ( royal velvet blue 2 5/8 " glass keepsake ornament free gift box included <url>', 'Label': 0, 'input_ids': [0, 16079, 16297, 29224, 13738, 40932, 36, 5754, 29986, 2440, 132, 195, 73, 398

In [89]:
# go from 'Dataset' type to tensorflow so that our dataset can be used for training in keras
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")

tf_train_dataset = train.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["Label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=64,
)

tf_val_dataset = val.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["Label"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=64,
)

tf_test_dataset = test.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=64,
)

In [90]:
from transformers import TFAutoModelForSequenceClassification

# Compute some variables needed to speed up training
train_steps_per_epoch = int(len(tf_train_dataset) * (100/100) / 64)
dev_steps_per_epoch = int(len(tf_val_dataset) * (100/100) / 64)


# download pre-trained model
model = TFAutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [91]:
if is_train_enabled:
  # train model
  model.fit(tf_train_dataset,
            validation_data=tf_val_dataset,
            epochs=3,
            verbose=2,
            steps_per_epoch=train_steps_per_epoch,
            validation_steps=dev_steps_per_epoch,)
  
  # save model to file
  model.save_pretrained(MODEL_PATH.format(MODEL_NAME.replace("/", "_")))

Epoch 1/3
549/549 - 155s - loss: 0.3800 - sparse_categorical_accuracy: 0.8261 - val_loss: 0.3400 - val_sparse_categorical_accuracy: 0.8519 - 155s/epoch - 283ms/step
Epoch 2/3
549/549 - 137s - loss: 0.3464 - sparse_categorical_accuracy: 0.8462 - val_loss: 0.3320 - val_sparse_categorical_accuracy: 0.8614 - 137s/epoch - 250ms/step
Epoch 3/3
549/549 - 137s - loss: 0.3346 - sparse_categorical_accuracy: 0.8517 - val_loss: 0.3292 - val_sparse_categorical_accuracy: 0.8566 - 137s/epoch - 250ms/step


In [None]:
# load trained model from file
model.load_weights(MODEL_PATH.format(MODEL_NAME.replace("/", "_") + "/tf_model.h5"))

In [92]:
from scipy.special import softmax


y_pred = model.predict(tf_test_dataset)[0]
if MODEL_NAME == "cardiffnlp/twitter-roberta-base-sentiment-latest" or MODEL_NAME == "roberta-base":
  y_pred = list(map(lambda x : softmax(x) , y_pred))

# write prediction probabilites to a file. These probabilites will be used to create
# an ensemble of models
def write_output_probas(file_name, Y):
  f  = open(file_name, "w")
  f.write("Id,Prediction\n")
  id = 1
  for y in Y:
    f.write(str(id) + "," + str(list(y)) + "\n")
    id = id + 1
  f.close()

write_output_probas(PROBABILITIES.format(MODEL_NAME.replace("/", "_") + "_predictions_probas.csv"), y_pred)


# write predictions to file
y_pred = np.argmax(np.array(y_pred), axis=-1)
def write_output(file_name, Y):
  f  = open(file_name, "w")
  f.write("Id,Prediction\n")
  id = 1
  for y in Y:
    if y == 0:
      y = -1
    f.write(str(id) + "," + str(y) + "\n")
    id = id + 1
  f.close()

write_output(DATA_PATH.format(MODEL_NAME.replace("/", "_") + "_predictions.csv"), y_pred)