# Setup

In [None]:
!pip install transformers
!pip install datasets

In [None]:
import numpy as np
import sklearn
import gensim
import matplotlib.pyplot as plt
import pandas as pd
import tensorflow as tf

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset and model
# path templates defined in this notebook point below /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# constants and global variables
# Path templates: the '{}' placeholder is filled with a file name via str.format.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/CIL/Dataset/{}'
MODEL_PATH = '/content/drive/MyDrive/Colab Notebooks/CIL/Models/{}'

# If set to True, the training cell fits the classifier and saves it under MODEL_PATH.
# The weights are loaded from that saved file afterwards in either case, so with
# False a previously saved model must already be present.
is_train_enabled = True

# Read Preprocessed Data

In [None]:
def load_X_data(file_name):
  """Read a preprocessed tweet file into a 1-D array of token lists.

  Every line of the file is one tweet; it is stripped of trailing whitespace
  and split on whitespace into tokens.

  Args:
    file_name: name of the file, resolved through the DATA_PATH template.

  Returns:
    A 1-D numpy array of dtype=object holding one list of tokens per line.
  """
  with open(DATA_PATH.format(file_name), 'r', encoding='utf-8') as f:
    tweets = [line.rstrip().split() for line in f]

  # Tweets differ in length, so np.array(tweets) would have to build a ragged
  # array, which NumPy deprecated and newer releases reject with a ValueError.
  # Filling a pre-allocated object array always gives shape (n_tweets,),
  # independent of the individual tweet lengths.
  result = np.empty(len(tweets), dtype=object)
  for idx, tokens in enumerate(tweets):
    result[idx] = tokens
  return result

X_train = load_X_data("X_train_processed.txt")
X_test = load_X_data("X_test_processed.txt")

  import sys


In [None]:
def load_y_data(file_name):
  """Read a label file containing one integer label per line.

  Args:
    file_name: name of the file, resolved through the DATA_PATH template.

  Returns:
    A numpy array with the labels in file order.
  """
  labels_path = DATA_PATH.format(file_name)
  with open(labels_path, 'r', encoding='utf-8') as f:
    parsed = [int(raw_line.rstrip()) for raw_line in f]
  return np.array(parsed)

y_train = load_y_data("y_train.txt")

In [None]:
# Sanity check: show the first ten entries of each loaded array, interleaved
# as train tweet, test tweet, train label.
for sample_idx in range(10):
  for split in (X_train, X_test, y_train):
    print(split[sample_idx])

['yes', 'even', 'realize', 'really', 'wanted', 'respond', 'want', 'buy', 'candy', 'something']
['sea', 'doo', 'pro', 'sea', 'scooter', 'sport', 'portable', 'sea', 'doo', 'seascootersave', 'air']
1
['bradly', 'james', 'lowrey', 'bestfriend', 'mean', 'alot', 'justthoughtidleteveryoneknow']
['shuck', 'well', 'work', 'week', 'come', 'cheer', 'oh', 'put', 'battery', 'calculator']
1
['mckleinusa', 'ashburn', 'series', 'leather', 'laptop', 'case', 'brown', 'clean', 'front', 'flap', 'design', 'secure', 'key', 'l']
['cant', 'stay', 'away', 'bug', 'thats', 'baby']
0
['next', 'time', 'ima', 'come', 'yo', 'class', 'nd', 'wake', 'wanted', 'come', 'get', 'hug', 'sleep']
['lol', 'im', 'perfectly', 'fine', 'contagious', 'anymore', 'lmao']
0
['trivial', 'pursuit', 'junior', 'second', 'edition', 'second', 'edition', 'junior', 'legendary', 'trivial', 'pursuit', 'game']
['whenever', 'fall', 'asleep', 'watching', 'tv']
0
['new', 'bbm', 'add', 'please', 'pin']
['need', 'get', 'rid', 'thing', 'scare', 'lol',

# BERT

In [None]:
from sklearn.model_selection import train_test_split

# Split the labelled data into a training part and a validation part
# (test_size=0.10 holds out 10% for validation; random_state fixes the split).
# Note: X_train / y_train are overwritten here, so running this cell a second
# time without reloading the data splits the already reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.10, random_state=33)

In [None]:
from datasets import Dataset

# Go from array type to 'Dataset' type. This object is required to train the model.
# Each split gets a 'Phrase' column holding the tweets; the training and
# validation splits additionally get a 'Label' column. The test split has no labels.
df = pd.DataFrame(X_train, columns =['Phrase'])
train = Dataset.from_pandas(df).add_column(name="Label", column=y_train)

df = pd.DataFrame(X_val, columns =['Phrase'])
val = Dataset.from_pandas(df).add_column(name="Label", column=y_val)

df = pd.DataFrame(X_test, columns =['Phrase'])
test = Dataset.from_pandas(df)

In [None]:
# perform tokenization of the dataset for BERT models
from transformers import AutoTokenizer


tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_function(sentence):
    """Tokenize a batch of pre-split tweets for BERT.

    Args:
        sentence: a batch handed over by Dataset.map(batched=True);
            sentence['Phrase'] holds one list of word tokens per tweet.

    Returns:
        The tokenizer output for the batch, with every sequence truncated or
        padded to exactly 30 tokens.
    """
    return tokenizer(
        sentence['Phrase'],
        # Every tweet is already a list of words. Without this flag a list of
        # strings is ambiguous for the tokenizer (batch of texts / text pair).
        is_split_into_words=True,
        # Pad to the fixed max_length instead of the longest tweet of the
        # current map batch, so all rows of the dataset have the same length.
        padding="max_length",
        truncation=True,
        max_length=30,
    )

train = train.map(tokenize_function, batched=True)
test = test.map(tokenize_function, batched=True)
val = val.map(tokenize_function, batched=True)

# Show a few tokenized examples of every split as a sanity check.
for i in range(5):
  print(train[i])
  print(test[i])
  print(val[i])

In [None]:
# Convert the tokenized 'Dataset' objects into TensorFlow datasets that Keras can consume.
from transformers import DefaultDataCollator

data_collator = DefaultDataCollator(return_tensors="tf")


def _to_tf(dataset, label_cols=None):
    """Convert `dataset` with the settings shared by all three splits.

    `label_cols` is forwarded only when given, so the unlabeled test split is
    converted without a label column.
    """
    options = dict(
        columns=["attention_mask", "input_ids", "token_type_ids"],
        shuffle=False,
        collate_fn=data_collator,
        batch_size=64,
    )
    if label_cols is not None:
        options["label_cols"] = label_cols
    return dataset.to_tf_dataset(**options)


tf_train_dataset = _to_tf(train, label_cols=["Label"])
tf_val_dataset = _to_tf(val, label_cols=["Label"])
tf_test_dataset = _to_tf(test)


In [None]:
from transformers import TFAutoModelForSequenceClassification

# Compute some variables needed to speed up training
# They are passed to model.fit as steps_per_epoch / validation_steps.
# NOTE(review): both datasets were already created with batch_size=64. If len()
# of such a dataset counts batches rather than samples, the extra "/ 64" makes
# every epoch cover only about 1/64 of the data — confirm this is intended.
train_steps_per_epoch = int(len(tf_train_dataset) * (100/100) / 64)
dev_steps_per_epoch = int(len(tf_val_dataset) * (100/100) / 64)


# download pre-trained model
# NOTE(review): num_labels=5, while the labels printed earlier in this notebook
# are only 0 and 1 — confirm. Changing it would presumably make weights saved
# with the current head size unusable.
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased", num_labels=5)

# Adam optimizer; the loss is computed on raw logits (from_logits=True) with
# integer class labels, and accuracy is tracked with the matching sparse metric.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=tf.metrics.SparseCategoricalAccuracy(),
)

In [None]:
# Fine-tune the model only when training is enabled, then save it under
# MODEL_PATH as "bert_1". The step counts computed in the model cell limit how
# many batches are used per epoch for training and for validation.
if is_train_enabled:
  model.fit(tf_train_dataset,
            validation_data=tf_val_dataset,
            epochs=20,
            verbose=2,
            steps_per_epoch=train_steps_per_epoch,
            validation_steps=dev_steps_per_epoch,)
  
  model.save_pretrained(MODEL_PATH.format("bert_1"))

In [None]:
# load trained model from file
# This runs whether or not training was enabled; it expects the file
# bert_1/tf_model.h5 under MODEL_PATH (presumably the one written by
# save_pretrained in the training cell — confirm).
model.load_weights(MODEL_PATH.format("bert_1/tf_model.h5"))

In [None]:
# Run the model on the test set, then take for every tweet the index of the
# largest value along the last axis of the first prediction output
# (presumably the logits — confirm).
y_pred = model.predict(tf_test_dataset)
y_pred = np.argmax(np.array(y_pred[0]), axis=-1)


def write_output(file_name, Y):
  """Write predictions as a two-column CSV file with the header "id,y".

  Args:
    file_name: path of the file to create (an existing file is overwritten).
    Y: iterable of predicted labels; each row id is the label's 0-based position.
  """
  # `with` closes the file even if a write fails; enumerate supplies the row
  # ids without shadowing the built-in `id`.
  with open(file_name, "w") as f:
    f.write("id,y\n")
    for row_id, y in enumerate(Y):
      f.write(str(row_id) + "," + str(y) + "\n")

# Store the predictions next to the dataset files (DATA_PATH is the path template
# defined in the constants cell; `DATA.PATH` was a typo that raised a NameError).
write_output(DATA_PATH.format("bert_1_predictions.txt"), y_pred)

