# **NLP - Sentiment Analysis Division Sigma Competition (Our Efforts)**

## *1.0 Importing necessary libraries*

In [None]:
# Standard imports - sklearn and transformer imports will be handled later
import tensorflow as tf
from tensorflow import keras
from google.colab import drive
import os
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import pandas as pd

  import pandas.util.testing as tm


### 1.1 Import and save Training Data from Google Drive

In [None]:
# Open Colab's file-upload widget; the returned mapping is indexed by filename
# further down to obtain the uploaded file's raw bytes.
from google.colab import files
data_to_load = files.upload()

Saving training_data.csv to training_data.csv


In [None]:
import io

# Decode the uploaded bytes as UTF-8 text and let pandas parse that text as CSV.
with io.StringIO(data_to_load["training_data.csv"].decode("utf-8")) as csv_buffer:
    df = pd.read_csv(csv_buffer)

In [None]:
# We pickle the dataframe to make it easier to reload for later use.
# NOTE(review): the file is written in pickle format despite its '.csv' extension.
df.to_pickle('train-df.csv') 

In [None]:
# Reload the dataframe from the pickle file 'train-df.csv' (pickle format, not CSV).
df = pd.read_pickle('train-df.csv')

We define a small helper function (applied with a lambda) that adds a readable sentiment label to the data frame for visualizations.

### 1.2 Setting Up Sentiments Label and Data Frame

In [None]:
#Uses the sentiment reading in order to add a sentiment label to the data frame
def encoded_to_label(sentiment_encode):
  """Translate an encoded sentiment value into its display label.

  Returns "Positive" when the encoding equals 1 and "Negative" for any
  other value.
  """
  return "Positive" if sentiment_encode == 1 else "Negative"

# Derive the readable label column from the numeric "Sentiment" column.
df["Sentiment Label"] = df["Sentiment"].apply(encoded_to_label)

### 1.3 Displaying Data before Preprocessing

In [None]:
# Display the data frame (pandas renders a truncated head/tail view for a frame this large).
df

Unnamed: 0,ID,User,Text,Sentiment,Sentiment Label
0,864192,Carly_FTS,I *heart* filling up @dennisschaub desk 1 it...,1,Positive
1,523691,Open_Sourcing,"#SocioMat - people create prettier, younger an...",1,Positive
2,584154,xxcharlx,no way i dont want the tour to end,0,Negative
3,1527961,andreapuddu,@HemalRadia Hi Amazing Brother! Sending Limitl...,1,Positive
4,28609,umbec,@flockmaster they are chocolate,1,Positive
...,...,...,...,...,...
999995,1366175,b13thy,@midderhonz i'm good.. off to buy an electric ...,1,Positive
999996,681828,HeyyitsALison,@StaceyPaha i know..for youngerr boys..what am...,1,Positive
999997,488988,sleepycove,I can't belive it I just got asked for an auto...,1,Positive
999998,985613,AmyyXD,i am putting my bb in the fridge so it cant di...,0,Negative


In [None]:
# Checking for empty values: info() lists the non-null count and dtype of every column
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 5 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   ID               1000000 non-null  int64 
 1   User             1000000 non-null  object
 2   Text             1000000 non-null  object
 3   Sentiment        1000000 non-null  int64 
 4   Sentiment Label  1000000 non-null  object
dtypes: int64(2), object(3)
memory usage: 38.1+ MB


## *2.0 Preprocessing Data*

### 2.1 Formatting the Text for the BERT or DistilBERT Model

In [None]:
# Useful libraries
import re 
import unicodedata 
import string

# Removing punctuation with regular expressions
def clean_sentence(text):
    """Normalize a raw text string and strip punctuation and noise from it.

    Steps, applied in this order:
      1. Unicode NFKD normalization.
      2. Trim leading/trailing commas and spaces, then leading/trailing
         tabs and newlines.
      3. Remove anything enclosed in square brackets (non-greedy).
      4. Remove every ASCII punctuation character.
      5. Remove every run of word characters that contains a digit.
      6. Remove curly quotes and the ellipsis character.
      7. Remove any remaining newline characters.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    text = unicodedata.normalize('NFKD', text)
    # str.strip() treats its argument as a set of characters, so this trims
    # any mix of leading/trailing commas and spaces.
    text = text.strip(', ')
    text = text.strip('\t\n')
    # Raw strings keep regex escapes such as \[ \w \d from being parsed as
    # (invalid) Python string escapes.
    text = re.sub(r'\[.*?\]', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub(r'\w*\d\w*', '', text)
    text = re.sub('[‘’“”…]', '', text)
    text = re.sub(r'\n', '', text)
    return text

In [None]:
# Build the modelling frame: the cleaned text next to its sentiment value
df_revised = pd.DataFrame({
    'Text': df['Text'].apply(clean_sentence),
    'Sentiment': df['Sentiment'],
})

In [None]:
# Before we begin encoding the text, we need to split it up into a training and validation set
# We use scikit-learn's train_test_split for this. A fixed random_state makes the
# 80/20 split reproducible across kernel restarts, so results can be compared between runs.
from sklearn.model_selection import train_test_split
train_df, valid_df = train_test_split(df_revised, test_size=0.2, random_state=42)

In [None]:
# Necessary pip command to use HuggingFace's transformer library and toolkit
# (the version is pinned to 3.0.0)
!pip install transformers==3.0.0

Collecting transformers==3.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/9c/35/1c3f6e62d81f5f0daff1384e6d5e6c5758682a8357ebc765ece2b9def62b/transformers-3.0.0-py3-none-any.whl (754kB)
[K     |████████████████████████████████| 757kB 4.8MB/s 
[?25hCollecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 23.2MB/s 
Collecting tokenizers==0.8.0-rc4
[?25l  Downloading https://files.pythonhosted.org/packages/e8/bd/e5abec46af977c8a1375c1dca7cb1e5b3ec392ef279067af7f6bc50491a0/tokenizers-0.8.0rc4-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
[K     |████████████████████████████████| 3.0MB 40.8MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (

Before we feed the data to the BERT or DistilBERT model, we must preprocess it. First, we must tokenize the data, add start-of-sentence and end-of-sentence tokens, pad the data, and mask it. This will be done with the transformer tokenizer's `batch_encode_plus` method, which enables us to do all of this at once.
Then, we must convert the encoded data into TensorFlow datasets (section 2.3).

### 2.2 Tokenizing, Padding, and Masking

In [71]:
# Necessary imports
import transformers 
from transformers import BertTokenizer, TFBertModel

# Name of the pretrained checkpoint whose tokenizer is loaded below
BERT_MODEL = 'bert-base-cased'
 
# Sequence length in tokens; keep in sync with the `testlen` passed to encode_data
test = 160

 # Using the names of the available models from the pretrained_models documentation
tokenizer = BertTokenizer.from_pretrained(BERT_MODEL)

# To silence Colab warnings: only messages at ERROR level or above are logged
import logging
logging.basicConfig(level=logging.ERROR)

# Encoding the data
def encode_data(text, tokenizer, testlen=160):
  """Tokenize a batch of texts into fixed-length sequences of token ids.

  Args:
    text: The batch of strings to encode.
    tokenizer: An object exposing `batch_encode_plus` (here a HuggingFace
      tokenizer).
    testlen: Length, in tokens, that every sequence is padded or truncated to.

  Returns:
    A numpy array built from the tokenizer's `input_ids`.
  """
  encoded = tokenizer.batch_encode_plus(
      text,
      add_special_tokens=True,
      return_attention_mask=True,
      return_token_type_ids=False,
      # Explicit padding/truncation (transformers >= 3.0) replaces the
      # deprecated pad_to_max_length flag and the implicit truncation.
      padding='max_length',
      truncation=True,
      # Use the function's own argument; reading a module-level variable here
      # would silently ignore the `testlen` passed by the caller.
      max_length=testlen,
  )
  # NOTE(review): only the input ids are returned; the attention mask computed
  # by the tokenizer is discarded here - confirm that is intended.
  return np.array(encoded['input_ids'])

# Encode the training split, then the validation split, with the same tokenizer and length
train_encoded_data, valid_encoded_data = (
    encode_data(split_df.Text.values, tokenizer, testlen=160)
    for split_df in (train_df, valid_df)
)

### 2.3 Converting to Tensorflow Dataset

We take the encoded arrays and labels produced previously and convert them to TensorFlow datasets with the `from_tensor_slices` method.

In [72]:
# Integer sentiment labels for each split
y_train = train_df.Sentiment.values
y_valid = valid_df.Sentiment.values

BATCH_SIZE = 32

# Training pipeline: repeats indefinitely, so model.fit must be given
# steps_per_epoch to know where an epoch ends.
ds_train = (
    tf.data.Dataset
    .from_tensor_slices((train_encoded_data, y_train))
    .repeat()
    .shuffle(512)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation pipeline: finite and in a fixed order. An infinitely repeated
# validation set makes the end-of-epoch validation pass never terminate unless
# validation_steps is supplied, and shuffling does not change the metrics.
ds_valid = (
    tf.data.Dataset
    .from_tensor_slices((valid_encoded_data, y_valid))
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

## *3.0 Training the Model*

The BERT model's training time came out to about 300+ hours for us for some reason, and we spent a very long time trying to fix it but weren't able to. A lot of PAIN :( We decided to change the model to DistilBERT; however, the training was still way too long at about 4+ hours per epoch. This is weird even for the DistilBERT model, as it should be possible to train it fully in under an hour on this training data set. We tried our best to cut down the training time but kept getting training errors just as an epoch was about to finish. To avoid the training errors we also added checkpoints, but those didn't end up working lol!!!! In the end we tried our best, and even though we weren't able to succeed, we managed to learn a lot about NLP models and their implementation.

In [None]:
 # Setting up the checkpoint callback for the model's training.
 # save_weights_only=True: whole-model saving to HDF5 is only supported for
 # Functional/Sequential Keras models, so a subclassed model (the summary's
 # "multiple" output shapes suggest this one is) must checkpoint its weights.
 # save_best_only=True keeps only the checkpoint with the best monitored value.
 checkpoint_cb = keras.callbacks.ModelCheckpoint(
     'sigma_bert_model.h5',
     save_best_only=True,
     save_weights_only=True,
 )

Implementing the DistilBERT model after the BERT model didn't work.

In [None]:
from transformers import TFDistilBertForSequenceClassification

# Pretrained DistilBERT with a sequence-classification head on top
model = TFDistilBertForSequenceClassification.from_pretrained('distilbert-base-cased')

# Adam optimizer and a sparse categorical cross-entropy loss that consumes raw logits
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(
    loss=loss,
    optimizer=optimizer,
    metrics=['accuracy'],
)

In [None]:
# DistilBERT model summary

In [None]:
# Print each layer with its parameter count, followed by the total / trainable parameter totals
model.summary()

Model: "tf_distil_bert_for_sequence_classification_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
distilbert (TFDistilBertMain multiple                  65190912  
_________________________________________________________________
pre_classifier (Dense)       multiple                  590592    
_________________________________________________________________
classifier (Dense)           multiple                  1538      
_________________________________________________________________
dropout_139 (Dropout)        multiple                  0         
Total params: 65,783,042
Trainable params: 65,783,042
Non-trainable params: 0
_________________________________________________________________


In [None]:
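# Uncomment one of the lines below to restore the checkpoint
# This sadly didn't end up working :(
# model = keras.models.load_model('sigma_bert_model.h5')   # if a full model was saved
# model.load_weights('sigma_bert_model.h5')                # if only the weights were saved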

We tried to train the model below, but due to the long training times either human errors would happen or the training would just stop working after one epoch. We spent a long time trying to change the hyperparameters, but only the base model trained properly for at least one epoch before breaking. PAIN :(

In [None]:
# Step counts must be whole numbers, so use integer division, and derive them
# from the actual split sizes rather than a hard-coded row count.
number_steps = max(1, len(train_df) // BATCH_SIZE)
validation_steps = max(1, len(valid_df) // BATCH_SIZE)

history = model.fit(
    ds_train,
    steps_per_epoch=number_steps,
    validation_data=ds_valid,
    # Bounding the validation pass is essential if ds_valid repeats forever;
    # without it the end-of-epoch validation would never terminate.
    validation_steps=validation_steps,
    epochs=3,
    callbacks=[checkpoint_cb],
)