# Task 3

In [1]:
#packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# load data into df
df = pd.read_csv('train.csv')
df.head()

Unnamed: 0,rating,reviewText,summary
0,3,Plot Storyline: 5 StarsThis novel accomplished...,3 1/4 Stars
1,3,I did not like how EL ended this one. I don't ...,"It was going great, then just.... ended"
2,5,I love how old fashioned this family is - they...,LOVED ALL 4!
3,5,I loved this story - It's about two friends wh...,friends make the best lovers
4,1,"In the Dark Lands, a virus killed all possibil...",Blatantly sexist and homophobic


## Text Process

In [3]:
#packages for nlp
import re
import nltk
import string
##download support file 
#nltk.download('punkt')
#nltk.download('wordnet')
#nltk.download('vader_lexicon')

In [4]:
#data cleaning
df['processed_review'] = df['reviewText'].copy().str.lower()
df['processed_summary'] = df['summary'].copy().str.lower()

##remove stop word
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
sw = stopwords.words('english')
df['processed_review'] = df['processed_review'].apply(lambda x: " ".join([x for x in word_tokenize(x) if x not in sw]))
df['processed_summary'] = df['processed_summary'].apply(lambda x: " ".join([x for x in word_tokenize(x) if x not in sw]))

##remove extra spaces
df['processed_review']=df['processed_review'].apply(lambda x: re.sub(' +', ' ', x))
df['processed_summary']=df['processed_summary'].apply(lambda x: re.sub(' +', ' ', x))

##remove all punctuation
df['processed_review'] = df['processed_review'].apply(lambda i: i.translate(str.maketrans('', '', string.punctuation)))
df['processed_summary'] = df['processed_summary'].apply(lambda i: i.translate(str.maketrans('', '', string.punctuation)))

##Lemmatzing
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
df['processed_review']=df['processed_review'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in x.split()]))
df['processed_summary']=df['processed_summary'].apply(lambda x: " ".join([lemmatizer.lemmatize(w) for w in x.split()]))

In [5]:
#data for bert
data_bert = df.iloc[:, 0:2]


data_bert['rating'] = data_bert['rating'].replace([1],'1-Star')
data_bert['rating'] = data_bert['rating'].replace([2],'2-Stars')
data_bert['rating'] = data_bert['rating'].replace([3],'3-Stars')
data_bert['rating'] = data_bert['rating'].replace([4],'4-Stars')
data_bert['rating'] = data_bert['rating'].replace([5],'5-Stars')

data_bert

Unnamed: 0,rating,reviewText
0,3-Stars,Plot Storyline: 5 StarsThis novel accomplished...
1,3-Stars,I did not like how EL ended this one. I don't ...
2,5-Stars,I love how old fashioned this family is - they...
3,5-Stars,I loved this story - It's about two friends wh...
4,1-Star,"In the Dark Lands, a virus killed all possibil..."
...,...,...
8995,1-Star,From the description I was expecting a bit of ...
8996,5-Stars,Heather is the human mate of Cael and Riyu. W...
8997,1-Star,I thought the blurb and free sample were good ...
8998,2-Stars,this is one of the most superficial book i hav...


In [6]:
#train validation split 60-20-20
from sklearn.model_selection import train_test_split

tv_indices, test_indices = train_test_split(np.array(df.index), test_size=0.2, random_state=23)
train_indices, valid_indices = train_test_split(tv_indices, test_size=0.25, random_state=23)

data_train = data_bert.loc[train_indices].copy()
data_valid = data_bert.loc[valid_indices].copy()
data_test = data_bert.loc[test_indices].copy() 

# Modeling

## Bert

In [7]:
#install required packages
!pip install transformers



In [8]:
#import required packages
import torch
import numpy as np
from transformers import BertTokenizer


#load tokenizer and create a dictionary of the labels to train the model
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
labels = {'1-Star':0,
          '2-Stars':1,
          '3-Stars':2,
          '4-Stars':3,
          '5-Stars':4
          }

class Dataset(torch.utils.data.Dataset):

    def __init__(self, df):

        self.labels = [labels[label] for label in df['rating']]
        self.texts = [tokenizer(text, 
                               padding='max_length', max_length = 256, truncation=True,
                                return_tensors="pt") for text in df['reviewText']]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        # Fetch a batch of inputs
        return self.texts[idx]

    def __getitem__(self, idx):

        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [9]:
#build the model to be used, dropout at 0.25, middle layer has no activation, final layer is using ReLU
from torch import nn
from transformers import BertModel

class BertClassifier(nn.Module):

    def __init__(self, dropout=0.25):

        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-cased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 5)
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):

        _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)

        return final_layer

In [10]:
#create the loop to train our data, this is a standard Pytorch training loop
from torch.optim import Adam
from tqdm import tqdm

def train(model, train_data, val_data, learning_rate, epochs):

    train, val = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train, batch_size=2, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val, batch_size=2)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr= learning_rate)

    if use_cuda:

            model = model.cuda()
            criterion = criterion.cuda()

    for epoch_num in range(epochs):

            total_acc_train = 0
            total_loss_train = 0

            for train_input, train_label in tqdm(train_dataloader):

                train_label = train_label.to(device)
                mask = train_input['attention_mask'].to(device)
                input_id = train_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                
                batch_loss = criterion(output, train_label.type(torch.LongTensor))
                total_loss_train += batch_loss.item()
                
                acc = (output.argmax(dim=1) == train_label).sum().item()
                total_acc_train += acc

                model.zero_grad()
                batch_loss.backward()
                optimizer.step()
            
            total_acc_val = 0
            total_loss_val = 0

            with torch.no_grad():

                for val_input, val_label in val_dataloader:

                    val_label = val_label.to(device)
                    mask = val_input['attention_mask'].to(device)
                    input_id = val_input['input_ids'].squeeze(1).to(device)

                    output = model(input_id, mask)

                    batch_loss = criterion(output, val_label.type(torch.LongTensor))
                    total_loss_val += batch_loss.item()
                    
                    acc = (output.argmax(dim=1) == val_label).sum().item()
                    total_acc_val += acc
            
            print(
                f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .4f} \
                | Train Accuracy: {total_acc_train / len(train_data): .4f} \
                | Val Loss: {total_loss_val / len(val_data): .4f} \
                | Val Accuracy: {total_acc_val / len(val_data): .4f}')
                  

In [12]:
#train the model, 5 epochs and learning rate of 0.000001
EPOCHS = 5
model = BertClassifier()
LR = 1e-6
              
train(model, data_train, data_valid, LR, EPOCHS)

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|████████████████████████████████████████████████████████████████████████████| 2700/2700 [2:50:43<00:00,  3.79s/it]
  0%|                                

Epochs: 1 | Train Loss:  0.7202                 | Train Accuracy:  0.3641                 | Val Loss:  0.6220                 | Val Accuracy:  0.4694


100%|████████████████████████████████████████████████████████████████████████████| 2700/2700 [2:53:58<00:00,  3.87s/it]
  0%|                                                                                         | 0/2700 [00:00<?, ?it/s]

Epochs: 2 | Train Loss:  0.5897                 | Train Accuracy:  0.5074                 | Val Loss:  0.5694                 | Val Accuracy:  0.5278


100%|████████████████████████████████████████████████████████████████████████████| 2700/2700 [2:53:07<00:00,  3.85s/it]
  0%|                                                                                         | 0/2700 [00:00<?, ?it/s]

Epochs: 3 | Train Loss:  0.5321                 | Train Accuracy:  0.5685                 | Val Loss:  0.5489                 | Val Accuracy:  0.5433


100%|████████████████████████████████████████████████████████████████████████████| 2700/2700 [2:53:09<00:00,  3.85s/it]
  0%|                                                                                         | 0/2700 [00:00<?, ?it/s]

Epochs: 4 | Train Loss:  0.4835                 | Train Accuracy:  0.6257                 | Val Loss:  0.5443                 | Val Accuracy:  0.5383


100%|████████████████████████████████████████████████████████████████████████████| 2700/2700 [2:53:15<00:00,  3.85s/it]


Epochs: 5 | Train Loss:  0.4339                 | Train Accuracy:  0.6783                 | Val Loss:  0.5564                 | Val Accuracy:  0.5317


In [None]:
#we see that the best result is in epoch 3, with a validation accuracy of 0.5433

In [63]:
#create a function to evaluate test data. This will let us know how we perform out of sample
def evaluate(model, test_data):

    test = Dataset(test_data)
    test_dataloader = torch.utils.data.DataLoader(test, batch_size=1)

    use_cuda = torch.cuda.is_available()
    device = torch.device("cuda" if use_cuda else "cpu")

    if use_cuda:

        model = model.cuda()

    total_acc_test = 0
    output_array = [] 
    with torch.no_grad():

        for test_input, test_label in test_dataloader:

            test_label = test_label.to(device)
            mask = test_input['attention_mask'].to(device)
            input_id = test_input['input_ids'].squeeze(1).to(device)

            output = model(input_id, mask)

            acc = (output.argmax(dim=1) == test_label).sum().item()
            total_acc_test += acc
            output_array.append(output.argmax(dim=1))
        print(f'Test Accuracy: {total_acc_test / len(test_data): .4f}')
    return output_array
    
results = evaluate(model, data_test)

Test Accuracy:  0.5233


In [80]:
results2 = results

This code is modified from https://towardsdatascience.com/text-classification-with-bert-in-pytorch-887965e5820f

In [102]:
##save the results into a csv and reload as a dataframe to run confusion matrix
import csv 
file = open('results.csv', 'w+', newline ='') 
header = ['rating']
with file:     
    write = csv.writer(file) 
    write.writerow(header)
    write.writerows(results2)

In [107]:
bert_results = pd.read_csv('results.csv')

In [108]:
bert_results

Unnamed: 0,rating
0,tensor(3)
1,tensor(0)
2,tensor(3)
3,tensor(1)
4,tensor(4)
...,...
1795,tensor(3)
1796,tensor(3)
1797,tensor(3)
1798,tensor(4)


In [120]:
bert_results['rating'] = bert_results['rating'].replace('tensor(0)',0)
bert_results['rating'] = bert_results['rating'].replace('tensor(1)',1)
bert_results['rating'] = bert_results['rating'].replace('tensor(2)',2)
bert_results['rating'] = bert_results['rating'].replace('tensor(3)',3)
bert_results['rating'] = bert_results['rating'].replace('tensor(4)',4)

In [121]:
bert_results

Unnamed: 0,rating
0,3
1,0
2,3
3,1
4,4
...,...
1795,3
1796,3
1797,3
1798,4


In [124]:
data_test['rating'] = data_test['rating'].replace('1-Star',0)
data_test['rating'] = data_test['rating'].replace('2-Stars',1)
data_test['rating'] = data_test['rating'].replace('3-Stars',2)
data_test['rating'] = data_test['rating'].replace('4-Stars',3)
data_test['rating'] = data_test['rating'].replace('5-Stars',4)

data_test

Unnamed: 0,rating,reviewText
5757,2,I purchased this book for my kindle for 2 reas...
3911,0,Nah...sorry I started reading it. Underwritten...
1879,3,I loved the characters and stories. Each one w...
7807,1,I was all set to like or to love this story. I...
1235,4,This is another of the old classics that Amazo...
...,...,...
3508,3,This book wasn't bad. I like how the writer g...
7911,1,"The book almost read like an episode of ""Leave..."
6845,3,This series is quite interesting...i've never ...
1553,4,It's a good thing Ms. Crusie has written so pr...


In [132]:
from sklearn.metrics import classification_report
y_true = np.array(data_test['rating'])
y_pred = np.array(bert_results['rating'])
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.61      0.67      0.64       340
           1       0.43      0.51      0.47       292
           2       0.00      0.00      0.00       252
           3       0.43      0.73      0.54       436
           4       0.74      0.51      0.61       480

    accuracy                           0.52      1800
   macro avg       0.44      0.48      0.45      1800
weighted avg       0.49      0.52      0.49      1800



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
from sklearn.metrics import confusion_matrix

In [133]:
print(confusion_matrix(y_true, y_pred))

[[227  92   0  16   5]
 [101 149   0  41   1]
 [ 26  72   0 150   4]
 [ 13  27   0 320  76]
 [  7   6   0 221 246]]
