In [1]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from torch.utils.data import DataLoader, TensorDataset
import torch

In [2]:

import pandas as pd
from wordcloud import WordCloud
import seaborn as sns
import re
import string
from collections import Counter, defaultdict

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from plotly.offline import plot

import matplotlib.gridspec as gridspec
from matplotlib.ticker import MaxNLocator
from tqdm import tqdm
import matplotlib.patches as mpatches
# from transformers import BertTokenizer
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


import matplotlib.pyplot as plt

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stopWords_nltk = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [47]:
df = pd.read_csv('/content/drive/MyDrive/cleaned_df.csv')[['Cleaned_Review', 'Raw_Rating']]

In [48]:
df

Unnamed: 0,Cleaned_Review,Raw_Rating
0,enjoy spa facility massage evelina recommend b...,5
1,lovely hotel great facility situate quiet loca...,5
2,snowboard couple day felt treat myselfso ask f...,5
3,need group friends ski free shuttle service go...,5
4,staff friendly food good room bite plain dated...,4
...,...,...
5208,absolute love stay arrived member staff greet ...,5
5209,miss year pandemic wait iglika disappoint room...,5
5210,time come borovets pleasantly surprise hotel r...,4
5211,fool feature place refurbed great half rest ro...,5


In [84]:
df = df.dropna(subset=['Cleaned_Review', 'Raw_Rating'])

In [None]:
df['Raw_Rating'].unique()

In [None]:
def tokenize(text):
    """ basic tokenize method with word character, non word character and digits """
    text = re.sub(r" +", " ", str(text))
    text = re.split(r"(\d+|[a-zA-ZğüşıöçĞÜŞİÖÇ]+|\W)", text)
    text = list(filter(lambda x: x != '' and x != ' ', text))
    sent_tokenized = ' '.join(text)
    return sent_tokenized

In [50]:
fig = px.histogram(df,
             x = 'Raw_Rating',
             title = 'Histogram of Review Rating',
             template = 'ggplot2',
             color = 'Raw_Rating',
             color_discrete_sequence= px.colors.sequential.Blues_r,
             opacity = 0.8,
             height = 525,
             width = 835,
            )

fig.update_yaxes(title='Count')
fig.show()

In [51]:
# label encode
def label_encode(x):
    if x <3:
      return 0
    elif x == 3:
      return 1
    else:
        return 2

# label to name
def label2name(x):
    if x <3:
        return "Negative"
    elif x == 3:
      return "Neutral"
    else:
        return "Positive"


In [85]:
df["label"] = df["Raw_Rating"].apply(lambda x: label_encode(int(x)))
df["label_name"] = df["Raw_Rating"].apply(lambda x: label2name(int(x)))

In [86]:
df['chars'] = df['Cleaned_Review'].apply(lambda x: len(x))
df['words'] = df['Cleaned_Review'].apply(lambda x: len(x.split(' ')))

In [87]:
df['label_name'].unique()

array(['Positive', 'Negative', 'Neutral'], dtype=object)

In [90]:
# tokenize data
df["tokenized_review"] = df.Cleaned_Review.apply(lambda x: tokenize(x))
# calculate token count for any sent
df["sent_token_length"] = df["tokenized_review"].apply(lambda x: len(x.split()))

In [91]:
(df.sent_token_length < 512).mean()

0.9971209213051824

In [92]:
fig = px.histogram(df, x="sent_token_length", nbins=20, color_discrete_sequence=px.colors.cmocean.algae, barmode='group', histnorm="percent")
fig.show()

In [93]:
df.head()

Unnamed: 0,Cleaned_Review,Raw_Rating,label,label_name,chars,words,tokenized_review,sent_token_length
0,enjoy spa facility massage evelina recommend b...,5,2,Positive,129,17,enjoy spa facility massage evelina recommend b...,17
1,lovely hotel great facility situate quiet loca...,5,2,Positive,288,44,lovely hotel great facility situate quiet loca...,44
2,snowboard couple day felt treat myselfso ask f...,5,2,Positive,120,18,snowboard couple day felt treat myselfso ask f...,18
3,need group friends ski free shuttle service go...,5,2,Positive,242,39,need group friends ski free shuttle service go...,39
4,staff friendly food good room bite plain dated...,4,2,Positive,197,32,staff friendly food good room bite plain dated...,32


In [94]:
# BERT

In [95]:
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',
                                          do_lower_case=True)

In [99]:
# data tokenize with bert tokenizer
df["sent_bert_token_length"] = df["Cleaned_Review"].astype('str').apply(lambda x: len(tokenizer(x, add_special_tokens=False)["input_ids"]))

Token indices sequence length is longer than the specified maximum sequence length for this model (1174 > 512). Running this sequence through the model will result in indexing errors


In [100]:
import pandas as pd
import numpy as np
import os
import random
from pathlib import Path
import json

In [101]:
import torch
from tqdm.notebook import tqdm

from transformers import BertTokenizer
from torch.utils.data import TensorDataset

from transformers import BertForSequenceClassification

In [130]:
class Config():
    seed_val = 17
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    epochs = 5
    batch_size = 6
    seq_length = 512
    lr = 2e-5
    eps = 1e-8
    pretrained_model = 'bert-base-uncased'
    test_size=0.15
    random_state=42
    add_special_tokens=True
    return_attention_mask=True
    pad_to_max_length=True
    do_lower_case=False
    return_tensors='pt'

config = Config()

In [131]:
params = {"seed_val": config.seed_val,
    "device":str(config.device),
    "epochs":config.epochs,
    "batch_size":config.batch_size,
    "seq_length":config.seq_length,
    "lr":config.lr,
    "eps":config.eps,
    "pretrained_model": config.pretrained_model,
    "test_size":config.test_size,
    "random_state":config.random_state,
    "add_special_tokens":config.add_special_tokens,
    "return_attention_mask":config.return_attention_mask,
    "pad_to_max_length":config.pad_to_max_length,
    "do_lower_case":config.do_lower_case,
    "return_tensors":config.return_tensors,
         }


In [132]:

import random

device = config.device

random.seed(config.seed_val)
np.random.seed(config.seed_val)
torch.manual_seed(config.seed_val)
torch.cuda.manual_seed_all(config.seed_val)

In [133]:
df.head()

Unnamed: 0,Cleaned_Review,Raw_Rating,label,label_name,chars,words,tokenized_review,sent_token_length,sent_bert_token_length
0,enjoy spa facility massage evelina recommend b...,5,2,Positive,129,17,enjoy spa facility massage evelina recommend b...,17,19
1,lovely hotel great facility situate quiet loca...,5,2,Positive,288,44,lovely hotel great facility situate quiet loca...,44,52
2,snowboard couple day felt treat myselfso ask f...,5,2,Positive,120,18,snowboard couple day felt treat myselfso ask f...,18,23
3,need group friends ski free shuttle service go...,5,2,Positive,242,39,need group friends ski free shuttle service go...,39,50
4,staff friendly food good room bite plain dated...,4,2,Positive,197,32,staff friendly food good room bite plain dated...,32,35


In [134]:
from sklearn.model_selection import train_test_split

train_df_, val_df = train_test_split(df,
                                    test_size=0.20,
                                    random_state=config.random_state,
                            stratify=df.label.values)

In [135]:
train_df, test_df = train_test_split(train_df_,
                                    test_size=0.20,
                                    random_state=42,
                            stratify=train_df_.label.values)


In [136]:
# create tokenizer
tokenizer = BertTokenizer.from_pretrained(config.pretrained_model,
                                          do_lower_case=config.do_lower_case)

In [137]:
encoded_data_train = tokenizer.batch_encode_plus(
    train_df.Cleaned_Review.values,
    add_special_tokens=config.add_special_tokens,
    return_attention_mask=config.return_attention_mask,
    pad_to_max_length=config.pad_to_max_length,
    max_length=config.seq_length,
    return_tensors=config.return_tensors
)
encoded_data_val = tokenizer.batch_encode_plus(
    val_df.Cleaned_Review.values,
    add_special_tokens=config.add_special_tokens,
    return_attention_mask=config.return_attention_mask,
    pad_to_max_length=config.pad_to_max_length,
    max_length=config.seq_length,
    return_tensors=config.return_tensors
)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [138]:
input_ids_train = encoded_data_train['input_ids']
attention_masks_train = encoded_data_train['attention_mask']
labels_train = torch.tensor(train_df.label.values)

input_ids_val = encoded_data_val['input_ids']
attention_masks_val = encoded_data_val['attention_mask']
labels_val = torch.tensor(val_df.label.values)

In [139]:
dataset_train = TensorDataset(input_ids_train, attention_masks_train, labels_train)
dataset_val = TensorDataset(input_ids_val, attention_masks_val, labels_val)

In [140]:
model = BertForSequenceClassification.from_pretrained(config.pretrained_model,
                                                      num_labels=3,
                                                      output_attentions=False,
                                                      output_hidden_states=False)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [141]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

dataloader_train = DataLoader(dataset_train,
                              sampler=RandomSampler(dataset_train),
                              batch_size=config.batch_size)

dataloader_validation = DataLoader(dataset_val,
                                   sampler=SequentialSampler(dataset_val),
                                   batch_size=config.batch_size)

In [142]:
from transformers import AdamW, get_linear_schedule_with_warmup

optimizer = AdamW(model.parameters(),
                  lr=config.lr,
                  eps=config.eps)


scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=0,
                                            num_training_steps=len(dataloader_train)*config.epochs)

In [143]:
from sklearn.metrics import f1_score

def f1_score_func(preds, labels):
    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return f1_score(labels_flat, preds_flat, average='weighted')

def accuracy_per_class(preds, labels, label_dict):
    label_dict_inverse = {v: k for k, v in label_dict.items()}

    preds_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()

    for label in np.unique(labels_flat):
        y_preds = preds_flat[labels_flat==label]
        y_true = labels_flat[labels_flat==label]
        print(f'Class: {label_dict_inverse[label]}')
        print(f'Accuracy: {len(y_preds[y_preds==label])}/{len(y_true)}\n')

In [144]:
def evaluate(dataloader_val):

    model.eval()

    loss_val_total = 0
    predictions, true_vals = [], []

    for batch in dataloader_val:

        batch = tuple(b.to(config.device) for b in batch)

        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        with torch.no_grad():
            outputs = model(**inputs)

        loss = outputs[0]
        logits = outputs[1]
        loss_val_total += loss.item()

        logits = logits.detach().cpu().numpy()
        label_ids = inputs['labels'].cpu().numpy()
        predictions.append(logits)
        true_vals.append(label_ids)

        # calculate avareage val loss
        loss_val_avg = loss_val_total/len(dataloader_val)

        predictions = np.concatenate(predictions, axis=0)
        true_vals = np.concatenate(true_vals, axis=0)

        return loss_val_avg, predictions, true_vals

In [145]:
config.device


device(type='cuda', index=0)

In [None]:
model.to(config.device)

for epoch in tqdm(range(1, config.epochs+1)):

    model.train()

    loss_train_total = 0
    # allows you to see the progress of the training
    progress_bar = tqdm(dataloader_train, desc='Epoch {:1d}'.format(epoch), leave=False, disable=False)

    for batch in progress_bar:

        model.zero_grad()

        batch = tuple(b.to(config.device) for b in batch)


        inputs = {'input_ids':      batch[0],
                  'attention_mask': batch[1],
                  'labels':         batch[2],
                 }

        outputs = model(**inputs)

        loss = outputs[0]
        loss_train_total += loss.item()
        loss.backward()

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()
        scheduler.step()

        progress_bar.set_postfix({'training_loss': '{:.3f}'.format(loss.item()/len(batch))})


    torch.save(model.state_dict(), f'_BERT_epoch_{epoch}.model')

    tqdm.write(f'\nEpoch {epoch}')

    loss_train_avg = loss_train_total/len(dataloader_train)
    tqdm.write(f'Training loss: {loss_train_avg}')

    val_loss, predictions, true_vals = evaluate(dataloader_validation)
    val_f1 = f1_score_func(predictions, true_vals)
    tqdm.write(f'Validation loss: {val_loss}')

    tqdm.write(f'F1 Score (Weighted): {val_f1}');
    # save model params and other configs
    with Path('params.json').open("w") as f:
        json.dump(params, f, ensure_ascii=False, indent=4)

  0%|          | 0/5 [00:00<?, ?it/s]

Epoch 1:   0%|          | 0/556 [00:00<?, ?it/s]

In [None]:
torch.save(model, f'/content/drive/MyDrive/bert_model1.pth')


In [None]:
from sklearn.metrics import classification_report

preds_flat = np.argmax(predictions, axis=1).flatten()
print(classification_report(preds_flat, true_vals))

In [None]:
# Errors

In [None]:
pred_final = []

for i, row in tqdm(val_df.iterrows(), total=val_df.shape[0]):
    predictions = []

    review = row["Cleaned_Review"]
    encoded_data_test_single = tokenizer.batch_encode_plus(
    [review],
    add_special_tokens=config.add_special_tokens,
    return_attention_mask=config.return_attention_mask,
    pad_to_max_length=config.pad_to_max_length,
    max_length=config.seq_length,
    return_tensors=config.return_tensors
    )
    input_ids_test = encoded_data_test_single['input_ids']
    attention_masks_test = encoded_data_test_single['attention_mask']


    inputs = {'input_ids':      input_ids_test.to(device),
              'attention_mask':attention_masks_test.to(device),
             }

    with torch.no_grad():
        outputs = model(**inputs)

    logits = outputs[0]
    logits = logits.detach().cpu().numpy()
    predictions.append(logits)
    predictions = np.concatenate(predictions, axis=0)
    pred_final.append(np.argmax(predictions, axis=1).flatten()[0])

In [None]:
val_df["pred"] = pred_final

In [None]:
control = val_df.pred.values == val_df.label.values
val_df["control"] = control

In [None]:
val_df = val_df[val_df.control == False]

In [None]:
name2label = {"Negative":0,
             "Neutral":1,
              "Positive": 2
             }
label2name = {v: k for k, v in name2label.items()}

val_df["pred_name"] = val_df.pred.apply(lambda x: label2name.get(x))


In [None]:
from sklearn.metrics import confusion_matrix

# We create a confusion matrix to better observe the classes that the model confuses.
pred_name_values = val_df.pred_name.values
label_values = val_df.label_name.values
confmat = confusion_matrix(label_values, pred_name_values, labels=list(name2label.keys()))

In [None]:
confmat

In [None]:
df_confusion_val = pd.crosstab(label_values, pred_name_values)
df_confusion_val