In [1]:
from google.colab import files
files.upload()

! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/ #copying kaggle.json
! chmod 600 ~/.kaggle/kaggle.json #reading the file with full access
# make sure kaggle.json file is present
! ls -lha kaggle.json

! pip install -q kaggle

Saving kaggle.json to kaggle.json
-rw-r--r-- 1 root root 65 Feb  4 04:07 kaggle.json


In [2]:
! kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
! mkdir data
! unzip imdb-dataset-of-50k-movie-reviews.zip -d ./data

Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:01<00:00, 32.6MB/s]
100% 25.7M/25.7M [00:01<00:00, 21.5MB/s]
Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: ./data/IMDB Dataset.csv  


In [3]:
!pip install -qq transformers

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.3/6.3 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m190.3/190.3 KB[0m [31m27.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.6/7.6 MB[0m [31m115.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch

import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap

from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import torch.nn.functional as F

from tqdm import tqdm
import string
import re
import spacy

In [5]:
MAX_LEN = 512
BATCH_SIZE = 16
RANDOM_SEED = 42
EPOCHS = 2 # since it takes to much time, we will only train for 2 epochs
NUM_CLASSES = 2

PRE_TRAINED_MODEL_NAME = 'bert-base-cased'
TRAIN_PATH = "./data/IMDB Dataset.csv"

In [6]:
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device
     

device(type='cuda')

In [7]:
en = spacy.load('en_core_web_sm')
stopwords = en.Defaults.stop_words
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [8]:
class ReviewDataset(Dataset):

    def __init__(self, reviews, targets, tokenizer, max_len):
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len
  
    def __len__(self):
        return len(self.reviews)
  
    def __getitem__(self, item):
        review = str(self.reviews[item])
        target = self.targets[item]

        encoding = self.tokenizer.encode_plus(
                  review,
                  add_special_tokens=True,
                  max_length=self.max_len,
                  truncation = True,
                  #return_token_type_ids=False,
                  padding='max_length',
                  pad_to_max_length=True,
                  return_attention_mask=True,
                  return_tensors='pt',
                )

#         return {
#               'review_text': review,
#               'input_ids': encoding['input_ids'].flatten(),
#               'attention_mask': encoding['attention_mask'].flatten(),
#               'targets': torch.tensor(target, dtype=torch.long)
#                 }
        return encoding['input_ids'].flatten(),encoding['attention_mask'].flatten(),torch.tensor(target, dtype=torch.long), review

In [9]:
def preprocess(ids, mask, target, review):
    return ids.to(device), mask.to(device), target.to(device), review


class WrappedDataLoader:
    def __init__(self, dl, func):
        self.dl = dl
        self.func = func

    def __len__(self):
        return len(self.dl)

    def __iter__(self):
        batches = iter(self.dl)
        for ids, mask, tar, review in batches:
            yield (self.func(ids, mask, tar, review))

In [10]:
def clean(x):
    x = re.sub("'",'',x).lower()
    exclude = set(string.punctuation)
    x = ''.join(ch for ch in x if ch not in exclude)
    digits = str.maketrans('','',string.digits)
    x = x.translate(digits)
    x = re.sub(r'br', '', x)
    x = re.sub(" +", " ", x.strip())
    # Remove @name
    x = re.sub(r'(@.*?)[\s]', ' ', x)
    # Remove some special characters
    x = re.sub(r'([\;\:\|•«\n])#', ' ', x)
    
    x = [x for x in x.split() if x not in stopwords]
    x = " ".join(map(str, x))
#     x = en(x)
#     xl = []
#     for token in x:
#         xl.append(token.lemma_)
#     x = ' '.join(map(str,xl))
#     print("one")
    return x

In [11]:
def csv_process(csv_path):
    data_csv = pd.read_csv(csv_path, nrows=25000)
    le = LabelEncoder()
    data_csv.sentiment = le.fit_transform(data_csv.sentiment)
    data_csv.review = data_csv.review.apply(clean)
    le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
#     data_csv['len'] = data_csv.review.apply(lambda x: len(x.split()))
#     data_csv = data_csv[data_csv.len <=512]
    df_train, df_valid = train_test_split(data_csv, test_size=0.1, random_state=RANDOM_SEED)
    df_train, df_valid = df_train.reset_index(drop=True), df_valid.reset_index(drop=True)
    print(le_name_mapping)
    return df_train, df_valid

In [12]:
df_train, df_valid = csv_process(TRAIN_PATH)

{'negative': 0, 'positive': 1}


In [13]:
class_names = {0: "Negative", 1:"Positive"}

In [14]:
def create_data_loader(x, y, tokenizer, max_len, batch_size):
    ds = ReviewDataset(
        reviews=x.to_numpy(),
        targets=y.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len
          )

    dl = DataLoader(
        ds,
        batch_size=batch_size,
        num_workers=2
      )
    return WrappedDataLoader(dl,preprocess)

In [15]:
train_data_loader = create_data_loader(df_train.review,df_train.sentiment, tokenizer, MAX_LEN, BATCH_SIZE)
val_data_loader = create_data_loader(df_valid.review,df_valid.sentiment, tokenizer, MAX_LEN, BATCH_SIZE)
# test_data_loader = create_data_loader(df_test, tokenizer, MAX_LEN, BATCH_SIZE)

In [16]:
class SentimentClassifier(nn.Module):
    
    def __init__(self, n_classes):
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME, return_dict=False)
        self.drop = nn.Dropout(p=0.3)
        self.line = nn.Linear(self.bert.config.hidden_size, n_classes)
        self.out = nn.Softmax(dim=0)
  
    def forward(self, input_ids, attention_mask):
        _, pooled_output = self.bert(
          input_ids=input_ids,
          attention_mask=attention_mask,
#           return_dict=False
        )
        output = self.drop(pooled_output)
        return self.out(self.line(output))

In [17]:
model = SentimentClassifier(NUM_CLASSES).to(device)

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
def train_epoch( model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    
    model = model.train()

    losses = []
    correct_predictions = 0
    
    tk0 = tqdm(data_loader, total=len(data_loader))
    #for d in data_loader:
    for d in tk0:
        ids = d[0]
        mask = d[1]
        targets = d[2]

        outputs = model(
          input_ids=ids,
          attention_mask=mask
        )

        _, preds = torch.max(outputs, dim=1)
        loss = loss_fn(outputs, targets)

        correct_predictions += torch.sum(preds == targets)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()
        optimizer.zero_grad()

    return correct_predictions.double() / n_examples, np.mean(losses)

In [19]:
def eval_model(model, data_loader, loss_fn, device, n_examples):
    model = model.eval()

    losses = []
    correct_predictions = 0

    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        #for d in data_loader:
        for d in tk0:
            ids = d[0]
            mask = d[1]
            targets = d[2]

            outputs = model(
                    input_ids=ids,
                    attention_mask=mask
                  )
            _, preds = torch.max(outputs, dim=1)

            loss = loss_fn(outputs, targets)

            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())

    return correct_predictions.double() / n_examples, np.mean(losses)

In [20]:
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
total_steps = len(train_data_loader) * EPOCHS

scheduler = get_linear_schedule_with_warmup(
  optimizer,
  num_warmup_steps=0,
  num_training_steps=total_steps
)

loss_fn = nn.CrossEntropyLoss().to(device)

Trained for 2 epochs. Longer training will improve results.

In [21]:
%%time

history = defaultdict(list)
best_accuracy = 0


for epoch in range(EPOCHS):

    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,    
        loss_fn, 
        optimizer, 
        device, 
        scheduler, 
        len(train_data_loader)
      )

    print(f'Train loss {train_loss} accuracy {train_acc}')

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn, 
        device, 
        len(val_data_loader)
      )

    print(f'Val   loss {val_loss} accuracy {val_acc}')
    print()

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state.bin')
        best_accuracy = val_acc

Epoch 1/2
----------


100%|██████████| 1407/1407 [34:40<00:00,  1.48s/it]


Train loss 0.6412758699908864 accuracy 13.562899786780383


100%|██████████| 157/157 [01:26<00:00,  1.82it/s]


Val   loss 0.6382822606973587 accuracy 13.70063694267516

Epoch 2/2
----------


100%|██████████| 1407/1407 [34:35<00:00,  1.47s/it]


Train loss 0.6361725259606688 accuracy 14.287135749822317


100%|██████████| 157/157 [01:26<00:00,  1.81it/s]


Val   loss 0.6377148909173953 accuracy 14.108280254777071

CPU times: user 1h 11min 32s, sys: 19.9 s, total: 1h 11min 52s
Wall time: 1h 12min 11s


In [22]:
def get_predictions(model, data_loader):
    model = model.eval()
  
    review_texts = []
    predictions = []
    prediction_probs = []
    real_values = []

    with torch.no_grad():
        for d in data_loader:
            texts = d[3]
            ids = d[0]
            mask = d[1]
            targets = d[2]

            outputs = model(
            input_ids=ids,
            attention_mask=mask
              )
            _, preds = torch.max(outputs, dim=1)

            probs = F.softmax(outputs, dim=1)

            review_texts.extend(texts)
            predictions.extend(preds)
            prediction_probs.extend(probs)
            real_values.extend(targets)

    predictions = torch.stack(predictions).cpu()
    prediction_probs = torch.stack(prediction_probs).cpu()
    real_values = torch.stack(real_values).cpu()
    return review_texts, predictions, prediction_probs, real_values

In [47]:
def get_Sentiment(review_text):
  text = clean(review_text)
  encoded_review = tokenizer.encode_plus(
    text,
    max_length=MAX_LEN,
    add_special_tokens=True,
    return_token_type_ids=False,
    pad_to_max_length=True,
   return_attention_mask=True,
    return_tensors='pt',
  )

  input_ids = encoded_review['input_ids'].to(device)
  attention_mask = encoded_review['attention_mask'].to(device)

  output = model(input_ids, attention_mask)
  _, prediction = torch.max(output, dim=1)
  # print(f'Review text: {review_text}')
  print(f'Sentiment  : {class_names[int(prediction)]}')

In [48]:
review_text = """The suits are CG, The background is CG, and the effects are Cg. 
The visuals are horrible, we spend too much time watching the heroes fight a CG group of villains, 
instead of focusing on their off-screen development. The Avengers Compound is horrible compared to 
the Avengers Tower in the first two movies. There are too many characters in this movie, and 
it's weird to say that this movie should have been divided into two. 
The pacing of the movie stops just for dumb jokes, Rocket Racoon's joke happens three 
minutes after Black Widow dies. The movie could have been an amazing ending to our 20 years of stories, 
but rushed through those moments, and had to spend more time-fighting villains. 
They could have made another movie about Fat Thor, or Iron Man and Captain America, or The Guardians, or 
Hawkeye, but those were forced into this movie, making it worse. T
"""

In [49]:
get_Sentiment(review_text)

Sentiment  : Negative


