In [1]:
#!pip install transformers



In [1]:
import numpy as np
import pandas as pd
pd.set_option('max_colwidth',None)
import seaborn as sns
import matplotlib.pyplot as plt
import re
import string
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import SpatialDropout1D, Dense, Embedding, SimpleRNN, LSTM, Bidirectional, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from sklearn.model_selection import train_test_split
from nltk import word_tokenize
# Importing the Random Forest model
from sklearn.ensemble import RandomForestClassifier

# Metrics to evaluate the model
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Function to print the classification report and get confusion matrix in a proper format

def metrics_score(actual, predicted):

    print(classification_report(actual, predicted))

    cm = confusion_matrix(actual, predicted)

    plt.figure(figsize = (8, 5))

    sns.heatmap(cm, annot = True,  fmt = '.2f', xticklabels = ['negative', 'positive'], yticklabels = ['negative', 'positive'])

    plt.ylabel('Actual')

    plt.xlabel('Predicted')

    plt.show()

In [4]:
df=pd.read_csv('/content/Movieupdate.csv')

In [5]:
df.head()

Unnamed: 0,text,label
0,grew b watching loving thunderbird mate school watched played thunderbird school lunch school wanted virgil scott one wanted alan counting became art form took child see movie hoping would get glimpse loved child bitterly disappointing high point snappy theme tune could compare original score thunderbird thankfully early saturday morning one television channel still play rerun series gerry anderson wife created jonatha frakes hand director chair version completely hopeless waste film utter rubbish cgi remake may acceptable replacing marionette homo sapiens subsp sapiens huge error judgment,0
1,put movie dvd player sat coke chip expectation hoping movie would contain strong point first movie awsome animation good flowing story excellent voice cast funny comedy kick as soundtrack disappointment found atlantis milo return read review first might let following paragraph directed seen first movie enjoyed primarily point mentioned first scene appears shock picked atlantis milo return display case local videoshop whatever expectation music feel bad imitation first movie voice cast replaced fitting one exception character like voice sweet actual drawing bad animation particular sad sight storyline also pretty weak like three episode schooby doo single adventurous story got last time misunderstand good schooby doo episode laugh single time although might sniggered twice audience seen first movie especially care similar sequel fast review movie stand alone product liked schooby doo might like movie could still enjoy movie nothing else suspect might good kid movie would know might better milo return three episode series cartoon channel breakfast tv,0
2,people know particular time past like feel need try define time others replace woodstock civil war apollo moon landing titanic sinking got realistic flick formulaic soap opera populated entirely low life trash kid young allowed go woodstock failed grade school composition show old meany put movie prove know nuttin topic still make money yeah already know one thing watching film give little insight underclass thinking next time see slut bar look like diane lane running way child abuse let parent worthless raise kid audience abuse simply stick woodstock moonlanding flick ipso facto mean film portrays,0
3,even though great interest biblical movie bored death every minute movie everything bad movie long acting time joke script horrible get point mixing story abraham noah together value time sanity stay away horror,0
4,die hard dad army fan nothing ever change got tape dvd audiobooks every time watch listen brand new film film run certain episode man hour enemy within gate battle school numerous others different edge introduction new general instead captain square brilliant move especially would cash cheque something rarely done follows early year getting equipment uniform starting training great film boring sunday afternoon two draw back one german bogus dodgy accent come one german pronounced letter w like u two casting liz frazer instead familiar janet davis like liz film like carry ons carry correctly janet davis would better choice,1


In [6]:
df.shape

(59947, 2)

## Implementing Bert

In [7]:
X= df['text']
y=df['label']

In [8]:
X_train, X_test, y_train, y_test= train_test_split(X,y, stratify=y, test_size=0.2, random_state=0)

In [9]:

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler


In [10]:
#load the DistilBert tokenizer and model
tokenizer=DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
bert_model=DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.weight', 'pre_classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
#testing
sample_tweet=X_train[0]
inputs=tokenizer(sample_tweet, padding=True, truncation=True, return_tensors='pt')

In [12]:
inputs

{'input_ids': tensor([[  101,  3473,  1038,  3666,  8295,  8505,  9001,  6775,  2082,  3427,
          2209,  8505,  9001,  2082,  6265,  2082,  2359, 17270,  3660,  2028,
          2359,  5070, 10320,  2150,  2396,  2433,  2165,  2775,  2156,  3185,
          5327,  2052,  2131, 12185,  3866,  2775, 19248, 15640,  2152,  2391,
         10245,  7685,  4323,  8694,  2071, 12826,  2434,  3556,  8505,  9001,
         16047,  2220,  5095,  2851,  2028,  2547,  3149,  2145,  2377,  2128,
         15532,  2186, 14926,  5143,  2564,  2580,  6285,  8988,  2050, 25312,
          9681,  2192,  2472,  3242,  2544,  3294, 20625,  5949,  2143, 14395,
         29132,  1039,  5856, 12661,  2089, 11701,  6419, 10115,  7585, 24004,
         20066, 24836, 24807, 20066, 24836,  4121,  7561,  8689,   102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1,

In [13]:
def preprocess_data(tweet, label, tokenizer):
  inputs=tokenizer(tweet.tolist(), padding=True, truncation=True, return_tensors='pt')
  labels=torch.tensor(label.tolist())
  dataset= TensorDataset(inputs['input_ids'], inputs['attention_mask'], labels)
  return dataset

In [14]:
train_dataset =preprocess_data(X_train, y_train, tokenizer)
test_dataset =preprocess_data(X_test, y_test, tokenizer)

In [15]:
batch_size=32

#sample the datset randomly
train_sampler=RandomSampler(train_dataset)
train_dataloader =DataLoader(train_dataset, sampler=train_sampler, batch_size=batch_size)

In [16]:
#Set up the optimizer
optimizer=torch.optim.AdamW(bert_model.parameters(), lr=1e-5)

#Set gradient accumulation
gradient_accumulation =4



In [None]:
#Set the training loop
bert_model.train()

for epoch in range (2):
  print(f'===========Epoch: {epoch+1}==============')
  total_loss =0.0
  for step, batch in enumerate(train_dataloader):
    #CLEAR ANY GRADIENT ALREADY ACCUMULATED
    optimizer.zero_grad()
    outputs=bert_model(batch[0], attention_mask=batch[1], labels=batch[2])
    #compute loss
    loss=output.loss
    loss=loss/gradient-gradient_accumulation
    loss.backward()
    if (step+1) % gradient_accumulation == 0:
      optimizer.step()
      total_loss +=loss.item()
      print(f'---------Adjusted weights after {step + 1} steps ----------')
  print(f'Epoch: {epoch + 1} - Average Loss: {total_loss/len(train_dataloader)}')


