#Installations

In [1]:
%%capture
!pip install transformers
!pip install accelerate -U
!pip install datasets
!pip install huggingface_hub



Dependencies Importations

In [2]:
##for handling path of datasets
import os
from google.colab import drive

##for data handling:

import pandas as pd
import numpy as np
import torch
from torch import nn
import transformers
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import Trainer
from transformers import TrainingArguments
from scipy.special import softmax


##modelling:

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from transformers import pipeline
from datasets import load_dataset
import nltk
nltk.download('punkt')

##others
import warnings
warnings.filterwarnings("ignore")
import os
os.environ["WANDB_DISABLED"] = "true"
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


**Importing dataset from Google Drive**

In [4]:
data_path=  "/content/drive/My Drive/Colab Notebooks/NLP_Capstone/Transformed_copy.csv"

In [5]:
# read data
data= pd.read_csv(data_path)

In [6]:
#Show first five rows of the data
data.head()

Unnamed: 0.1,Unnamed: 0,content,label,Transformed_content
0,0,recently shown on cable tv the movie opens wit...,1,recently shown on cable tv the movie opens wit...
1,1,i was very surprised with this film. i was tou...,1,i was very surprised with this film i was touc...
2,2,"now, i'm one to watch movies that got poor rev...",0,now im one to watch movies that got poor revie...
3,3,"this film came out 12 years years ago, and was...",1,this film came out years years ago and was a ...
4,4,"when an orphanage manager goes on vacation, hi...",1,when an orphanage manager goes on vacation his...


In [7]:
#Check for null values

data.isna().sum()

Unnamed: 0             0
content                0
label                  0
Transformed_content    0
dtype: int64

In [8]:
#Ensuring there are no null values
data[data["Transformed_content"].isnull()]

Unnamed: 0.1,Unnamed: 0,content,label,Transformed_content


In [9]:
##drop 'Unnamed' and 'content' column to facilitate analysis
data= data.drop(["Unnamed: 0", "content"], axis=1)

In [10]:
data

Unnamed: 0,label,Transformed_content
0,1,recently shown on cable tv the movie opens wit...
1,1,i was very surprised with this film i was touc...
2,0,now im one to watch movies that got poor revie...
3,1,this film came out years years ago and was a ...
4,1,when an orphanage manager goes on vacation his...
5,0,wow this film was just bloody horrid so bad in...
6,0,absolutely one of the worst movies of the year...
7,0,i have to say this is one of the worst films i...
8,0,well the writing was very sloppy the directing...
9,0,just plain good old stupid br br i mean really...


**Data Splitting**

In [11]:
train, eval= train_test_split(data, test_size= 0.2, stratify= data["label"], random_state= 42)

In [12]:
train.head()

Unnamed: 0,label,Transformed_content
8793,0,watching marlen brando on screen is like watch...
9395,1,i enjoy ralph bakshi films wizards cool world ...
1123,1,i thoroughly enjoyed this film for its humor a...
2772,0,i have seen this movie many times and recently...
18360,0,this is probably the most boring worse and use...


In [13]:
eval.head()

Unnamed: 0,label,Transformed_content
247,0,may the saints preserve us because this movie ...
13564,0,i wont say this movie was bad but it wasnt goo...
21333,0,this film is to the fbis history as knotts ber...
387,0,jason alexander is a wonderful actor but its r...
9387,0,filmfour are going to have to do a lot better ...


In [14]:
print(f"new dataframe shapes: train is {train.shape}, eval is {eval.shape}")

new dataframe shapes: train is (19923, 2), eval is (4981, 2)


In [15]:
#saving the train and eval data to csv
train.to_csv("/content/train.csv")
eval.to_csv("/content/eval.csv")

**Load the Dataset**

In [16]:
dataset= load_dataset( "csv", data_files= { "train":"train.csv", "eval":"eval.csv"} )

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating eval split: 0 examples [00:00, ? examples/s]

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'label', 'Transformed_content'],
        num_rows: 19923
    })
    eval: Dataset({
        features: ['Unnamed: 0', 'label', 'Transformed_content'],
        num_rows: 4981
    })
})

**Tokenization**

In [18]:
#create an instance for tokenizer
tokenizer= AutoTokenizer.from_pretrained("huawei-noah/TinyBERT_General_4L_312D")


Downloading (…)lve/main/config.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

## Preprocessing Data

In [19]:
## changing labels to 0,1 from the initial labels -1, 1

def transform_labels(input):
  label= input["label"]
  num =0

  if label== -1:
    num= 0 # negative sentiment
  elif label== 1:
    num =1 # positive sentiment

  return {"labels": num}

# Function to tokenize
def tokenize(example):
  return tokenizer(example["Transformed_content"], padding= True, max_length=512,truncation=True, return_tensors= "pt")


In [20]:
## Converting reviews to tokens for the model to work with and eliminating features that are not needed for the analysis

dataset= dataset.map(tokenize, batched= True)
remove_columns= ['Unnamed: 0', 'Transformed_content', 'label']
dataset = dataset.map(transform_labels, remove_columns=remove_columns)

Map:   0%|          | 0/19923 [00:00<?, ? examples/s]

Map:   0%|          | 0/4981 [00:00<?, ? examples/s]

Map:   0%|          | 0/19923 [00:00<?, ? examples/s]

Map:   0%|          | 0/4981 [00:00<?, ? examples/s]

In [21]:
dataset

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 19923
    })
    eval: Dataset({
        features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
        num_rows: 4981
    })
})

**Modelling**

In [22]:
# Loading the pretrained model and specifying the number of labels
model= AutoModelForSequenceClassification.from_pretrained("huawei-noah/TinyBERT_General_4L_312D", num_labels= 2)

Downloading pytorch_model.bin:   0%|          | 0.00/62.7M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
#defining my  metric for the modelling

def compute_metrics(pred):
  labels = pred.label_ids
  preds = pred.predictions.argmax(-1)
  accuracy = accuracy_score(labels, preds)
  return {"accuracy": accuracy}

In [24]:
#Setting batch size for the training

batch_size= 16

In [25]:
#creating an instance for the training arguments

training_args = TrainingArguments( output_dir="NLP_Capstone",  learning_rate = 2e-5,weight_decay=0.01, # Adding weight decay to handle overfitting
   num_train_epochs=5, load_best_model_at_end=True,evaluation_strategy="steps",save_strategy="steps",push_to_hub=True

)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


In [26]:
##setting a shuffle seed to avoid randomization at each rerun
train_dataset= dataset['train'].shuffle(seed=10)
eval_dataset= dataset['eval'].shuffle(seed=10)

In [27]:
#making a connection to huggingface
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [28]:
#loading training arguments

trainer = Trainer(
      model= model,
      args= training_args,
      train_dataset= train_dataset,
      eval_dataset= eval_dataset,
      tokenizer= tokenizer,
      compute_metrics=compute_metrics )

In [29]:
#training the model

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy
500,0.5286,0.416858,0.825136
1000,0.4299,0.413737,0.833166
1500,0.3856,0.371426,0.851235
2000,0.3692,0.317618,0.867095
2500,0.3604,0.386947,0.863481
3000,0.3457,0.412626,0.86308
3500,0.3291,0.427241,0.867496
4000,0.3481,0.375358,0.877535
4500,0.3253,0.429337,0.864887
5000,0.3306,0.380681,0.87894


TrainOutput(global_step=12455, training_loss=0.30344246121976615, metrics={'train_runtime': 1904.3933, 'train_samples_per_second': 52.308, 'train_steps_per_second': 6.54, 'total_flos': 1428379108853760.0, 'train_loss': 0.30344246121976615, 'epoch': 5.0})

In [30]:
trainer.evaluate()


{'eval_loss': 0.31761786341667175,
 'eval_accuracy': 0.8670949608512347,
 'eval_runtime': 23.7677,
 'eval_samples_per_second': 209.57,
 'eval_steps_per_second': 26.212,
 'epoch': 5.0}

In [31]:
#Pushing the model to the hub
trainer.push_to_hub()

'https://huggingface.co/HerbertAIHug/NLP_Capstone/tree/main/'