***Load Data***

In [1]:
import pandas as pd
from datasets import Dataset

# Location of the IMDB review CSV; pandas fetches it over HTTPS.
DATA_URL = 'https://github.com/laxmimerit/All-CSV-ML-Data-Files-Download/raw/refs/heads/master/IMDB-Dataset.csv'

data = pd.read_csv(DATA_URL)
# Preview the first rows to confirm the `review` / `sentiment` columns loaded.
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


***Split Data***

In [2]:
# Wrap the DataFrame as a Hugging Face Dataset and hold out 30% of the rows
# for evaluation; the fixed seed keeps the split the same on every run.
dataset = Dataset.from_pandas(data).train_test_split(test_size=0.3, seed=42)
dataset

DatasetDict({
    train: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 35000
    })
    test: Dataset({
        features: ['review', 'sentiment'],
        num_rows: 15000
    })
})

***Introduce numerical labels***

In [3]:
# String sentiment -> integer class id, plus the inverse mapping that is later
# handed to the model config.
label2id = {'positive': 1, 'negative': 0}
id2label = {class_id: name for name, class_id in label2id.items()}


def _add_label(example):
    """Return the integer `label` for one example, looked up from its `sentiment`."""
    return {'label': label2id[example['sentiment']]}


dataset = dataset.map(_add_label)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [None]:
# Inspect the first training example to confirm the integer `label` column
# now sits alongside the original `review` and `sentiment` fields.
dataset['train'][0]

{'review': "This is by far THE WORST movie i have ever watched. I've seen some pretty awful movies in my time but this ones takes the cake, no, wait, i mean the the whole damn bakery. It is so bad that i believe a word to describe the way you will feel after watching this atrocity has yet to be created. Please just do yourself a favor, if you ever get the urge to watch this and watch thirty minutes of that annoying purple dinosaur Barney, then multiply that thirty times fold and you would still only get a small fraction of the horror you would be in store for. In summation, i guess you really can call it a horror movie, but only if you're willing to be scared senseless by the worst acting in the business and utterly pointless story.<br /><br />Real Rating, -10 Disgusting",
 'sentiment': 'negative',
 'label': 0}

***Tokenise Data***

In [4]:
from transformers import AutoTokenizer
import torch

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

print(f'Using device: {device}')

Using device: cuda


In [5]:
# Hub id of the pretrained checkpoint; the same id is reused further down to
# load the classification model, so tokenizer and model stay in sync.
model_ckpt = 'huawei-noah/TinyBERT_General_4L_312D'
# Load the tokenizer that belongs to that checkpoint (fast implementation requested).
tokeniser = AutoTokenizer.from_pretrained(model_ckpt, use_fast=True)


In [6]:
# Smoke-test the tokenizer on one review. NOTE(review): this is not the last
# expression in the cell, so its result appears to be discarded rather than
# displayed — move it to its own cell if the output is meant to be inspected.
tokeniser(dataset['train'][0]['review'])

def tokenize_function(batch):
    """Tokenise the `review` texts of a batch for TinyBERT.

    Reviews longer than 300 tokens are truncated. Padding is deliberately NOT
    applied here: this function is mapped with `batch_size=None`, so
    `padding=True` would pad every example to the longest sequence of the
    whole split and make every training/eval step pay for the worst case.
    The Trainer in this notebook is constructed with `tokenizer=tokeniser`,
    which lets it pad each mini-batch to that batch's own longest sequence
    at collation time instead.

    Args:
        batch: Mapping with a `review` key holding a list of review strings.

    Returns:
        The tokenizer output (`input_ids`, `attention_mask`, ...) with one
        variable-length entry per review.
    """
    return tokeniser(batch['review'], truncation=True, max_length=300)

# Tokenise both splits. `batched=True` hands `tokenize_function` lists of
# reviews; `batch_size=None` passes each split as one single batch.
tokenised_datasets = dataset.map(tokenize_function, batched=True, batch_size=None)

Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

***Model Evaluation Function*** (From HuggingFace)

In [7]:
import evaluate
import numpy as np

# Accuracy metric object from the `evaluate` library; `compute_metrics`
# below calls its `.compute(predictions=..., references=...)`.
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Score a batch of model outputs with the module-level `accuracy` metric.

    Args:
        eval_pred: A `(predictions, labels)` pair; `predictions` holds one row
            of per-class scores per example.

    Returns:
        Whatever `accuracy.compute` returns for the arg-max class ids versus
        the reference labels.
    """
    scores, references = eval_pred
    # The predicted class is the column with the highest score in each row.
    return accuracy.compute(
        predictions=np.argmax(scores, axis=1),
        references=references,
    )

***Model***

In [9]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the pretrained encoder with a classification head sized to the label
# set, and record both label mappings in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id,
)
# Move the weights to the device selected earlier.
model = model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at huawei-noah/TinyBERT_General_4L_312D and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
args = TrainingArguments(
    # Where checkpoints are written; an existing directory is overwritten.
    output_dir='train_tinybert_imdb',
    overwrite_output_dir=True,
    # Optimisation schedule.
    learning_rate=2e-5,
    num_train_epochs=3,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # Run evaluation once at the end of every epoch.
    evaluation_strategy='epoch',
)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokeniser,
    compute_metrics=compute_metrics,
    # The held-out test split doubles as the evaluation set.
    train_dataset=tokenised_datasets['train'],
    eval_dataset=tokenised_datasets['test'],
)




In [15]:
# Fine-tune the model with the settings in `args`; evaluation runs after each
# epoch, which produces the per-epoch loss/accuracy table shown below.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,0.3543,0.340336,0.851467
2,0.2858,0.296662,0.877733
3,0.2581,0.293437,0.8812


TrainOutput(global_step=3282, training_loss=0.31383063660969757, metrics={'train_runtime': 6114.6209, 'train_samples_per_second': 17.172, 'train_steps_per_second': 0.537, 'total_flos': 885410612150400.0, 'train_loss': 0.31383063660969757, 'epoch': 3.0})

In [16]:
# Score the fine-tuned model once more on the Trainer's `eval_dataset`
# (the test split).
trainer.evaluate()

{'eval_loss': 0.2934366464614868,
 'eval_accuracy': 0.8812,
 'eval_runtime': 387.5164,
 'eval_samples_per_second': 38.708,
 'eval_steps_per_second': 1.21,
 'epoch': 3.0}

***Model Save and Load for Inference***

In [17]:
# Write the fine-tuned model to the local directory 'tinybert_imdb_model',
# which the inference pipeline and the S3 upload below both read from.
trainer.save_model('tinybert_imdb_model')

In [19]:
# Hand-written reviews (positive, negative, neutral-ish) for a quick inference
# check. NOTE: this rebinds `data`, which held the IMDB DataFrame in the
# Load Data cell — that cell must be re-run before re-running the split.
data = ["The movie was fantastic! I really loved it.",
        "I hated the film. It was the worst I've ever seen.",
        "It was an average movie. Not bad, but not great either."]

In [21]:
from transformers import pipeline

# Build an inference pipeline from the model directory saved above, reusing
# the in-memory tokenizer and the device selected earlier.
classifier = pipeline(
    task='text-classification',
    model='tinybert_imdb_model',
    tokenizer=tokeniser,
    device=device,
)

# Classify the sample reviews; each result carries a label and its score.
classifier(data)

[{'label': 'positive', 'score': 0.990028440952301},
 {'label': 'negative', 'score': 0.9872934818267822},
 {'label': 'negative', 'score': 0.8724518418312073}]

***Upload Model to AWS S3***

In [2]:
import boto3

# Create an S3 client. No credentials or region are passed here, so boto3 has
# to resolve them from its environment/configuration (presumably the default
# credential chain — confirm for the target account).
s3 = boto3.client('s3')

# Bucket that will hold the fine-tuned model artefacts.
bucket_name = "tinybert-imdb-model-bucket-7429"

def create_bucket(bucket_name):
    """Create the S3 bucket `bucket_name` unless `list_buckets()` already reports it.

    The bucket is created in the region the module-level `s3` client is
    configured for. S3 rejects a `CreateBucket` request without a
    `LocationConstraint` when the client's region is anything other than
    us-east-1, so the constraint is passed explicitly for those regions.

    Args:
        bucket_name: Name of the bucket to create.

    Returns:
        None. A status line is printed in both the "exists" and "created" cases.
    """
    resp = s3.list_buckets()
    existing = {bucket['Name'] for bucket in resp.get('Buckets', [])}
    if bucket_name in existing:
        print(f"Bucket '{bucket_name}' already exists")
        return

    region = s3.meta.region_name
    if region in (None, 'us-east-1', 'aws-global'):
        # us-east-1 is the default location and must NOT be sent as a constraint.
        s3.create_bucket(Bucket=bucket_name)
    else:
        s3.create_bucket(
            Bucket=bucket_name,
            CreateBucketConfiguration={'LocationConstraint': region},
        )
    print(f"Bucket '{bucket_name}' created")

# Make sure the target bucket exists before uploading; safe to re-run because
# `create_bucket` returns early when the bucket is already listed.
create_bucket(bucket_name)

Bucket 'tinybert-imdb-model-bucket-7429' already exists


In [4]:
import os

def upload_directory(directory_path, s3_prefix, bucket_name=bucket_name):
    """Recursively upload every file under `directory_path` to S3.

    Each object key is `s3_prefix` joined with the file's path relative to
    `directory_path`, always using '/' as the separator regardless of the
    local operating system.

    Args:
        directory_path: Local directory to walk.
        s3_prefix: Key prefix under which the files are stored.
        bucket_name: Destination bucket; defaults to the module-level
            `bucket_name` as it was when this function was defined.

    Returns:
        None. One "Uploaded ..." line is printed per file.
    """
    import posixpath

    for root, _, files in os.walk(directory_path):
        for file in files:
            # Keep the local path native: rewriting backslashes here would
            # corrupt POSIX filenames that legitimately contain '\'.
            file_path = os.path.join(root, file)
            relpath = os.path.relpath(file_path, directory_path)
            # Only the OS separator is translated for the key, and the key is
            # joined with POSIX rules so it is '/'-separated on every platform.
            s3_key = posixpath.join(s3_prefix, relpath.replace(os.sep, '/'))
            s3.upload_file(file_path, bucket_name, s3_key)
            shown_path = file_path.replace(os.sep, '/')
            print(f"Uploaded '{shown_path}' to '{s3_key}'")

# Push the saved model directory to the bucket under the
# 'models/tinybert_imdb_model' key prefix.
upload_directory('tinybert_imdb_model', 'models/tinybert_imdb_model')

Uploaded 'tinybert_imdb_model/config.json' to 'models/tinybert_imdb_model/config.json'
Uploaded 'tinybert_imdb_model/model.safetensors' to 'models/tinybert_imdb_model/model.safetensors'
Uploaded 'tinybert_imdb_model/special_tokens_map.json' to 'models/tinybert_imdb_model/special_tokens_map.json'
Uploaded 'tinybert_imdb_model/tokenizer.json' to 'models/tinybert_imdb_model/tokenizer.json'
Uploaded 'tinybert_imdb_model/tokenizer_config.json' to 'models/tinybert_imdb_model/tokenizer_config.json'
Uploaded 'tinybert_imdb_model/training_args.bin' to 'models/tinybert_imdb_model/training_args.bin'
Uploaded 'tinybert_imdb_model/vocab.txt' to 'models/tinybert_imdb_model/vocab.txt'
