In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
!pip install datasets
!pip install transformers

Collecting datasets
  Downloading datasets-2.15.0-py3-none-any.whl (521 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow-hotfix (from datasets)
  Downloading pyarrow_hotfix-0.6-py3-none-any.whl (7.9 kB)
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m21.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyarrow-hotfix, dill, multiprocess, datasets
Successfully installed datasets-2.15.0 dill-0.3.7 multiprocess-0.70.15 pyarrow-hotfix-0.6


In [5]:
!pip install accelerate>=0.20.1


In [6]:
import pandas as pd
import numpy as np
import re
from datasets import load_dataset, load_metric
from datasets import Dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk
import time
import torch 
from torch.cuda.amp import autocast 
from torch.utils.data import DataLoader

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [7]:
df = pd.read_csv("/content/drive/MyDrive/imdb/IMDB Dataset.csv")

In [8]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [9]:
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [10]:
df.head()

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [11]:
def preprocess_text(text): 
    text = text.lower() 
    text = re.sub(r'<.*?>', '', text) 
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) 
    tokens = word_tokenize(text) 
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words] 
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [12]:
df.dtypes

review       object
sentiment    object
label         int64
dtype: object

In [13]:
df['cleaned_text'] = df['review'].apply(preprocess_text)

In [14]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42) 
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [15]:
train_dataset

Dataset({
    features: ['review', 'sentiment', 'label', 'cleaned_text', '__index_level_0__'],
    num_rows: 35000
})

In [16]:
test_dataset

Dataset({
    features: ['review', 'sentiment', 'label', 'cleaned_text', '__index_level_0__'],
    num_rows: 15000
})

# Using distilbert model

In [17]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
def tokenize_text_data(sample_text):
    return tokenizer(sample_text['cleaned_text'], padding="max_length", truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_text_data, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_text_data, batched=True)

train_set = tokenized_train_dataset
test_set = tokenized_test_dataset


Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

In [19]:
def eval_metrics(eval_pred):
    metric = load_metric("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/imdb/results2",
    num_train_epochs=3,
    per_device_train_batch_size=30,
    per_device_eval_batch_size=30,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    fp16=True,
    metric_for_best_model="f1"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=eval_metrics
)

trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.2661,0.220353,0.915402
2,0.1418,0.234285,0.919332
3,0.0603,0.333798,0.918374


  metric = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

TrainOutput(global_step=3501, training_loss=0.16684275178169053, metrics={'train_runtime': 2038.9896, 'train_samples_per_second': 51.496, 'train_steps_per_second': 1.717, 'total_flos': 1.390907685888e+16, 'train_loss': 0.16684275178169053, 'epoch': 3.0})

# Saving the Model

In [20]:
model_path = "/content/drive/MyDrive/imdb/saved_model2"
model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

('/content/drive/MyDrive/imdb/saved_model2/tokenizer_config.json',
 '/content/drive/MyDrive/imdb/saved_model2/special_tokens_map.json',
 '/content/drive/MyDrive/imdb/saved_model2/vocab.txt',
 '/content/drive/MyDrive/imdb/saved_model2/added_tokens.json',
 '/content/drive/MyDrive/imdb/saved_model2/tokenizer.json')

In [24]:
def collate_fn(batch): 
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'labels': torch.tensor([item['label'] for item in batch])
    }

In [22]:
# Combining train and test datasets 
full_dataset = torch.utils.data.ConcatDataset([train_set, test_set])

In [1]:
!pip install torch torchvision




In [27]:
model.to('cuda')   

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

# Benchmarking the prediction with batchsize = 30, and with FP Quantization , Autocast FP16 operations

In [29]:
def benchmark_prediction_speed_fp16(model, dataset, batch_size=30):
    data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn) 
    model.eval() 
    start_time = time.time()

    with torch.no_grad():
        for batch in data_loader:
            # Moving  batch to GPU
            batch = {k: v.to('cuda') for k, v in batch.items()} 
            # Autocast to handle FP16 operations
            with autocast():
                outputs = model(**batch) 
     
    total_time = time.time() - start_time
    return total_time

# Benchmark prediction speed with FP16 quantization
total_prediction_time_fp16 = benchmark_prediction_speed_fp16(model, full_dataset)
print(f"Total time taken for predictions with FP16 quantization: {total_prediction_time_fp16:.2f} seconds")


Total time taken for predictions with FP16 quantization: 269.50 seconds


# Comparing prediction time with base model vs FP quantized model

In [2]:
prediction_time_base = 850.89
prediction_time_fp16 = 269.50

In [3]:
speedup_factor = prediction_time_base / prediction_time_fp16

In [4]:
speedup_factor

3.157291280148423