In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install datasets
!pip install transformers



In [3]:
!pip install accelerate>=0.20.1


In [4]:
import pandas as pd
import numpy as np
import re
from datasets import load_dataset, load_metric
from datasets import Dataset
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from collections import Counter
import nltk
import time
import torch
from torch.utils.data import DataLoader

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [5]:
df = pd.read_csv("/content/drive/MyDrive/IMDB Dataset.csv")

In [6]:
df

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
...,...,...
49995,I thought this movie did a down right good job...,positive
49996,"Bad plot, bad dialogue, bad acting, idiotic di...",negative
49997,I am a Catholic taught in parochial elementary...,negative
49998,I'm going to have to disagree with the previou...,negative


In [7]:
df['label'] = df['sentiment'].map({'positive': 1, 'negative': 0})

In [8]:
df.head()

Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


# Preprocessing

In [9]:
# Function for basic preprocessing => Converting to lowercase, Removing HTML tags, special characters, stopwords and punctuation
def preprocess_text(text): 
    text = text.lower() 
    text = re.sub(r'<.*?>', '', text) 
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text) 
    tokens = word_tokenize(text) 
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words] 
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    return ' '.join(tokens)

In [10]:
df.dtypes

review       object
sentiment    object
label         int64
dtype: object

In [11]:
df['cleaned_text'] = df['review'].apply(preprocess_text)

In [12]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Datasets for train and test sets
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

In [13]:
train_dataset

Dataset({
    features: ['review', 'sentiment', 'label', 'cleaned_text', '__index_level_0__'],
    num_rows: 35000
})

In [14]:
test_dataset

Dataset({
    features: ['review', 'sentiment', 'label', 'cleaned_text', '__index_level_0__'],
    num_rows: 15000
})

# Using distilbert model

In [15]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=2)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'pre_classifier.weight', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Tokenization

In [16]:
def tokenize_text_data(sample_text):
    return tokenizer(sample_text['cleaned_text'], padding="max_length", truncation=True, max_length=512)

tokenized_train_dataset = train_dataset.map(tokenize_text_data, batched=True)
tokenized_test_dataset = test_dataset.map(tokenize_text_data, batched=True)

train_set = tokenized_train_dataset
test_set = tokenized_test_dataset


Map:   0%|          | 0/35000 [00:00<?, ? examples/s]

Map:   0%|          | 0/15000 [00:00<?, ? examples/s]

# Training

In [18]:
# Metrics for evaluation
def eval_metrics(eval_pred):
    metric = load_metric("f1")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

training_args = TrainingArguments(
    output_dir="/content/drive/MyDrive/imdb/results2",
    num_train_epochs=3,
    per_device_train_batch_size=28,
    per_device_eval_batch_size=28,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1"
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=test_set,
    compute_metrics=eval_metrics
)


trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,0.2571,0.215287,0.91487
2,0.1252,0.257992,0.917821
3,0.0533,0.375713,0.916815


  metric = load_metric("f1")


Downloading builder script:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

TrainOutput(global_step=3750, training_loss=0.14294341074625652, metrics={'train_runtime': 5685.6484, 'train_samples_per_second': 18.468, 'train_steps_per_second': 0.66, 'total_flos': 1.390907685888e+16, 'train_loss': 0.14294341074625652, 'epoch': 3.0})

In [None]:
torch.cuda.empty_cache()

# Benchmarking prediction with batch size  = 16, without FP quantization

In [20]:
# Formatting the batches correctly
def collate_fn(batch):
    # list of token IDs to tensors 
    return {
        'input_ids': torch.stack([torch.tensor(item['input_ids']) for item in batch]),
        'attention_mask': torch.stack([torch.tensor(item['attention_mask']) for item in batch]),
        'labels': torch.tensor([item['label'] for item in batch])
    }

# Measuring prediction speed
def benchmark_prediction_speed(model, dataset, batch_size=16): 
    data_loader = DataLoader(dataset, batch_size=batch_size, collate_fn=collate_fn) 
    model.eval() 
    start_time = time.time() 
    for batch in data_loader: 
        batch = {k: v.to(model.device) for k, v in batch.items()} 
        with torch.no_grad():
            outputs = model(**batch) 
    total_time = time.time() - start_time
    return total_time

# Combining train and test datasets for benchmarking
full_dataset = torch.utils.data.ConcatDataset([train_set, test_set])

# Benchmarking prediction speed for base model
total_prediction_time = benchmark_prediction_speed(model, full_dataset)
print(f"Total time taken for predictions: {total_prediction_time:.2f} seconds")


Total time taken for predictions: 850.89 seconds
