## 1.Environment Setup and Library Version Check


In [1]:
!pip install --upgrade huggingface_hub peft evaluate

import huggingface_hub
import peft

print(huggingface_hub.__version__)
print(peft.__version__)


Collecting huggingface_hub
  Downloading huggingface_hub-0.27.1-py3-none-any.whl.metadata (13 kB)
Collecting peft
  Downloading peft-0.14.0-py3-none-any.whl.metadata (13 kB)
Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading huggingface_hub-0.27.1-py3-none-any.whl (450 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m450.7/450.7 kB[0m [31m15.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading peft-0.14.0-py3-none-any.whl (374 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m374.8/374.8 kB[0m [31m21.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: huggingface_hub, peft, evaluate
  Attempting uninstall: huggingface_hub
    Found existing installation: huggingface-hub 0.24.7
    Uninstalling huggingface-hub-0.24.7:
 

# 2. Import Libraries


In [2]:
import pandas as pd
import numpy as np
import re
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
import matplotlib.pyplot as plt
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

print("Libraries imported successfully.")


Libraries imported successfully.


# 3. Load and Preview the Dataset




In [3]:
# Load datasets
train_data = pd.read_csv('/kaggle/input/imdb-dataset/train.csv')
test_data = pd.read_csv('/kaggle/input/imdb-dataset/test.csv')

print("Training Data Sample:")
print(train_data.head())
print("\nTesting Data Sample:")
print(test_data.head())


Training Data Sample:
                                              review sentiment
0  SAPS AT SEA <br /><br />Aspect ratio: 1.37:1<b...  negative
1  If you want mindless action, hot chicks and a ...  positive
2  "The Woman in Black" is easily one of the cree...  positive
3  I can barely find the words to describe how mu...  negative
4  What's in here ?! Let me tell you. It's the pr...  negative

Testing Data Sample:
                                              review sentiment
0  Steven Rea plays a forensic scientist thrust o...  positive
1  As the first of the TV specials offered on the...  positive
2  There may something poetically right in seeing...  negative
3  all i can say about this film is to read the b...  negative
4  I thought it was a pretty good movie and shoul...  positive


# 4. Preprocess and Tokenize the IMDB Dataset


In [6]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load your training and test data
train_data = pd.read_csv('/kaggle/input/imdb-dataset/train.csv')
test_data = pd.read_csv('/kaggle/input/imdb-dataset/test.csv')

# Preprocessing: Convert the sentiment into binary labels (positive=1, negative=0)
train_data['label'] = train_data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)
test_data['label'] = test_data['sentiment'].apply(lambda x: 1 if x == 'positive' else 0)

# Use only 10% of the data for quick training and testing
train_data_sample = train_data.sample(frac=0.1, random_state=42).reset_index(drop=True)
test_data_sample = test_data.sample(frac=0.1, random_state=42).reset_index(drop=True)

# Tokenization
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Define the dataset for Hugging Face Trainer
train_dataset = Dataset.from_pandas(train_data_sample[['review', 'label']])
test_dataset = Dataset.from_pandas(test_data_sample[['review', 'label']])

# Tokenize the reviews
def tokenize_function(examples):
    return tokenizer(examples['review'], padding='max_length', truncation=True)

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)






Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

## 5. Define, Train, and Evaluate DistilBERT Model


In [7]:
# Define model
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased-finetuned-sst-2-english', num_labels=2)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # number of training epochs
    per_device_train_batch_size=8,   # batch size for training
    per_device_eval_batch_size=8,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='./logs',            # directory for storing logs
    logging_steps=10,
    evaluation_strategy="epoch",     # evaluation is done at the end of each epoch
    save_strategy="epoch",           # save model at the end of each epoch
    disable_tqdm=False,
    load_best_model_at_end=True,     # load best model after training
)

# Trainer setup
trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=test_dataset,           # evaluation dataset
    compute_metrics=None,                # Leave metrics calculation separate
)

# Train the model
trainer.train()

# Evaluate the model
results = trainer.evaluate()

print(f"Evaluation Results: {results}")

# Predictions for classification report
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(axis=-1)

# Print classification report
print(classification_report(test_data_sample['label'], y_pred))



Epoch,Training Loss,Validation Loss
1,0.3465,0.256328
2,0.3031,0.304224
3,0.1062,0.366728




Evaluation Results: {'eval_loss': 0.25632813572883606, 'eval_runtime': 20.0697, 'eval_samples_per_second': 99.653, 'eval_steps_per_second': 6.228, 'epoch': 3.0}
              precision    recall  f1-score   support

           0       0.93      0.86      0.89       966
           1       0.88      0.94      0.91      1034

    accuracy                           0.90      2000
   macro avg       0.90      0.90      0.90      2000
weighted avg       0.90      0.90      0.90      2000

