<a href="https://www.kaggle.com/code/hungdongne/movie-review-sentiment-analysis-using-distilbert?scriptVersionId=226322372" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
!pip install datasets==1.18.4

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score
from bs4 import BeautifulSoup
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from nltk.corpus import stopwords

import warnings
warnings.filterwarnings("ignore")

Collecting datasets==1.18.4
  Downloading datasets-1.18.4-py3-none-any.whl.metadata (22 kB)
Collecting responses<0.19 (from datasets==1.18.4)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
Downloading datasets-1.18.4-py3-none-any.whl (312 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m312.1/312.1 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, datasets
  Attempting uninstall: datasets
    Found existing installation: datasets 3.0.1
    Uninstalling datasets-3.0.1:
      Successfully uninstalled datasets-3.0.1
Successfully installed datasets-1.18.4 responses-0.18.0


In [2]:
df = pd.read_csv('/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv')


df['sentiment'] = df['sentiment'].map({'positive': 1, 'negative': 0})

print(df.head(5))

def missing_values_analysis(df):
    na_columns_ = [col for col in df.columns if df[col].isnull().sum() > 0]
    n_miss = df[na_columns_].isnull().sum().sort_values(ascending=True)
    ratio_ = (df[na_columns_].isnull().sum() / df.shape[0] * 100).sort_values(ascending=True)
    missing_df = pd.concat([n_miss, np.round(ratio_, 2)], axis=1, keys=['Total Missing Values', 'Ratio'])
    missing_df = pd.DataFrame(missing_df)
    return missing_df

def check_df(df, head=5):
    print("--------------------- Shape --------------------")
    print(df.shape)
    print("-------------------- Types ---------------------")
    print(df.dtypes)
    print("----------------- NaN Analysis -----------------")
    print(missing_values_analysis(df))
    print("--------------------- Head ---------------------")
    print(df.head())

check_df(df)

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1
--------------------- Shape --------------------
(50000, 2)
-------------------- Types ---------------------
review       object
sentiment     int64
dtype: object
----------------- NaN Analysis -----------------
Empty DataFrame
Columns: [Total Missing Values, Ratio]
Index: []
--------------------- Head ---------------------
                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family wher

In [3]:
import spacy
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup

nlp = spacy.load("en_core_web_sm")

stop_words = set(stopwords.words("english"))

# Cleaning word
def process(review):
    # Remove HTML tags
    review = BeautifulSoup(review, "html.parser").get_text()
    
    # Remove non-alphabetical characters (e.g., numbers, punctuation)
    review = re.sub(r"[^a-zA-Z]", ' ', review)
    
    # Lowercase the text
    review = review.lower()
    
    # Tokenization and Lemmatization with spaCy
    doc = nlp(review)
    review = [token.lemma_ for token in doc if token.text not in stop_words]
    
    return " ".join(review)

# Example usage:
train_data = []

for i in range(len(df["review"])):
    if (i+1) % 2500 == 0:
        print("Processed reviews:", i+1)
    
    # train_data.append(process(df["review"][i]))
    train_data.append(df["review"][i])

Processed reviews: 2500
Processed reviews: 5000
Processed reviews: 7500
Processed reviews: 10000
Processed reviews: 12500
Processed reviews: 15000
Processed reviews: 17500
Processed reviews: 20000
Processed reviews: 22500
Processed reviews: 25000
Processed reviews: 27500
Processed reviews: 30000
Processed reviews: 32500
Processed reviews: 35000
Processed reviews: 37500
Processed reviews: 40000
Processed reviews: 42500
Processed reviews: 45000
Processed reviews: 47500
Processed reviews: 50000


In [4]:
from sklearn.model_selection import train_test_split

# Split into train_data, validation_data, test_data
train_texts, remaining_texts, train_labels, remaining_labels = train_test_split(
    df['review'].tolist(),
    df['sentiment'].tolist(),
    test_size=0.2,
    random_state=42
)

val_texts, test_texts, val_labels, test_labels = train_test_split(
    remaining_texts,
    remaining_labels,
    test_size=0.2,
    random_state=42
)

print(f"Training set size: {len(train_texts)}")
print(f"Validation set size: {len(val_texts)}")
print(f"Test set size: {len(test_texts)}")

Training set size: 40000
Validation set size: 8000
Test set size: 2000


In [5]:
import torch
from transformers import DistilBertForSequenceClassification, AutoTokenizer

# Specify the pre-trained BERT model you want to use
model_name = 'distilbert-base-uncased-finetuned-sst-2-english'

# Initialize the model
model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Enable using GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Tokenize training and validation sets in batches
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=256, return_tensors="pt")
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=256, return_tensors="pt")

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

In [6]:
# Create Dataset
class MovieReviewDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = MovieReviewDataset(train_encodings, train_labels)
val_dataset = MovieReviewDataset(val_encodings, val_labels)

In [7]:
!pip install evaluate
from transformers import BertForSequenceClassification, Trainer, TrainingArguments, TrainerCallback, EarlyStoppingCallback
import evaluate
import numpy as np
from transformers import AdamW, get_linear_schedule_with_warmup

model = DistilBertForSequenceClassification.from_pretrained(model_name, num_labels=2)

# Traing parameter
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    eval_steps=500,
    logging_steps=500,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=12,
    warmup_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    logging_dir="./logs",  
    no_cuda=False
)

accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    f1 = f1_metric.compute(predictions=predictions, references=labels, average="weighted")
    precision = precision_metric.compute(predictions=predictions, references=labels, average="weighted")
    recall = recall_metric.compute(predictions=predictions, references=labels, average="weighted")
    return {
        "accuracy": accuracy['accuracy'],
        "f1": f1['f1'],
        "precision": precision['precision'],
        "recall": recall['recall']
    }

optimizer = AdamW(model.parameters(), lr=2e-5)

lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=len(train_dataset) * training_args.num_train_epochs
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[EarlyStoppingCallback(3)]
)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Collecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m14.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: datasets, evaluate
  Attempting uninstall: datasets
    Found existing installation: datasets 1.18.4
    Uninstalling datasets-1.18.4:
      Successfully uninstalled datasets-1.18.4
Successfully installed datasets-3.3.2 evaluate-0.4.3


Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.79k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.56k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.38k [00:00<?, ?B/s]

In [8]:
import wandb
wandb.init(mode="disabled")

# Starting train data
trainer.train()

# Evaluate model
eval_results = trainer.evaluate()

print(eval_results)



Step,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
500,0.2996,0.27806,0.89575,0.895752,0.895783,0.89575
1000,0.2623,0.276319,0.89775,0.897693,0.899124,0.89775
1500,0.2519,0.257854,0.905,0.904924,0.905893,0.905
2000,0.2318,0.250644,0.9045,0.904448,0.905051,0.9045
2500,0.2382,0.2615,0.903125,0.903035,0.905245,0.903125
3000,0.1658,0.288587,0.9095,0.909469,0.910466,0.9095
3500,0.17,0.333913,0.906875,0.906697,0.909316,0.906875
4000,0.1713,0.305141,0.911625,0.91162,0.911954,0.911625
4500,0.1691,0.285244,0.903125,0.902999,0.905943,0.903125
5000,0.1825,0.244275,0.91475,0.914733,0.9149,0.91475


{'eval_loss': 0.24427486956119537, 'eval_accuracy': 0.91475, 'eval_f1': 0.9147333699264145, 'eval_precision': 0.9148998328920743, 'eval_recall': 0.91475, 'eval_runtime': 42.0576, 'eval_samples_per_second': 190.216, 'eval_steps_per_second': 11.888, 'epoch': 2.6}


In [9]:
def predict_sentiment(text):
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=128)

    for key in inputs:
        inputs[key] = inputs[key].to(model.device)

    outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=-1)
    return "Positive" if prediction.item() == 1 else "Negative"

In [10]:
sentence = "So hot today =_=  don`t like it and i hate my new timetable, having such a bad week"
predict_sentiment(sentence)

'Negative'