In [1]:
!pip install datasets==1.18.4

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
from bs4 import BeautifulSoup
import nltk
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel, DistilBertForSequenceClassification, DistilBertConfig, Trainer, TrainingArguments
from transformers import get_cosine_schedule_with_warmup
from datasets import load_metric

from torch.optim import AdamW

import warnings
warnings.filterwarnings("ignore")




In [2]:
# Read the IMDB review CSV and, in the same chained expression, recode the
# textual sentiment column as integers (positive -> 1, negative -> 0).
df = pd.read_csv(
    '/kaggle/input/imdb-dataset-of-50k-movie-reviews/IMDB Dataset.csv'
).assign(sentiment=lambda frame: frame['sentiment'].map({'positive': 1, 'negative': 0}))

# Plain-text preview of the first five rows.
print(df.head())

                                              review  sentiment
0  One of the other reviewers has mentioned that ...          1
1  A wonderful little production. <br /><br />The...          1
2  I thought this was a wonderful way to spend ti...          1
3  Basically there's a family where a little boy ...          0
4  Petter Mattei's "Love in the Time of Money" is...          1


In [3]:
def missing_values_analysis(df):
    """Summarise missing values for every column that has at least one null.

    Args:
        df: pandas DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with two columns:
        'Total Missing Values' (null count) and 'Ratio' (percentage of rows
        that are null, rounded to 2 decimals). Both are sorted ascending.
        Columns without nulls are omitted.
    """
    # Collect only the columns that contain at least one null.
    columns_with_nulls = []
    for column_name in df.columns:
        if df[column_name].isnull().sum() > 0:
            columns_with_nulls.append(column_name)

    null_counts = df[columns_with_nulls].isnull().sum()
    totals = null_counts.sort_values(ascending=True)
    percentages = (null_counts / df.shape[0] * 100).sort_values(ascending=True)

    summary = pd.concat(
        [totals, np.round(percentages, 2)],
        axis=1,
        keys=['Total Missing Values', 'Ratio'],
    )
    return pd.DataFrame(summary)

In [None]:
import spacy
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup

# Load spaCy's small English pipeline; process() calls it to tokenize and lemmatize each review.
nlp = spacy.load("en_core_web_sm")

# NLTK's English stopword list as a set; process() checks each token's text against it.
# NOTE(review): no nltk.download call is visible here — presumably the "stopwords"
# corpus is already available in this environment; confirm.
stop_words = set(stopwords.words("english"))

def process(review):
    """Clean one raw review and return it as a space-joined string of lemmas.

    Steps, in order: strip HTML markup, replace every non-letter character
    with a space, lowercase, collapse whitespace, run the spaCy pipeline
    (`nlp`), drop tokens whose text is in `stop_words`, and join the
    remaining lemmas with single spaces.

    Args:
        review: raw review text, possibly containing HTML tags.

    Returns:
        The cleaned review as a single string.
    """
    # Remove HTML tags. separator=" " keeps neighbouring words apart when a tag
    # such as <br /> is the only thing between them ("great<br />movie").
    review = BeautifulSoup(review, "html.parser").get_text(separator=" ")

    # Remove non-alphabetical characters (e.g., numbers, punctuation)
    review = re.sub(r"[^a-zA-Z]", ' ', review)

    # Lowercase the text
    review = review.lower()

    # Collapse the runs of spaces left behind by the substitutions above
    review = re.sub(r'\s+', ' ', review).strip()

    # Tokenization and lemmatization with spaCy; stopwords are matched on the
    # token text, while the lemma is what gets kept.
    doc = nlp(review)
    lemmas = [token.lemma_ for token in doc if token.text not in stop_words]

    return " ".join(lemmas)

# Clean every review in order, printing a progress line every 2500 reviews.
train_data = []

# Iterate over the values directly instead of looking rows up by label, so the
# loop does not depend on the frame having a 0..n-1 index.
for review_count, raw_review in enumerate(df["review"], start=1):
    if review_count % 2500 == 0:
        print("Processed reviews:", review_count)

    train_data.append(process(raw_review))

Processed reviews: 2500
Processed reviews: 5000
Processed reviews: 7500


In [None]:
# First split: hold out 20% of the reviews (test_size=0.2); the rest is the training set.
# NOTE(review): the raw df['review'] strings are split here, not the cleaned
# `train_data` list built by the preprocessing loop — confirm that is intended.
train_texts, remaining_texts, train_labels, remaining_labels = train_test_split(
    df['review'].tolist(),
    df['sentiment'].tolist(),
    test_size=0.2,
    random_state=42
)

# Second split: of the held-out portion, 20% goes to the test set and the rest
# to validation.
# NOTE(review): this makes the test set a fifth of the hold-out rather than half —
# confirm these proportions are the intended ones.
val_texts, test_texts, val_labels, test_labels = train_test_split(
    remaining_texts,
    remaining_labels,
    test_size=0.2,
    random_state=42
)

# Report how many examples ended up in each split.
print(f"Training set size: {len(train_texts)}")
print(f"Validation set size: {len(val_texts)}")
print(f"Test set size: {len(test_texts)}")

In [None]:
from transformers import AutoModel, AutoTokenizer, Trainer, TrainingArguments
from datasets import load_dataset
import torch
import torch.nn as nn
import torch.nn.functional as F

# Load the pretrained encoder and its matching tokenizer from the same checkpoint name.
# `simcse_tokenizer` is also reused further down to tokenize text for the classifier.
simcse_model = AutoModel.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")
simcse_tokenizer = AutoTokenizer.from_pretrained("princeton-nlp/sup-simcse-bert-base-uncased")

class ContrastiveLoss(nn.Module):
    """In-batch contrastive loss over (anchor, positive) embedding pairs.

    Every anchor row is scored against every positive row by cosine
    similarity, the scores are divided by `temperature`, and cross-entropy
    treats the positive at the same row index as the correct match. The other
    rows in the batch therefore act as negatives.

    Args:
        temperature: divisor applied to the cosine similarities before the
            softmax; smaller values sharpen the distribution.
    """

    def __init__(self, temperature=0.05):
        super().__init__()
        self.temperature = temperature

    def forward(self, anchor, positive):
        """Return the mean loss as a scalar tensor.

        Args:
            anchor: 2-D tensor, one embedding per row.
            positive: 2-D tensor with the same shape as `anchor`; row i is the
                positive example for anchor row i.
        """
        # Unit-normalise so the matrix product below yields cosine similarities.
        anchor_unit = F.normalize(anchor, dim=-1)
        positive_unit = F.normalize(positive, dim=-1)

        # similarity[i, j] = cos(anchor_i, positive_j) / temperature
        similarity = anchor_unit @ positive_unit.t() / self.temperature

        # The matching positive for row i sits on the diagonal.
        targets = torch.arange(similarity.size(0), device=similarity.device)

        # cross_entropy applies a numerically stable log-softmax per row and averages.
        return F.cross_entropy(similarity, targets)

class SimCSETrainer(Trainer):
    """Trainer whose loss is a contrastive loss over pooled sentence embeddings.

    Rows of each batch are consumed in consecutive pairs: even-indexed rows
    are anchors, odd-indexed rows are their positives.

    NOTE(review): this assumes the dataset is arranged so that consecutive
    rows form a positive pair — nothing in this class enforces that; confirm
    against how the dataset is built.

    Args:
        contrastive_loss_fn: callable taking (anchor, positive) embedding
            tensors and returning a scalar loss. Keyword-only.
        *args, **kwargs: forwarded unchanged to `Trainer`.
    """

    def __init__(self, *args, contrastive_loss_fn, **kwargs):
        super().__init__(*args, **kwargs)
        self.contrastive_loss_fn = contrastive_loss_fn

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Encode the batch and apply the contrastive loss to its row pairs.

        `num_items_in_batch` is accepted (and unused) so the override stays
        callable by Trainer versions that pass it; it defaults to None so
        older callers are unaffected.
        """
        outputs = model(
            input_ids=inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
        )
        embeddings = outputs.pooler_output

        # Only complete pairs take part: with an odd number of rows the last
        # row has no partner and is dropped, so both slices have equal length.
        pair_count = embeddings.size(0) // 2
        if pair_count == 0:
            # No pair available (batch of one): contribute a zero loss that is
            # still attached to the graph so backward() works.
            loss = embeddings.sum() * 0.0
        else:
            anchor = embeddings[0:2 * pair_count:2]
            positive = embeddings[1:2 * pair_count:2]
            loss = self.contrastive_loss_fn(anchor, positive)

        return (loss, outputs) if return_outputs else loss

    def prediction_step(self, model, inputs, prediction_loss_only=False, ignore_keys=None):
        """Run the parent prediction step with any "labels" entry removed."""
        # Build a filtered copy rather than deleting from the caller's dict.
        unlabeled_inputs = {key: value for key, value in inputs.items() if key != "labels"}

        return super().prediction_step(model, unlabeled_inputs, prediction_loss_only, ignore_keys)

def _tokenize_for_simcse(batch):
    """Tokenize the 'text' field of a batch of rows with `simcse_tokenizer`.

    Pads every row to max_length (512) so all rows in the mapped dataset have
    the same length, regardless of which map batch they were tokenized in.
    """
    return simcse_tokenizer(batch['text'], padding="max_length", truncation=True, max_length=512)


# Load dataset
# NOTE(review): this 10% slice is tokenized but not referenced again in the
# visible cells — confirm whether it is still needed.
dataset = load_dataset("imdb", split="train[:10%]")
dataset = dataset.map(_tokenize_for_simcse, batched=True)

# Training configuration for the contrastive run: one epoch, batches of 16,
# with logging, evaluation and checkpointing all on a step-based strategy.
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=1,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=200,
    evaluation_strategy="steps",
    save_strategy="steps",
)

# First 90% of the IMDB train split for training, last 10% for evaluation.
train_data = load_dataset("imdb", split="train[:90%]")
eval_data = load_dataset("imdb", split="train[90%:]")

train_data = train_data.map(_tokenize_for_simcse, batched=True)
eval_data = eval_data.map(_tokenize_for_simcse, batched=True)

trainer = SimCSETrainer(
    model=simcse_model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=eval_data,
    contrastive_loss_fn=ContrastiveLoss(temperature=0.05),
)

trainer.train()


In [None]:
# Tokenize the training and validation reviews up front as padded PyTorch tensors.
# return_token_type_ids=False: the classifier is called without token_type_ids
# (predict_sentiment strips them before calling the model), so they are not
# produced here either.
train_encodings = simcse_tokenizer(
    train_texts, truncation=True, padding=True, max_length=512,
    return_tensors="pt", return_token_type_ids=False,
)
val_encodings = simcse_tokenizer(
    val_texts, truncation=True, padding=True, max_length=512,
    return_tensors="pt", return_token_type_ids=False,
)

In [None]:
class SimCSEMovieReviewDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with labels.

    Args:
        encodings: mapping from field name (e.g. "input_ids") to a per-example
            indexable container — either a tensor or a list of lists.
        labels: sequence of labels, one per example.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return a dict of tensors for example `idx`, with the label under 'labels'."""
        item = {}
        for key, values in self.encodings.items():
            row = values[idx]
            # Rows that are already tensors are copied with clone().detach();
            # wrapping them in torch.tensor() again triggers a copy-construct warning.
            item[key] = row.clone().detach() if torch.is_tensor(row) else torch.tensor(row)
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

# Wrap the pre-tokenized training and validation encodings, together with their
# labels, in dataset objects that the Trainer can index.
train_dataset = SimCSEMovieReviewDataset(train_encodings, train_labels)
val_dataset = SimCSEMovieReviewDataset(val_encodings, val_labels)

In [None]:
from transformers import Trainer, TrainingArguments, EarlyStoppingCallback, DataCollatorWithPadding

# Build the config first and hand it to the model, so the custom dropout
# values are applied instead of being defined and then left unused.
config = DistilBertConfig.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english",
    num_labels=2,
    dropout=0.2,
    attention_dropout=0.2
)

model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased-finetuned-sst-2-english", config=config
)

# Defined before the scheduler so the schedule length can be derived from the
# batch size, accumulation steps and epoch count configured here.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    save_strategy="steps",
    logging_dir="./logs",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    num_train_epochs=5,
    warmup_steps=200,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    eval_steps=500,
    logging_steps=500,
    learning_rate=1e-5,
    weight_decay=0.01,
)

# NOTE(review): this optimizer/scheduler pair is handed to the Trainer, so the
# learning_rate and warmup_steps in training_args presumably do not drive the
# schedule — confirm which values (2e-5 / 100 here, 1e-5 / 200 above) are intended.
optimizer = AdamW(
    model.parameters(),
    lr=2e-5
)

# Number of optimizer updates the run will perform: batches per epoch (rounded
# up), grouped by gradient accumulation, times the number of epochs.
batches_per_epoch = -(-len(train_texts) // training_args.train_batch_size)
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
total_training_steps = int(np.ceil(training_args.num_train_epochs * updates_per_epoch))

lr_scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_training_steps
)

# Metric objects used by compute_metrics.
accuracy_metric = load_metric("accuracy")
f1_metric = load_metric("f1")
precision_metric = load_metric("precision")
recall_metric = load_metric("recall")

def compute_metrics(eval_pred):
    """Turn (logits, labels) into accuracy and weighted F1 / precision / recall.

    Args:
        eval_pred: a pair unpacked as (logits, labels).

    Returns:
        Dict with keys "accuracy", "f1", "precision" and "recall".
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along the last axis.
    scored = {
        "predictions": np.argmax(logits, axis=-1),
        "references": labels,
    }
    return {
        "accuracy": accuracy_metric.compute(**scored)['accuracy'],
        "f1": f1_metric.compute(average="weighted", **scored)['f1'],
        "precision": precision_metric.compute(average="weighted", **scored)['precision'],
        "recall": recall_metric.compute(average="weighted", **scored)['recall'],
    }

# Early-stopping callback configured with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Assemble the classifier Trainer: model, arguments, train/validation datasets,
# the metric function, the hand-built optimizer/scheduler pair and the
# early-stopping callback.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    optimizers=(optimizer, lr_scheduler),
    callbacks=[early_stopping]
)

In [None]:
# Train the model
trainer.train()

In [None]:
def predict_sentiment(text):
    """Classify one piece of text as "Positive" or "Negative".

    The text is tokenized with `simcse_tokenizer`, moved to the model's
    device, and scored by the module-level `model`; class index 1 maps to
    "Positive", anything else to "Negative".

    Args:
        text: the string to classify.

    Returns:
        "Positive" or "Negative".
    """
    encoded = simcse_tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)

    # The model is called without token_type_ids, so drop that field while
    # moving the remaining tensors to the model's device.
    inputs = {
        key: value.to(model.device)
        for key, value in encoded.items()
        if key != 'token_type_ids'
    }

    # Score in eval mode (dropout off) and without autograd bookkeeping, then
    # put the model back in whatever mode it was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
    finally:
        model.train(was_training)

    prediction = torch.argmax(outputs.logits, dim=-1)
    return "Positive" if prediction.item() == 1 else "Negative"

In [None]:
# Demo: classify one informal sentence as written, then again after running it
# through process(), and print both predictions for comparison.
sentence = "So hot today =_=  don`t like it and i hate my new timetable, having such a bad week"
print("Original sentence: " + sentence)
print("Predict before: " + predict_sentiment(sentence))
# From here on `sentence` holds the cleaned text.
sentence = process(sentence)
print("After processing sentence: " + sentence)
print("Predict after: " + predict_sentiment(sentence))

In [None]:
# Accuracy on the held-out test split.
# The reviews are scored as raw text, matching how the classifier was trained
# (train_texts comes straight from df['review'], not from process()).
# Each integer label is mapped to its string form for an explicit comparison.
correct_predictions = sum(
    1 for text, label in zip(test_texts, test_labels)
    if predict_sentiment(text) == ("Positive" if label == 1 else "Negative")
)

accuracy = correct_predictions / len(test_labels)
print(accuracy)