# Data Preprocessing

In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv("/kaggle/input/twitter-and-reddit-sentimental-analysis-dataset/Reddit_Data.csv")
print("Shape:", df.shape)
print("Info:", df.info())
print("Sample:", df.sample()['clean_comment'].values)

In [None]:
df.isnull().sum()

In [None]:
df[df['clean_comment'].isna()]

In [None]:
df[df['clean_comment'].isna()]['category'].value_counts()

In [None]:
df.dropna(inplace=True)

In [None]:
df.duplicated().sum()

In [None]:
df[df.duplicated()]

In [None]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

In [None]:
df[(df['clean_comment'].str.strip() == '')]

In [None]:
df = df[~(df['clean_comment'].str.strip() == '')]
df[(df['clean_comment'].str.strip() == '')]

In [None]:
df[df['clean_comment'].apply(lambda x: x.endswith(' ') or x.startswith(' '))]

In [None]:
df['clean_comment'] = df['clean_comment'].str.strip()

df['clean_comment'].apply(lambda x: x.endswith(' ') or x.startswith(' ')).sum()

In [None]:
url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
comments_with_urls = df[df['clean_comment'].str.contains(url_pattern, regex=True)]
comments_with_urls

In [None]:
comments_with_newline = df[df['clean_comment'].str.contains('\n')]
comments_with_newline

In [None]:
df['clean_comment'] = df['clean_comment'].str.replace('\n', ' ', regex=True)

comments_with_newline_remaining = df[df['clean_comment'].str.contains('\n')]
comments_with_newline_remaining

In [None]:
df = df.rename(columns={"clean_comment": "text", "category": "labels"})
df = df[df['text'].apply(lambda x: isinstance(x, str))]
df.head(5)

In [None]:
label_map = {-1: 0, 0: 1, 1: 2}
df['labels'] = df['labels'].map(label_map)

print(df['labels'].value_counts())

In [None]:
class_counts = df['labels'].value_counts().sort_index()  # đảm bảo đúng thứ tự: 0,1,2
print(class_counts)

# Dataset

In [None]:
from datasets import Dataset
dataset = Dataset.from_pandas(df[['text', 'labels']])
dataset = dataset.train_test_split(test_size=0.2)

In [None]:
print(dataset)
print(dataset['train'][0])

In [None]:
print(type(dataset['train'][0]['text']))

# Tokenization & Model

In [None]:
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print("Using device:", device)
print("GPU count:", torch.cuda.device_count())

In [None]:
!pip install -qU transformers[torch] datasets peft accelerate scikit-learn

In [None]:
from transformers import AutoTokenizer
from datasets import DatasetDict

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize(example):
    return tokenizer(text=example["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = DatasetDict({
    'train': dataset['train'].map(tokenize, batched=True),
    'test': dataset['test'].map(tokenize, batched=True)
})

In [None]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)

    precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average='weighted')
    acc = accuracy_score(labels, preds)

    return {
        'accuracy': acc,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

In [None]:
from transformers import AutoModelForSequenceClassification
from peft import get_peft_model, LoraConfig, TaskType

# Load mô hình gốc
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)

# Cấu hình LoRA
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    task_type=TaskType.SEQ_CLS,  # Classification
    lora_dropout=0.1,
    bias="none",
    target_modules=["query", "value"]  # apply LoRA to Q, V projector
)

# Gắn mô hình PEFT
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

# Training

In [None]:
from transformers import TrainingArguments, Trainer, TrainerCallback, TrainerControl
from torch.nn import CrossEntropyLoss

class EarlyStoppingCallback(TrainerCallback):
    def __init__(self, patience=5):
        self.patience = patience
        self.best_loss = float('inf')
        self.counter = 0

    def on_evaluate(self, args, state, control, metrics, **kwargs):
        val_loss = metrics.get("eval_loss")
        if val_loss is None:
            return control
        if val_loss < self.best_loss:
            self.best_loss = val_loss
            self.counter = 0
        else:
            self.counter += 1
            if self.counter >= self.patience:
                control.should_training_stop = True
        return control


training_args = TrainingArguments(
    output_dir="./lora-bert-results",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=40,
    learning_rate=2e-4,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    report_to="none",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

# Custom trainer with weighted loss for imbalance data
class WeightedTrainer(Trainer):
    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=16):
        labels = inputs.pop("labels")
        # forward pass
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # compute custom loss for 3 labels with different weights
        reduction = "sum" if num_items_in_batch is not None else "mean"
        device = next(model.parameters()).device
        loss_fct = CrossEntropyLoss(weight=self.class_weights.to(device), reduction=reduction)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        if num_items_in_batch is not None:
            loss = loss / num_items_in_batch
    
        return loss if not return_outputs else (loss, outputs)


# Weighted class for imbalance data
counts = torch.tensor(class_counts.values, dtype=torch.float)
weights = counts.sum() / counts
weights = weights / weights.sum() * 3  # Normalize

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    compute_metrics=compute_metrics,
    class_weights=weights,
    callbacks=[EarlyStoppingCallback()]
)

In [None]:
trainer.train()

In [None]:
trainer.evaluate()

In [None]:
# # 10 more epochs
# trainer.args.num_train_epochs = 40 + 10

# # Resume training
# trainer.train(resume_from_checkpoint=True)
# trainer.evaluate()

In [None]:
print(trainer.state.best_model_checkpoint)
print(trainer.state.best_metric)

In [None]:
print(next(model.parameters()).device)

In [None]:
model = trainer.model
model = model.merge_and_unload() # Merge LoRA weights into base model
model

In [None]:
model.save_pretrained("./lora_bert_sentiment")
tokenizer.save_pretrained("./lora_bert_sentiment")

In [33]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

id2label = {
    0: "negative",
    1: "neutral",
    2: "positive"
}
label2id = {v: k for k, v in id2label.items()}

# Load model from saved checkpoint ("./lora_bert_sentiment")
model = AutoModelForSequenceClassification.from_pretrained("/kaggle/input/lora-bert-sentiment/transformers/default/1/lora_bert_sentiment",
                                                           num_labels=3,
                                                           id2label=id2label,
                                                           label2id=label2id)
tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/lora-bert-sentiment/transformers/default/1/lora_bert_sentiment")

# Create pipeline for inference
pipe = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=0  # nếu có GPU
)

print(pipe("I really love this product!"))
print(pipe("I really hate this product!"))
print(pipe("Today is Tuesday"))

Device set to use cuda:0


[{'label': 'positive', 'score': 0.9990984201431274}]
[{'label': 'negative', 'score': 0.998960018157959}]
[{'label': 'neutral', 'score': 0.9984028935432434}]


In [35]:
import time
import numpy as np

text = "I really love this movie! It was fantastic."

# Số lần test
num_runs = 100

# Ghi thời gian chạy
times = []
for i in range(num_runs):
    start = time.time()
    _ = pipe(text)
    end = time.time()
    elapsed = end - start
    times.append(elapsed)

# Tính toán mean và std
mean_time = np.mean(times)
std_time = np.std(times)

print("\n--- Runtime ---")
print(f"Number of runs: {num_runs}")
print(f"mean: {mean_time:.4f}s")
print(f"std: {std_time:.4f}s")


--- Runtime ---
Number of runs: 100
mean: 0.0126s
std: 0.0510s


# Model for production

In [None]:
!pip install optimum[onnxruntime-gpu] transformers[torch] accelerate -U

In [None]:
# Download and install CUDA 11.8
!wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin
!sudo mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600
!wget https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
!sudo dpkg -i cuda-repo-ubuntu2004-11-8-local_11.8.0-520.61.05-1_amd64.deb
!sudo cp /var/cuda-repo-ubuntu2004-11-8-local/cuda-*-keyring.gpg /usr/share/keyrings/
!sudo apt-get update
!sudo apt-get -y install cuda-toolkit-11-8

In [3]:
!python3 -V

Python 3.11.13


In [None]:
!pip install \
    nvidia-cublas-cu11==11.11.3.6 \
    nvidia-cuda-cupti-cu11==11.8.87 \
    nvidia-cuda-nvrtc-cu11==11.8.89 \
    nvidia-cuda-runtime-cu11==11.8.89 \
    nvidia-cudnn-cu11==8.7.0.84 \
    nvidia-cufft-cu11==10.9.0.58 \
    nvidia-curand-cu11==10.3.0.86 \
    nvidia-cusolver-cu11==11.4.1.48 \
    nvidia-cusparse-cu11==11.7.5.86 \
    nvidia-nccl-cu11==2.20.5 \
    nvidia-nvtx-cu11==11.8.86

In [None]:
!ls -l /usr/local/

In [None]:
!ls -l /usr/local | grep cuda

In [None]:
!cat /etc/os-release

In [8]:
import onnxruntime as ort
print("Available providers:", ort.get_available_providers())

Available providers: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']


In [27]:
from transformers import AutoTokenizer
from optimum.onnxruntime import ORTModelForSequenceClassification

tokenizer = AutoTokenizer.from_pretrained("/kaggle/input/lora-bert-sentiment/transformers/default/1/lora_bert_sentiment")
ort_model = ORTModelForSequenceClassification.from_pretrained("/kaggle/input/lora-bert-sentiment/transformers/default/1/lora_bert_sentiment",
                                                          export=True,
                                                          provider="CUDAExecutionProvider"
                                                         )

[0;93m2025-11-01 02:27:48.783558681 [W:onnxruntime:, transformer_memcpy.cc:111 ApplyImpl] 12 Memcpy nodes are added to the graph main_graph for CUDAExecutionProvider. It might have negative impact on performance (including unable to run CUDA graph). Set session_options.log_severity_level=1 to see the detail logs before this message.[m
[0;93m2025-11-01 02:27:48.787958745 [W:onnxruntime:, session_state.cc:1316 VerifyEachNodeIsAssignedToAnEp] Some nodes were not assigned to the preferred execution providers which may or may not have an negative impact on performance. e.g. ORT explicitly assigns shape related ops to CPU to improve perf.[m
[0;93m2025-11-01 02:27:48.787975950 [W:onnxruntime:, session_state.cc:1318 VerifyEachNodeIsAssignedToAnEp] Rerunning with verbose output on a non-minimal build will show node assignments.[m


In [28]:
ort_model.provider

'CUDAExecutionProvider'

In [29]:
from transformers import pipeline

ort_model.config.id2label = {0: "negative", 1: "neutral", 2: "positive"}
ort_model.config.label2id = {"negative": 0, "neutral": 1, "positive": 2}

onnx_classifier = pipeline("text-classification", model=ort_model, tokenizer=tokenizer)


text = "I really love this product!"
print(onnx_classifier(text))

Device set to use cuda:0


[{'label': 'positive', 'score': 0.9990984201431274}]


In [36]:
import time
import numpy as np

text = "I really love this movie! It was fantastic."

# Số lần test
num_runs = 100

# Ghi thời gian chạy
times = []
for i in range(num_runs):
    start = time.time()
    _ = onnx_classifier(text)
    end = time.time()
    elapsed = end - start
    times.append(elapsed)

# Tính toán mean và std
mean_time = np.mean(times)
std_time = np.std(times)

print("\n--- Runtime ---")
print(f"Number of runs: {num_runs}")
print(f"mean: {mean_time:.4f}s")
print(f"std: {std_time:.4f}s")


--- Runtime ---
Number of runs: 100
mean: 0.0051s
std: 0.0003s


In [32]:
ort_model.save_pretrained("./onnx_lora_bert")
tokenizer.save_pretrained("./onnx_lora_bert")

('./onnx_lora_bert/tokenizer_config.json',
 './onnx_lora_bert/special_tokens_map.json',
 './onnx_lora_bert/vocab.txt',
 './onnx_lora_bert/added_tokens.json',
 './onnx_lora_bert/tokenizer.json')