In [1]:
!pip install transformers datasets scikit-learn pandas tqdm




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\IKBAR\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch
import time
from tqdm import tqdm

class SequenceClassificationDataset(Dataset):
    """Map-style dataset pairing pre-tokenized inputs with their labels.

    Args:
        inputs: Mapping that provides indexable ``'input_ids'`` and
            ``'attention_mask'`` entries.
        labels: Indexable sequence of labels, one per example.
    """

    # Fields copied as-is from ``inputs`` into every sample.
    _ENCODING_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, inputs, labels):
        self.inputs = inputs
        self.labels = labels

    def __len__(self):
        """Return the number of examples, taken from the ``'input_ids'`` entry."""
        input_ids = self.inputs['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return the sample at ``idx`` as a dict with the label as a long tensor."""
        sample = {field: self.inputs[field][idx] for field in self._ENCODING_FIELDS}
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

class DistilBertFineTuning:
    """Fine-tune a DistilBERT sequence classifier on CSV train/validation splits.

    Construction performs all set-up: it loads the tokenizer, reads both CSV
    files, tokenizes them, loads the pretrained model with ``num_labels`` set
    to the number of distinct values in the training label column, builds the
    optimizer, and creates the train/validation data loaders.

    Args:
        dataset_path: Directory containing the CSV files.
        train_file: File name of the training CSV inside ``dataset_path``.
        validation_file: File name of the validation CSV inside ``dataset_path``.
        feature_col: Name of the column holding the input text.
        label_col: Name of the column holding the labels.
        model_name: Identifier passed to the ``from_pretrained`` loaders.
        batch_size: Batch size used by both data loaders.
        learning_rate: Learning rate handed to the optimizer.
        num_epochs: Number of passes over the training loader in ``train``.
        max_len: ``max_length`` passed to the tokenizer.
        optimizer: ``'Adam'`` or ``'AdamW'``.
        device: Device string passed to ``torch.device``.

    Raises:
        ValueError: If ``optimizer`` is neither ``'Adam'`` nor ``'AdamW'``.
    """

    def __init__(self, dataset_path, train_file, validation_file, feature_col, label_col, model_name, batch_size, learning_rate, num_epochs, max_len, optimizer='AdamW', device='cpu'):
        self.dataset_path = dataset_path
        self.train_file = train_file
        self.validation_file = validation_file
        self.feature_col = feature_col
        self.label_col = label_col
        self.model_name = model_name
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.num_epochs = num_epochs
        self.max_len = max_len
        self.optimizer_type = optimizer
        self.device = torch.device(device)

        self.tokenizer = DistilBertTokenizer.from_pretrained(self.model_name)

        self.train_df = pd.read_csv(os.path.join(self.dataset_path, self.train_file))
        self.validation_df = pd.read_csv(os.path.join(self.dataset_path, self.validation_file))

        # The classification head is sized from the training labels only.
        # NOTE(review): presumably labels are integers 0..num_labels-1 -- confirm.
        self.num_labels = len(self.train_df[self.label_col].unique())

        self.tokenized_train = self.tokenize_dataset(self.train_df, self.feature_col, self.label_col)
        self.tokenized_validation = self.tokenize_dataset(self.validation_df, self.feature_col, self.label_col)

        self.model_config = DistilBertConfig.from_pretrained(self.model_name, num_labels=self.num_labels)
        self.model = DistilBertForSequenceClassification.from_pretrained(self.model_name, config=self.model_config)
        self.model.to(self.device)

        if self.optimizer_type == 'Adam':
            self.optimizer = torch.optim.Adam(self.model.parameters(), lr=self.learning_rate)
        elif self.optimizer_type == 'AdamW':
            self.optimizer = AdamW(self.model.parameters(), lr=self.learning_rate)
        else:
            raise ValueError("Unsupported optimizer type. Use 'Adam' or 'AdamW'.")

        self.train_dataloader = self.create_dataloader(self.tokenized_train)
        self.validation_dataloader = self.create_dataloader(self.tokenized_validation, shuffle=False)

    def tokenize_dataset(self, df, feature_col, label_col):
        """Tokenize one dataframe.

        Args:
            df: DataFrame holding the examples.
            feature_col: Column of ``df`` containing the text to tokenize.
            label_col: Column of ``df`` containing the labels.

        Returns:
            tuple: ``(tokens, labels)`` where ``tokens`` is the tokenizer output
            (requested with ``return_tensors='pt'``) and ``labels`` is the
            label column converted to a list.
        """
        tokens = self.tokenizer(
            list(df[feature_col]),
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return tokens, list(df[label_col])

    def create_dataloader(self, tokenized_dataset, shuffle=True):
        """Wrap a ``(tokens, labels)`` pair in a ``DataLoader``.

        Args:
            tokenized_dataset: Pair as returned by ``tokenize_dataset``.
            shuffle: Whether the loader shuffles examples.

        Returns:
            DataLoader: Loader yielding batches of size ``self.batch_size``.
        """
        dataset = SequenceClassificationDataset(tokenized_dataset[0], tokenized_dataset[1])
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    @staticmethod
    def _mean(values):
        """Return the arithmetic mean of ``values``, or NaN when it is empty."""
        return sum(values) / len(values) if values else float('nan')

    @staticmethod
    def _compute_metrics(all_labels, all_predictions):
        """Return ``(accuracy, precision, recall, f1)`` with weighted averaging."""
        accuracy = accuracy_score(all_labels, all_predictions)
        precision = precision_score(all_labels, all_predictions, average='weighted')
        recall = recall_score(all_labels, all_predictions, average='weighted')
        f1 = f1_score(all_labels, all_predictions, average='weighted')
        return accuracy, precision, recall, f1

    def _run_inference(self, dataloader, compute_loss=False):
        """Run the model once over ``dataloader`` in eval mode without gradients.

        Args:
            dataloader: Iterable of batches, each a dict with ``'input_ids'``,
                ``'attention_mask'`` and ``'labels'`` tensors.
            compute_loss: When true, labels are also passed to the model and
                the per-batch loss values are collected.

        Returns:
            tuple: ``(batch_losses, all_labels, all_predictions)``;
            ``batch_losses`` is empty when ``compute_loss`` is false.
        """
        self.model.eval()
        batch_losses, all_labels, all_predictions = [], [], []

        # No gradients are needed here; disabling them avoids building an
        # autograd graph for every evaluation batch.
        with torch.no_grad():
            for batch in dataloader:
                inputs = {key: value.to(self.device) for key, value in batch.items()}
                labels = inputs["labels"]
                model_kwargs = {
                    "input_ids": inputs["input_ids"],
                    "attention_mask": inputs["attention_mask"],
                }
                if compute_loss:
                    model_kwargs["labels"] = labels
                outputs = self.model(**model_kwargs)

                if compute_loss:
                    batch_losses.append(outputs.loss.item())
                _, predicted = torch.max(outputs.logits, 1)

                all_labels.extend(labels.cpu().numpy())
                all_predictions.extend(predicted.cpu().numpy())

        return batch_losses, all_labels, all_predictions

    def evaluate_model(self, dataloader):
        """Evaluate the model on ``dataloader``.

        Puts the model in eval mode and leaves it there.

        Returns:
            tuple: ``(accuracy, precision, recall, f1)``; precision, recall and
            F1 use ``average='weighted'``.
        """
        _, all_labels, all_predictions = self._run_inference(dataloader)
        return self._compute_metrics(all_labels, all_predictions)

    def train(self):
        """Train for ``self.num_epochs`` epochs, validating after each one.

        After every epoch a summary line is printed with the mean per-batch
        training loss, the mean per-batch validation loss, and the validation
        accuracy and weighted F1.
        """
        for epoch in range(self.num_epochs):
            # Validation switches the model to eval mode, so re-enable
            # training mode at the start of every epoch.
            self.model.train()
            train_losses = []

            for batch in tqdm(self.train_dataloader, desc=f'Epoch {epoch + 1}/{self.num_epochs}'):
                inputs = {key: value.to(self.device) for key, value in batch.items()}
                outputs = self.model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=inputs["labels"])
                loss = outputs.loss
                train_losses.append(loss.item())

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            # One gradient-free pass yields both the validation loss and the
            # predictions needed for the metrics.
            validation_losses, val_labels, val_predictions = self._run_inference(
                self.validation_dataloader, compute_loss=True
            )
            val_acc, _, _, val_f1 = self._compute_metrics(val_labels, val_predictions)

            print(f"Epoch {epoch+1}/{self.num_epochs} - Train Loss: {self._mean(train_losses):.4f} - Val Loss: {self._mean(validation_losses):.4f} - Val Acc: {val_acc:.4f} - Val F1: {val_f1:.4f}")

    def save_model(self, directory):
        """Save the model and tokenizer to ``directory``, creating it if needed."""
        os.makedirs(directory, exist_ok=True)
        self.model.save_pretrained(directory)
        self.tokenizer.save_pretrained(directory)
        
# Start the wall-clock timer; it covers set-up, training and saving.
start_time = time.time()

# Hyperparameters
num_epochs = 3
batch_size = 6
max_len = 512
learning_rate = 2e-5
optimizer = 'AdamW'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Pretrained checkpoint and the short tag used in the output directory name
model_name = 'distilbert-base-uncased'
model = 'distilbert'

# Data locations and column names
absolute_path = "D:/Berkas/Code/CryptoNew/"
dataset_path = ''.join((absolute_path, "Datasets/"))
train_file = 'train_set.csv'
validation_file = 'validation_set.csv'
feature_col = 'text'
label_col = 'sentiment_numerical'

# Output directory name encodes the run configuration.
trained_model = ''.join([
    model,
    '_optimizer_', optimizer,
    '_lr_', str(learning_rate),
    '_epochs_', str(num_epochs),
    '_bs_', str(batch_size),
    '_maxlen_', str(max_len),
])

classifier = DistilBertFineTuning(
    dataset_path=dataset_path,
    train_file=train_file,
    validation_file=validation_file,
    feature_col=feature_col,
    label_col=label_col,
    model_name=model_name,
    batch_size=batch_size,
    learning_rate=learning_rate,
    num_epochs=num_epochs,
    max_len=max_len,
    optimizer=optimizer,
    device=device,
)

classifier.train()
classifier.save_model(''.join((absolute_path, 'TrainedModels/', trained_model)))

elapsed_seconds = time.time() - start_time
print("Training time: {:.2f} seconds".format(elapsed_seconds))

  from .autonotebook import tqdm as notebook_tqdm
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1/3: 100%|██████████| 534/534 [00:44<00:00, 11.88it/s]


Epoch 1/3 - Train Loss: 0.9316 - Val Loss: 0.7534 - Val Acc: 0.6900 - Val F1: 0.6882


Epoch 2/3: 100%|██████████| 534/534 [00:43<00:00, 12.20it/s]


Epoch 2/3 - Train Loss: 0.4835 - Val Loss: 0.5473 - Val Acc: 0.8100 - Val F1: 0.8090


Epoch 3/3: 100%|██████████| 534/534 [00:45<00:00, 11.82it/s]


Epoch 3/3 - Train Loss: 0.2271 - Val Loss: 0.5052 - Val Acc: 0.8300 - Val F1: 0.8301
Training time: 152.74 seconds


In [3]:
import os
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Use a local path (adjust for your machine)
absolute_path = "D:/Berkas/Code/CryptoNew/"
test_file = 'test_set_distil_adam.csv'
trained_model_name = 'distilbert_optimizer_AdamW_lr_2e-05_epochs_3_bs_6_maxlen_512'

# Number of texts sent through the model at once. Batching keeps memory use
# bounded instead of pushing the whole test set through in a single forward pass.
inference_batch_size = 32

# Read the test data
test_df = pd.read_csv(os.path.join(absolute_path, 'Datasets', test_file))

# Load the fine-tuned model and its tokenizer
model_path = os.path.join(absolute_path, 'TrainedModels', trained_model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# Tokenize and predict one mini-batch at a time
test_texts = list(test_df['text'])
batch_predictions = []

model.eval()
with torch.no_grad():
    for start in range(0, len(test_texts), inference_batch_size):
        batch_texts = test_texts[start:start + inference_batch_size]
        tokenized_batch = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(device)
        logits = model(**tokenized_batch).logits
        _, batch_labels = torch.max(logits, dim=1)
        # Move each batch's predictions to the CPU straight away so device
        # memory does not accumulate across batches.
        batch_predictions.append(batch_labels.cpu())

# An empty test set yields an empty prediction tensor rather than an error.
if batch_predictions:
    predicted_labels = torch.cat(batch_predictions)
else:
    predicted_labels = torch.empty(0, dtype=torch.long)

# Save the predictions
test_df['distilbert_adamw_ft_prediction'] = predicted_labels.cpu().numpy()
test_df.to_csv(os.path.join(absolute_path, 'Datasets', 'test_set_distil_adamw.csv'), index=False)
print("Prediksi selesai disimpan ke test_set_distil_adamw.csv")


Prediksi selesai disimpan ke test_set_distil_adamw.csv


In [4]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Location of the CSV holding both the true labels and the model predictions
file_path = "D:/Berkas/Code/CryptoNew/Datasets/test_set_distil_adamw.csv"

# Load the predictions file
df = pd.read_csv(file_path)

# Ground-truth labels and predicted labels
y_true, y_pred = df['sentiment_numerical'], df['distilbert_adamw_ft_prediction']

# Compute the metrics; precision, recall and F1 use weighted averaging
accuracy = accuracy_score(y_true, y_pred)
precision, recall, f1 = (
    metric(y_true, y_pred, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the results, one metric per line
print(
    f"Accuracy :  {accuracy:.4f}",
    f"Precision:  {precision:.4f}",
    f"Recall   :  {recall:.4f}",
    f"F1 Score :  {f1:.4f}",
    sep="\n",
)


Accuracy :  0.8530
Precision:  0.8534
Recall   :  0.8530
F1 Score :  0.8530
