In [1]:
!pip install transformers datasets scikit-learn pandas tqdm




[notice] A new release of pip is available: 25.0.1 -> 25.1.1
[notice] To update, run: C:\Users\IKBAR\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


In [2]:
import os
import pandas as pd
import torch
import time
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import torch.nn as nn

class SequenceClassificationDataset(Dataset):
    """Map-style dataset pairing pre-tokenized inputs with class labels.

    Args:
        inputs: Mapping indexed by ``'input_ids'`` and ``'attention_mask'``;
            each entry is indexable by example position.
        labels: Indexable sequence holding one label per example.
    """

    def __init__(self, inputs, labels):
        self.inputs, self.labels = inputs, labels

    def __len__(self):
        # The example count is taken from the 'input_ids' entry.
        input_ids = self.inputs['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict with input ids, mask and label."""
        sample = {key: self.inputs[key][idx] for key in ('input_ids', 'attention_mask')}
        # Labels are converted to int64 tensors on access.
        sample['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

class LlamaClassifier(nn.Module):
    """Sequence classifier: a transformer backbone followed by a linear head.

    The hidden state of the last attended token of every sequence is fed to
    a linear layer that produces one logit per class.

    Args:
        base_model: Backbone module. It must expose ``config.hidden_size`` and
            return an output object carrying either ``last_hidden_state`` or
            ``hidden_states`` (as causal-LM wrappers do when hidden states are
            requested).
        num_labels: Number of target classes.
    """

    def __init__(self, base_model, num_labels):
        super().__init__()
        self.base_model = base_model
        self.classifier = nn.Linear(base_model.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the backbone and classify each sequence.

        Args:
            input_ids: Token ids, indexed as (batch, sequence).
            attention_mask: Mask with 1 for real tokens and 0 for padding,
                same layout as ``input_ids``.
            labels: Optional class indices; when given, a cross-entropy loss
                is computed.

        Returns:
            Dict with ``'loss'`` (``None`` when ``labels`` is ``None``) and
            ``'logits'``.
        """
        # Causal-LM outputs have no `last_hidden_state`; asking for hidden
        # states makes the final layer available as `hidden_states[-1]`.
        # This keeps every layer's activations alive for the forward pass.
        outputs = self.base_model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=True,
            return_dict=True,
        )
        hidden_state = getattr(outputs, "last_hidden_state", None)
        if hidden_state is None:
            hidden_state = outputs.hidden_states[-1]

        # Pool at the last attended token of each row. For left-padded batches
        # this is the final position (same as indexing with -1); for
        # right-padded batches it skips the trailing padding.
        seq_len = hidden_state.size(1)
        pad_tail = attention_mask.flip(dims=[1]).long().argmax(dim=1)
        last_token_idx = (seq_len - 1 - pad_tail).to(hidden_state.device)
        batch_idx = torch.arange(hidden_state.size(0), device=hidden_state.device)
        pooled_output = hidden_state[batch_idx, last_token_idx]
        logits = self.classifier(pooled_output)

        loss = None
        if labels is not None:
            loss_fn = nn.CrossEntropyLoss()
            loss = loss_fn(logits, labels)

        return {'loss': loss, 'logits': logits}

class LlamaFineTuning:
    """Fine-tune a ``LlamaClassifier`` on a CSV text-classification dataset.

    Construction loads the tokenizer, the train/validation CSV files and the
    backbone model, tokenizes both splits and builds their data loaders.

    Args:
        dataset_path: Directory containing the CSV files.
        train_file: File name of the training CSV.
        validation_file: File name of the validation CSV.
        feature_col: Column holding the input text.
        label_col: Column holding the class labels.
        model_name: Checkpoint identifier passed to ``from_pretrained``.
        batch_size: Batch size used by both data loaders.
        learning_rate: Learning rate for the optimizer.
        num_epochs: Number of training epochs.
        max_len: Maximum token length used for truncation.
        optimizer: Accepted for interface compatibility; only AdamW is used.
        device: Device string understood by ``torch.device``.
    """

    def __init__(self, dataset_path, train_file, validation_file, feature_col, label_col, model_name, batch_size, learning_rate, num_epochs, max_len, optimizer='AdamW', device='cpu'):
        self.device = torch.device(device)

        # Keep the hyperparameters on the instance so the helper methods do
        # not depend on same-named notebook globals being defined.
        self.batch_size = batch_size
        self.max_len = max_len
        self.num_epochs = num_epochs
        self.optimizer_name = optimizer

        self.tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False)
        # The EOS token doubles as the padding token; padding goes on the left
        # so the last position of every row is a real token.
        self.tokenizer.pad_token = self.tokenizer.eos_token
        self.tokenizer.padding_side = "left"

        self.train_df = pd.read_csv(os.path.join(dataset_path, train_file))
        self.val_df = pd.read_csv(os.path.join(dataset_path, validation_file))
        # NOTE(review): the head size is the number of distinct training
        # labels; this presumes labels are encoded as 0..num_labels-1 - confirm.
        self.num_labels = len(self.train_df[label_col].unique())

        self.model = LlamaClassifier(AutoModelForCausalLM.from_pretrained(model_name), self.num_labels)
        self.model.to(self.device)

        self.tokenized_train = self.tokenize_dataset(self.train_df, feature_col, label_col)
        self.tokenized_val = self.tokenize_dataset(self.val_df, feature_col, label_col)

        # NOTE(review): `AdamW` is the name imported from transformers at the
        # top of the file; upstream recommends torch.optim.AdamW - confirm the
        # installed transformers version still provides it.
        self.optimizer = AdamW(self.model.parameters(), lr=learning_rate)

        self.train_loader = self.create_dataloader(self.tokenized_train)
        self.val_loader = self.create_dataloader(self.tokenized_val, shuffle=False)

    def tokenize_dataset(self, df, feature_col, label_col):
        """Tokenize ``df[feature_col]`` and return ``(tokens, labels)``.

        Texts are padded to the longest sequence and truncated to
        ``self.max_len``; ``labels`` is the label column as a plain list.
        """
        tokens = self.tokenizer(
            list(df[feature_col]),
            padding=True,
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return tokens, list(df[label_col])

    def create_dataloader(self, tokenized_dataset, shuffle=True):
        """Wrap a ``(tokens, labels)`` pair in a ``DataLoader``."""
        tokens, labels = tokenized_dataset
        dataset = SequenceClassificationDataset(tokens, labels)
        return DataLoader(dataset, batch_size=self.batch_size, shuffle=shuffle)

    def evaluate_model(self, dataloader):
        """Evaluate the model and return ``(accuracy, precision, recall, f1)``.

        Precision, recall and F1 are weighted averages over the classes.
        """
        self.model.eval()
        all_labels, all_preds = [], []

        with torch.no_grad():
            for batch in dataloader:
                # Labels are only needed for scoring, so they stay on the CPU.
                outputs = self.model(
                    input_ids=batch["input_ids"].to(self.device),
                    attention_mask=batch["attention_mask"].to(self.device),
                )
                preds = torch.argmax(outputs["logits"], dim=1)

                all_labels.extend(batch["labels"].tolist())
                all_preds.extend(preds.cpu().tolist())

        acc = accuracy_score(all_labels, all_preds)
        # zero_division=0 yields the same score as the default for classes
        # that are never predicted, without emitting a warning.
        prec = precision_score(all_labels, all_preds, average='weighted', zero_division=0)
        rec = recall_score(all_labels, all_preds, average='weighted', zero_division=0)
        f1 = f1_score(all_labels, all_preds, average='weighted', zero_division=0)
        return acc, prec, rec, f1

    def train(self):
        """Train for ``self.num_epochs`` epochs, validating after each one."""
        for epoch in range(self.num_epochs):
            # evaluate_model() switches to eval mode, so re-enable training
            # mode at the start of every epoch.
            self.model.train()
            train_losses = []

            for batch in tqdm(self.train_loader, desc=f'Epoch {epoch+1}/{self.num_epochs}'):
                inputs = {k: v.to(self.device) for k, v in batch.items()}
                outputs = self.model(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], labels=inputs["labels"])
                loss = outputs["loss"]
                train_losses.append(loss.item())

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

            val_acc, val_prec, val_rec, val_f1 = self.evaluate_model(self.val_loader)

            # An empty training loader would otherwise divide by zero.
            mean_loss = sum(train_losses) / len(train_losses) if train_losses else float('nan')
            print(f"Epoch {epoch+1} - Train Loss: {mean_loss:.4f} - Val Acc: {val_acc:.4f} - F1: {val_f1:.4f}")

    def save_model(self, directory):
        """Save the backbone, tokenizer and classification head to ``directory``.

        The backbone and tokenizer are written with ``save_pretrained``; the
        linear head's state dict is written to ``classifier_head.pt`` so the
        trained classifier can be restored, not just the backbone.
        """
        os.makedirs(directory, exist_ok=True)
        self.model.base_model.save_pretrained(directory)
        self.tokenizer.save_pretrained(directory)
        torch.save(self.model.classifier.state_dict(), os.path.join(directory, 'classifier_head.pt'))

# =============================
# ======== Start Train ========
# =============================

# Reference point for the elapsed time reported at the end of this cell; it
# covers model loading, tokenization, training and saving.
start_time = time.time()

# Dataset location and CSV schema.
dataset_path = "D:/Berkas/Code/CryptoNew/Datasets/"
train_file = 'train_set.csv'
validation_file = 'validation_set.csv'
feature_col = 'text'
label_col = 'sentiment_numerical'

# Backbone checkpoint and the short tag used to name the output folder.
model = 'llama'
model_name = 'meta-llama/Llama-2-7b-hf'

# Hyperparameters.
learning_rate = 2e-5
num_epochs = 3
batch_size = 2  # kept small because LLaMA is very large
max_len = 512
optimizer = 'AdamW'
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Output folder name encodes the model tag, learning rate and epoch count.
trained_model = ''.join([model, '_lr_', str(learning_rate), '_epochs_', str(num_epochs)])

classifier = LlamaFineTuning(
    dataset_path=dataset_path,
    train_file=train_file,
    validation_file=validation_file,
    feature_col=feature_col,
    label_col=label_col,
    model_name=model_name,
    batch_size=batch_size,
    learning_rate=learning_rate,
    num_epochs=num_epochs,
    max_len=max_len,
    optimizer=optimizer,
    device=device,
)

classifier.train()
classifier.save_model("D:/Berkas/Code/CryptoNew/TrainedModels/" + trained_model)

elapsed_seconds = time.time() - start_time
print("Training time: {:.2f} seconds".format(elapsed_seconds))


OSError: You are trying to access a gated repo.
Make sure to have access to it at https://huggingface.co/meta-llama/Llama-2-7b-hf.
401 Client Error. (Request ID: Root=1-684f1b16-09c8bebb17d0dfc359c882b0;8ce9bfdd-97bd-41a2-8284-01e4ca1fb51a)

Cannot access gated repo for url https://huggingface.co/meta-llama/Llama-2-7b-hf/resolve/main/config.json.
Access to model meta-llama/Llama-2-7b-hf is restricted. You must have access to it and be authenticated to access it. Please log in.

In [5]:
import os
import pandas as pd
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer


# Use local paths (adjust to match your machine)
absolute_path = "D:/Berkas/Code/CryptoNew/"
test_file = 'test_set.csv'
trained_model_name = 'distilbert_optimizer_Adam_lr_2e-05_epochs_3_bs_6_maxlen_512'

# Number of texts per forward pass. Running the whole test set through the
# model in a single batch can exhaust memory on larger test sets.
inference_batch_size = 32

# Read the test data
test_df = pd.read_csv(os.path.join(absolute_path, 'Datasets', test_file))

# Load the fine-tuned model and its tokenizer
model_path = os.path.join(absolute_path, 'TrainedModels', trained_model_name)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = AutoModelForSequenceClassification.from_pretrained(model_path).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_path)

test_texts = list(test_df['text'])

# Predict in mini-batches; each batch is tokenized, moved to the device once,
# and its predictions are collected on the CPU.
model.eval()
batch_predictions = []
with torch.no_grad():
    for start in range(0, len(test_texts), inference_batch_size):
        batch_texts = test_texts[start:start + inference_batch_size]
        inputs = tokenizer(
            batch_texts,
            padding=True,
            truncation=True,
            return_tensors='pt'
        ).to(device)
        logits = model(**inputs).logits
        batch_predictions.append(torch.argmax(logits, dim=1).cpu())

# An empty test set yields an empty prediction tensor instead of failing.
if batch_predictions:
    predicted_labels = torch.cat(batch_predictions)
else:
    predicted_labels = torch.empty(0, dtype=torch.long)

# Save the predictions
test_df['distilbert_adam_ft_prediction'] = predicted_labels.cpu().numpy()
test_df.to_csv(os.path.join(absolute_path, 'Datasets', 'test_set_distil_adam.csv'), index=False)
print("Prediksi selesai disimpan ke test_set_distil_adam.csv")


Prediksi selesai disimpan ke test_set_distil_adam.csv


In [6]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Location of the CSV that holds the saved predictions
file_path = "D:/Berkas/Code/CryptoNew/Datasets/test_set_distil_adam.csv"

df = pd.read_csv(file_path)

# Ground-truth labels and model predictions, read from their columns
y_true, y_pred = df['sentiment_numerical'], df['distilbert_adam_ft_prediction']

# Precision, recall and F1 share the same class-averaging mode
averaging = {'average': 'weighted'}
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, **averaging)
recall = recall_score(y_true, y_pred, **averaging)
f1 = f1_score(y_true, y_pred, **averaging)

# Report every metric with four decimal places
report_lines = (
    f"Accuracy :  {accuracy:.4f}",
    f"Precision:  {precision:.4f}",
    f"Recall   :  {recall:.4f}",
    f"F1 Score :  {f1:.4f}",
)
for report_line in report_lines:
    print(report_line)


Accuracy :  0.8340
Precision:  0.8399
Recall   :  0.8340
F1 Score :  0.8347
