In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import torch
from torch.utils.data import TensorDataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from sklearn.metrics import mean_squared_error
from tqdm.auto import tqdm

In [None]:
# --- 1. Load data ---
print("--- Memuat Data ---")
# Kaggle input locations of the training and test CSV files.
TRAIN_CSV = '/kaggle/input/bdc-dataset/df_train (1).csv'
TEST_CSV = '/kaggle/input/bdc-dataset/df_test (1).csv'
# Read both files with default pandas parsing, training set first.
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

In [None]:
# Show the freshly loaded training frame as this cell's output for a quick visual check.
df_train

In [None]:
# --- 2. Missing-value imputation ---
print("\n--- Imputasi Missing Value pada Kolom Target ---")
# The four score columns the model is trained to predict.
target_columns = ['task_achievement', 'coherence_and_cohesion', 'lexical_resource', 'grammatical_range']
# Mean-strategy imputer: missing targets are filled from the training data itself.
imputer = SimpleImputer(strategy='mean')
imputed_targets = imputer.fit_transform(df_train[target_columns])
# Write the filled values back over the original target columns (mutates df_train).
df_train[target_columns] = imputed_targets

In [None]:
# --- 3. Tokenization with BERT ---
print("\n--- Tokenisasi dengan BERT ---")
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
MAX_LENGTH = 256  # Maximum number of tokens kept per essay; raise it if the essays are very long


def tokenize_essays(essays, tokenizer=tokenizer, max_length=MAX_LENGTH):
    """Tokenize a pandas Series of essays into padded PyTorch tensors.

    Missing essays are replaced by an empty string and every entry is cast to
    ``str`` first, so a NaN cell never reaches the tokenizer as a float.

    Args:
        essays: pandas Series holding one essay per row.
        tokenizer: tokenizer to call; defaults to the one loaded in this cell.
        max_length: truncation limit in tokens; defaults to ``MAX_LENGTH``.

    Returns:
        Whatever the tokenizer returns for ``return_tensors='pt'``, with
        sequences padded to the longest one in ``essays`` and truncated to
        ``max_length``.
    """
    texts = essays.fillna('').astype(str).tolist()
    return tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors='pt'
    )


# Same settings for both splits, so train and test are encoded consistently.
tokenized_train = tokenize_essays(df_train['essay'])
tokenized_test = tokenize_essays(df_test['essay'])

In [None]:
# --- 4. Data preparation for PyTorch ---
print("\n--- Persiapan Data untuk PyTorch ---")
BATCH_SIZE = 16  # Batch size; tune it to the available GPU memory

# Regression targets as a float32 tensor, one row per training essay.
train_labels = torch.tensor(df_train[target_columns].to_numpy(), dtype=torch.float32)

# Training samples carry (input_ids, attention_mask, labels); test samples have no labels.
train_dataset = TensorDataset(
    tokenized_train['input_ids'],
    tokenized_train['attention_mask'],
    train_labels,
)
test_dataset = TensorDataset(
    tokenized_test['input_ids'],
    tokenized_test['attention_mask'],
)

# Shuffle only the training data; the test order must stay aligned with df_test.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [None]:
# --- 5. BERT model definition for multi-target regression ---
print("\n--- Definisi Model BERT untuk Multi-Target Regresi ---")
class BertForMultiTargetRegression(torch.nn.Module):
    """BERT sequence model whose classification head is swapped for a linear regressor.

    The wrapped ``BertForSequenceClassification`` is loaded from the
    'bert-base-uncased' checkpoint and its ``classifier`` layer is replaced by a
    new ``Linear(hidden_size, num_labels)``. The same layer is also exposed as
    ``self.regressor``.
    """

    def __init__(self, num_labels):
        """Build the backbone and attach a regression head with ``num_labels`` outputs."""
        super().__init__()
        backbone = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=num_labels)
        head = torch.nn.Linear(backbone.config.hidden_size, num_labels)
        # Replace the built-in classifier layer with our own regression layer.
        backbone.classifier = head
        self.bert = backbone
        self.regressor = head

    def forward(self, input_ids, attention_mask):
        """Run the backbone and return its ``logits`` as the regression outputs.

        No labels are passed to the backbone here; the caller applies its own
        loss to the returned tensor.
        """
        return self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            output_hidden_states=False,
        ).logits

# Initialise the model, optimizer, and loss function
SEED = 42
# Seed PyTorch before the model is built so the freshly initialised regression head,
# the DataLoader shuffling order and dropout draw from a fixed random stream on every
# rerun. (GPU kernels may still introduce small run-to-run numerical differences.)
torch.manual_seed(SEED)

num_labels = len(target_columns)
model = BertForMultiTargetRegression(num_labels)
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)
loss_fn = torch.nn.MSELoss()
# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Assigning the result keeps the full module repr out of the cell output.
model = model.to(device)

In [None]:
# --- 6. Model training (fine-tuning) ---
print("\n--- Memulai Pelatihan Model (Fine-Tuning) ---")
EPOCHS = 8  # Number of epochs; 3-5 is the commonly used range, this run uses 8
model.train()

for epoch in range(EPOCHS):
    epoch_number = epoch + 1  # 1-based epoch index used in the progress bar and log line
    total_loss = 0
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch_number}/{EPOCHS}")
    for input_ids, attention_mask, labels in progress_bar:
        # Move the whole batch to the device the model lives on.
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Standard optimisation step: clear gradients, forward, backward, update.
        optimizer.zero_grad()
        loss = loss_fn(model(input_ids, attention_mask), labels)
        loss.backward()
        optimizer.step()

        # Accumulate the scalar batch loss for the per-epoch average.
        total_loss += loss.item()

    avg_loss = total_loss / len(train_dataloader)
    print(f"Loss di akhir Epoch {epoch_number}: {avg_loss:.4f}")

In [None]:
# --- 7. Prediction on the test data ---
print("\n--- Memulai Prediksi pada Data Test ---")
model.eval()
test_predictions = []
# Inference only: gradients are not needed.
with torch.no_grad():
    for input_ids, attention_mask in tqdm(test_dataloader, desc="Predicting"):
        batch_outputs = model(input_ids.to(device), attention_mask.to(device))
        # Bring the batch back to the CPU and append one NumPy row per essay.
        test_predictions.extend(batch_outputs.cpu().numpy())

In [None]:
# --- 8. Save the predictions to an Excel file ---
print("\n--- Menyimpan Hasil Prediksi ke Excel ---")
OUTPUT_FILE = 'hasil_prediksi_esay_bert.xlsx'

# One 'predicted_<target>' column per target, in the same order as target_columns.
prediction_columns = [f'predicted_{col}' for col in target_columns]
predicted_df = pd.DataFrame(test_predictions, columns=prediction_columns)

# Reset the test index so the rows line up positionally with the predictions.
test_rows = df_test.reset_index(drop=True)
final_predictions = pd.concat([test_rows, predicted_df], axis=1)
final_predictions.to_excel(OUTPUT_FILE, index=False)

print("\nProses selesai. File 'hasil_prediksi_esay_bert.xlsx' berhasil dibuat.")