In [19]:
# --- Sel 1: Impor library (tidak ada perubahan) ---
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [20]:
# --- Sel 2: Muat data dan imputasi ---
df_train = pd.read_csv('/kaggle/input/bdc-dataset/df_train (1).csv')
df_test = pd.read_csv('/kaggle/input/bdc-dataset/df_test (1).csv')

# Cek missing values
target_columns = ['task_achievement', 'coherence_and_cohesion', 'lexical_resource', 'grammatical_range']
print("Missing values sebelum imputasi:")
print(df_train[target_columns].isnull().sum())

# Imputasi nilai NaN pada target
imputer = SimpleImputer(strategy='mean')
df_train[target_columns] = imputer.fit_transform(df_train[target_columns])

print("\nMissing values setelah imputasi:")
print(df_train[target_columns].isnull().sum())

# Pisahkan fitur dan target (masih berupa teks asli)
X = df_train['essay']
y = df_train[target_columns]

Missing values sebelum imputasi:
task_achievement          245
coherence_and_cohesion    298
lexical_resource          674
grammatical_range         871
dtype: int64

Missing values setelah imputasi:
task_achievement          0
coherence_and_cohesion    0
lexical_resource          0
grammatical_range         0
dtype: int64


In [21]:
# --- Sel 3: Download resource NLTK & Inisialisasi Preprocessing ---
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'\\d+', '', text)
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = nltk.word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words and len(word) > 2]
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    clean_text = ' '.join(tokens)
    return clean_text

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /usr/share/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [22]:
# --- Sel 4: Terapkan preprocessing pada data ---
df_train['essay_clean'] = df_train['essay'].apply(preprocess_text)
df_test['essay_clean'] = df_test['essay'].apply(preprocess_text)

In [32]:
df_train['essay_clean']

0       deciding choose among potential way punishing ...
1       modern era youngster commit offence punished s...
2       modern era youngster commit offence punished s...
3       suggested punishment immature offender violate...
4       controversial dispute whether punishment teena...
                              ...                        
9907    proved sport activity school help child learn ...
9908    people suggest prime way teach child cooperate...
9909    argue best way teacher trained pupil team spor...
9910    nowadays people tend invest huge amount money ...
9911    role school education system always remains qu...
Name: essay_clean, Length: 9912, dtype: object

In [23]:
# --- Sel 5: Vektorisasi Tfidf (sudah menggunakan essay_clean) ---
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(df_train['essay_clean'])
X_test_tfidf = vectorizer.transform(df_test['essay_clean'])

In [24]:
# --- Sel 6: Training XGBoost untuk masing-masing target ---
predictions = {}

for target in target_columns:
    print(f"--- Melatih Model untuk: {target} ---")
    y_target = y[target]

    # Pisahkan data training dan validasi
    X_train, X_val, y_train, y_val = train_test_split(X_tfidf, y_target, test_size=0.2, random_state=42)

    # Inisialisasi dan latih model XGBoost
    xgb = XGBRegressor(objective='reg:squarederror',
                       n_estimators=1000,
                       learning_rate=0.05,
                       max_depth=5,
                       subsample=0.8,
                       colsample_bytree=0.8,
                       random_state=42,
                       n_jobs=-1) # Hapus tree_method dan eval_metric untuk menghindari error GPU dan warning

    xgb.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)

    # Evaluasi model
    y_pred = xgb.predict(X_val)
    mse = mean_squared_error(y_val, y_pred)
    print(f'Mean Squared Error untuk {target}: {mse:.4f}')

    # Prediksi pada data test
    test_predictions = xgb.predict(X_test_tfidf)
    df_test[f'predicted_{target}'] = test_predictions
    print("-" * 35)

# Tampilkan hasil prediksi
print("\nHasil Prediksi pada Data Test:")
print(df_test.head())

--- Melatih Model untuk: task_achievement ---
Mean Squared Error untuk task_achievement: 1.1195
-----------------------------------
--- Melatih Model untuk: coherence_and_cohesion ---
Mean Squared Error untuk coherence_and_cohesion: 1.6605
-----------------------------------
--- Melatih Model untuk: lexical_resource ---
Mean Squared Error untuk lexical_resource: 1.0495
-----------------------------------
--- Melatih Model untuk: grammatical_range ---
Mean Squared Error untuk grammatical_range: 1.0723
-----------------------------------

Hasil Prediksi pada Data Test:
                                              prompt  \
0  In many countries, more and more young people ...   
1  Some believe that people are naturally born le...   
2  Every day, millions of tons of food are wasted...   
3  some people think that the best way to solve g...   
4  Some people think the best way to solve global...   

                                               essay  \
0  Unemployment in young generati

In [25]:
# --- Tambahkan kode ini di akhir sel terakhir Anda ---

# Simpan hasil prediksi ke file Excel
df_test.to_csv('hasil_prediksi_esay.csv', index=False)

print("\nFile 'hasil_prediksi_esay.csv' berhasil disimpan.")


File 'hasil_prediksi_esay.csv' berhasil disimpan.
