In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier

In [2]:
# Load data dari direktori input Kaggle
train = pd.read_csv("/kaggle/input/hands-on-2-pembelajaran-mesin-c-i-iup/coupon/train.csv")
test = pd.read_csv("/kaggle/input/hands-on-2-pembelajaran-mesin-c-i-iup/coupon/test.csv")

display(train.head())
display(test.head())

Unnamed: 0,id,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,...,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp,Y
0,0,No Urgent Place,Friend(s),Sunny,80,6PM,Coffee House,2h,Female,21,...,less1,gt8,1~3,1~3,1,0,0,0,1,0
1,1,Home,Alone,Sunny,80,6PM,Coffee House,2h,Female,50plus,...,never,less1,less1,less1,1,0,0,0,1,0
2,2,Home,Alone,Sunny,55,6PM,Restaurant(20-50),1d,Male,36,...,never,4~8,gt8,less1,1,1,0,0,1,1
3,3,No Urgent Place,Friend(s),Sunny,80,2PM,Restaurant(<20),2h,Male,26,...,gt8,gt8,4~8,less1,1,1,0,0,1,0
4,4,No Urgent Place,Kid(s),Sunny,80,2PM,Restaurant(<20),1d,Male,50plus,...,1~3,4~8,4~8,less1,1,0,0,0,1,1


Unnamed: 0,id,destination,passanger,weather,temperature,time,coupon,expiration,gender,age,...,Bar,CoffeeHouse,CarryAway,RestaurantLessThan20,Restaurant20To50,toCoupon_GEQ5min,toCoupon_GEQ15min,toCoupon_GEQ25min,direction_same,direction_opp
0,0,Home,Alone,Snowy,30,6PM,Coffee House,1d,Female,31,...,less1,less1,gt8,4~8,less1,1,1,0,0,1
1,1,Home,Alone,Sunny,55,6PM,Bar,1d,Female,50plus,...,less1,never,gt8,4~8,1~3,1,0,0,1,0
2,2,No Urgent Place,Alone,Sunny,80,10AM,Coffee House,2h,Female,31,...,never,less1,4~8,1~3,1~3,1,1,0,0,1
3,3,Home,Alone,Sunny,80,6PM,Restaurant(<20),2h,Female,36,...,never,never,1~3,less1,never,1,0,0,1,0
4,4,Home,Alone,Sunny,80,6PM,Coffee House,1d,Male,26,...,less1,less1,1~3,less1,less1,1,0,0,0,1


In [3]:
# Tandai data asal
train['is_train'] = 1
test['is_train'] = 0

# Gabungkan untuk preprocessing bersama
data = pd.concat([train, test], ignore_index=True)

In [4]:
import re

def parse_time(t):
    
    if pd.isna(t):
        return np.nan
    t_str = str(t).strip().upper()
    if re.match(r'^\d+$', t_str):
        return int(t_str)
    try:
        return pd.to_datetime(t_str, format='%I%p').hour
    except:
        return np.nan

# Terapkan parsing ke kolom 'time'
data['parsed_time'] = data['time'].apply(parse_time)

# Drop kolom time asli dan gunakan parsed_time
data.drop(columns=['time'], inplace=True)
data.rename(columns={'parsed_time': 'time'}, inplace=True)

# Encoding waktu secara siklikal
data['time_sin'] = np.sin(data['time'] * (2 * np.pi / 24))
data['time_cos'] = np.cos(data['time'] * (2 * np.pi / 24))


In [5]:
# Encode kolom kategorikal
cat_cols = data.select_dtypes(include='object').columns.difference(['id'])
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
data[cat_cols] = encoder.fit_transform(data[cat_cols])

# Normalisasi kolom numerik
num_cols = ['temperature', 'age', 'income']
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

In [6]:
# Kembalikan data terpisah
train_clean = data[data['is_train'] == 1].drop(columns=['is_train'])
test_clean = data[data['is_train'] == 0].drop(columns=['is_train', 'Y'])

X = train_clean.drop(columns=['id', 'Y'])
y = train_clean['Y']
X_test = test_clean.drop(columns=['id'])

In [7]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [8]:
models = [
    ('xgb', XGBClassifier(random_state=42, n_jobs=-1, eval_metric='logloss')),
    ('lgb', LGBMClassifier(random_state=42, n_jobs=-1)),
    ('catboost', CatBoostClassifier(random_state=42, verbose=0))
]

# Model meta (final estimator)
stacked_model = StackingClassifier(estimators=models, final_estimator=LogisticRegression())


In [9]:
stacked_model.fit(X_train, y_train)

[LightGBM] [Info] Number of positive: 1816, number of negative: 1384
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 3200, number of used features: 25
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.567500 -> initscore=0.271658
[LightGBM] [Info] Start training from score 0.271658
[LightGBM] [Info] Number of positive: 1453, number of negative: 1107
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000258 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 136
[LightGBM] [Info] Number of data points in the train set: 2560, number of used features: 25
[LightGBM] [Info] [binary:Bo

In [10]:
val_proba = stacked_model.predict_proba(X_val)[:, 1]
roc_auc = roc_auc_score(y_val, val_proba)
print(f"ROC AUC (Validation): {roc_auc:.4f}")


ROC AUC (Validation): 0.8209


In [11]:
# Prediksi probabilitas test set
test_proba = stacked_model.predict_proba(X_test)[:, 1]

# Buat file submission
submission = pd.DataFrame({
    'id': test['id'].astype(int),
    'Y': test_proba
}).sort_values('id')

# Simpan file
submission.to_csv("submission.csv", index=False)
print("File submission.csv berhasil dibuat dengan probabilitas!")


File submission.csv berhasil dibuat dengan probabilitas!
