In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

In [2]:
# Read the raw train and test CSV files (paths are relative to the notebook).
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/test.csv")
)

In [3]:
# Optional row sampling for quicker experiments.
# sample_fraction is the share of rows kept: 1.0 keeps every row (the rows are
# only shuffled), while e.g. 0.1 would keep 10% of them.
sample_fraction = 1.0
# NOTE(review): the test frame is sampled with the same fraction, so any value
# below 1.0 also drops test rows — confirm this is intended.
sampling_kwargs = {"frac": sample_fraction, "random_state": 42}
train_data = train_data.sample(**sampling_kwargs)
test_data = test_data.sample(**sampling_kwargs)

## Data preprocessing

In [4]:
# Drop the `id` column from both frames; it is not used as a feature.
train_data = train_data.drop(columns=['id'])
test_data = test_data.drop(columns=['id'])

In [5]:
# Cast the numerically coded categorical features to `object` in both frames so
# that pd.get_dummies one-hot encodes them later.
# `Response` (the target) is deliberately left as-is.
categorical_columns = [
    'Driving_License',
    'Region_Code',
    'Previously_Insured',
    'Policy_Sales_Channel',
]
for frame in (train_data, test_data):
    for column in categorical_columns:
        frame[column] = frame[column].astype('object')

In [6]:
# Separate the target from the training features, then one-hot encode the
# categorical columns of the training features and of the test frame.
y_train = train_data['Response']
x_train = pd.get_dummies(train_data.drop(columns=['Response']))
x_test = pd.get_dummies(test_data)

In [7]:
# Give the train and test feature matrices the same set of columns.
# join='outer' keeps the union of both column sets; a column missing from one
# frame is added to it and filled with 0.
x_train, x_test = x_train.align(x_test, join='outer', axis=1, fill_value=0)

In [8]:
# Inspect the aligned test feature matrix.
x_test

Unnamed: 0,Age,Annual_Premium,Driving_License_0,Driving_License_1,Gender_Female,Gender_Male,Policy_Sales_Channel_1.0,Policy_Sales_Channel_10.0,Policy_Sales_Channel_100.0,Policy_Sales_Channel_101.0,...,Region_Code_6.0,Region_Code_7.0,Region_Code_8.0,Region_Code_9.0,Vehicle_Age_1-2 Year,Vehicle_Age_< 1 Year,Vehicle_Age_> 2 Years,Vehicle_Damage_No,Vehicle_Damage_Yes,Vintage
2487904,41,40255.0,False,True,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,True,158
112016,27,29366.0,False,True,True,False,False,False,False,False,...,False,False,False,False,False,True,False,True,False,258
5541771,23,33807.0,False,True,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,False,96
3807403,27,35341.0,False,True,False,True,False,False,False,False,...,False,False,False,False,False,True,False,True,False,206
973350,46,2630.0,False,True,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,True,291
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1692743,52,2630.0,False,True,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,True,194
6550634,23,37494.0,False,True,False,True,False,False,False,False,...,False,False,False,False,False,True,False,False,True,117
6423388,40,36367.0,False,True,False,True,False,False,False,False,...,False,False,False,False,True,False,False,False,True,227
6962611,52,39828.0,False,True,True,False,False,False,False,False,...,False,False,False,False,True,False,False,False,True,158


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Hold out 20% of the training rows as a validation set for checking performance.
x_tr, x_val, y_tr, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Standardize the features: the scaler is fitted on the training split only and
# the same transform is then applied to the validation split.
# NOTE(review): x_test is not transformed in this cell; it would need
# scaler.transform(x_test) before being scored by a model fitted on x_tr.
scaler = StandardScaler().fit(x_tr)
x_tr = scaler.transform(x_tr)
x_val = scaler.transform(x_val)

In [10]:
# 성능 확인
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, roc_auc_score

In [11]:
# lightgbm
from lightgbm import LGBMClassifier

# Hyperparameters for the baseline model, passed explicitly to the classifier.
lgbm_params = dict(
    boosting_type='gbdt',
    class_weight=None,
    colsample_bytree=1.0,
    importance_type='split',
    learning_rate=0.1,
    max_depth=-1,
    min_child_samples=20,
    min_child_weight=0.001,
    min_split_gain=0.0,
    n_estimators=100,
    n_jobs=-1,
    num_leaves=31,
    objective=None,
    random_state=4923,
    reg_alpha=0.0,
    reg_lambda=0.0,
    subsample=1.0,
    subsample_for_bin=200000,
    subsample_freq=0,
)

model = LGBMClassifier(**lgbm_params)
model.fit(x_tr, y_tr)

# Take the second column of predict_proba as the score for the validation rows
# and report the resulting ROC AUC.
y_pred = model.predict_proba(x_val)[:, 1]

display(roc_auc_score(y_val, y_pred))

[LightGBM] [Info] Number of positive: 1132273, number of negative: 8071565
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.704683 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1200
[LightGBM] [Info] Number of data points in the train set: 9203838, number of used features: 220
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.123022 -> initscore=-1.964120
[LightGBM] [Info] Start training from score -1.964120


0.8752898832337903

In [12]:
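# Cross Validation, visualization
# (5-fold ROC AUC cross-validation, currently disabled and kept for reference.)
# NOTE(review): as written this would evaluate `model` on the unscaled `x_train`,
# unlike the hold-out evaluation, which uses scaled features — confirm before re-enabling.
#from sklearn.model_selection import cross_val_score
#
#scores = cross_val_score(model, x_train, y_train, cv=5, scoring='roc_auc')
#display(scores)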


In [13]:
# Validation ROC AUC observed for each sample_fraction:
# 0.1 : 0.867285698064226
# 0.3 : 0.8740761725236561
# 1.0 : 0.8752898832337903