In [9]:
!pip install kaggle
from google.colab import files
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Saving kaggle.json to kaggle (1).json


{'kaggle.json': b'{"username":"kimbbaro","key":"0eee4ddbd0a80636c2a3f8146af07993"}'}

In [10]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Permission Warning 방지
!chmod 600 ~/.kaggle/kaggle.json

In [11]:
!kaggle competitions download -c porto-seguro-safe-driver-prediction

porto-seguro-safe-driver-prediction.zip: Skipping, found more recently modified local copy (use --force to force download)


In [12]:
!unzip /content/porto-seguro-safe-driver-prediction.zip

Archive:  /content/porto-seguro-safe-driver-prediction.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: sample_submission.csv   
replace test.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.csv                y
y

replace train.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename:   inflating: train.csv               y



In [13]:
MAX_ROUNDS = 400
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50

In [14]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
#LabelEncoder는 머신러닝에서 주로 범주형 변수를 다룰 때 사용하는 변환 방법 중 하나입니다
from sklearn.preprocessing import LabelEncoder
#파이썬 코드를 최적화된 기계어로 컴파일하여 빠르게 실행할 수 있습니다. 
from numba import jit
import time
#가비지 컬렉션은 동적으로 할당된 메모리 중 더 이상 사용되지 않는 부분을 자동으로 탐지하여 해제하는 프로세스를 의미합니다.
import gc

In [15]:
@jit
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  #argsort() 함수는 배열의 원소를 오름차순으로 정렬한 후, 해당 원소의 인덱스를 반환합니다. 
  #즉, y_prob를 정렬한 후, 각 원소의 인덱스를 반환하게 됩니다. 이를 이용하여 y_true도 y_prob를 기준으로 정렬합니다.
  y_true = y_true[np.argsort(y_prob)]
  ntrue = 0
  gini =0
  delta = 0
  n = len(y_true)
  for i in range(n-1, -1,-1):
    y_i = y_true[i]
    ntrue +=y_i
    gini +=y_i * delta
    delta +=1-y_i
  gini = 1-2*gini / (ntrue * (n-ntrue))
  return gini
  

Target encoding은 범주형 피처를 수치형 피처로 변환하는 방법 중 하나입니다. 이 방법은 각 범주형 값을 해당 범주의 타깃 값의 평균으로 대체하여 범주형 값을 수치형 값으로 변환합니다.

예를 들어, "도시"라는 범주형 피처가 있다고 가정해보겠습니다. "서울", "부산", "대전" 등의 도시명이 해당 피처에 존재한다면, 이 피처를 target encoding을 사용하여 수치형 값으로 변환할 수 있습니다. 이때, 변환하는 과정은 다음과 같습니다.

각 도시별 타깃 변수(예: 선호도)의 평균을 계산합니다.
각 도시별로 계산한 평균 값을 해당 도시의 값으로 대체합니다.
범주형 값을 수치형 값으로 대체합니다.
이렇게 변환된 피처를 기존의 범주형 변수와 달리 연속형 변수처럼 다룰 수 있습니다. Target encoding은 범주형 변수의 정보를 최대한 유지하면서도 선형 회귀와 같은 대부분의 머신러닝 알고리즘에서 수치형 변수를 사용할 수 있는 장점이 있습니다. 그러나, target encoding이 과적합(overfitting)될 가능성이 있어, 많은 양의 데이터가 필요합니다. 또한, 범주형 변수가 많은 경우 피처 공간이 매우 커질 수 있으며, 이는 모델 성능에 영향을 미칠 수 있습니다.

이 코드는 XGBoost 모델을 학습할 때, 성능 측정 지표로 Gini 계수를 사용하도록 설정하는 함수입니다.

dtrain.get_label()를 통해 학습 데이터의 실제 레이블 값을 가져옵니다. 이후 eval_gini(labels, preds) 함수를 사용하여 예측 값과 실제 값의 Gini 계수를 계산합니다.

Gini 계수는 예측 모델의 성능을 측정하는 지표 중 하나로, 분류 문제에서 모델이 얼마나 잘 분류되었는지 측정하는 값입니다. 이 값을 최대화하는 것이 목표입니다.

최종적으로 return [("gini", gini_score)]를 통해 반환되는 값은 학습한 모델의 성능 측정 지표로, [("gini", 계산된 Gini 계수)] 형태로 반환됩니다.


In [28]:
def gini_xgb(preds, dtrain):
  labels = dtrain.get_label()
  gini_score = -eval_gini(labels, preds)
  return [("gini", gini_score)]

def add_noise(series, noise_level):
  return series*(1+noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None,    
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):

    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name
    temp = pd.concat([trn_series, target], axis=1)

    averages = temp.groupby(by=trn_series.name)[target.name].agg(["mean", "count"])

    smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))

    prior = target.mean()

    averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
    averages.drop(["mean", "count"], axis=1, inplace=True)

    ft_trn_series = pd.merge(
        trn_series.to_frame(trn_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=trn_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)

    ft_trn_series.index = trn_series.index
    ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)

    ft_val_series.index = val_series.index
    ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)

    ft_tst_series.index = tst_series.index
    return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)

In [20]:
train_df = pd.read_csv('/content/train.csv', na_values="-1")
test_df = pd.read_csv('/content/test.csv', na_values="-1")

In [21]:
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
	"ps_reg_03",  #            : 1408.42 / shadow  511.15
	"ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
	"ps_ind_03",  #            : 1219.47 / shadow  230.55
	"ps_ind_15",  #            :  922.18 / shadow  242.00
	"ps_reg_02",  #            :  920.65 / shadow  267.50
	"ps_car_14",  #            :  798.48 / shadow  549.58
	"ps_car_12",  #            :  731.93 / shadow  293.62
	"ps_car_01_cat",  #        :  698.07 / shadow  178.72
	"ps_car_07_cat",  #        :  694.53 / shadow   36.35
	"ps_ind_17_bin",  #        :  620.77 / shadow   23.15
	"ps_car_03_cat",  #        :  611.73 / shadow   50.67
	"ps_reg_01",  #            :  598.60 / shadow  178.57
	"ps_car_15",  #            :  593.35 / shadow  226.43
	"ps_ind_01",  #            :  547.32 / shadow  154.58
	"ps_ind_16_bin",  #        :  475.37 / shadow   34.17
	"ps_ind_07_bin",  #        :  435.28 / shadow   28.92
	"ps_car_06_cat",  #        :  398.02 / shadow  212.43
	"ps_car_04_cat",  #        :  376.87 / shadow   76.98
	"ps_ind_06_bin",  #        :  370.97 / shadow   36.13
	"ps_car_09_cat",  #        :  214.12 / shadow   81.38
	"ps_car_02_cat",  #        :  203.03 / shadow   26.67
	"ps_ind_02_cat",  #        :  189.47 / shadow   65.68
	"ps_car_11",  #            :  173.28 / shadow   76.45
	"ps_car_05_cat",  #        :  172.75 / shadow   62.92
	"ps_calc_09",  #           :  169.13 / shadow  129.72
	"ps_calc_05",  #           :  148.83 / shadow  120.68
	"ps_ind_08_bin",  #        :  140.73 / shadow   27.63
	"ps_car_08_cat",  #        :  120.87 / shadow   28.82
	"ps_ind_09_bin",  #        :  113.92 / shadow   27.05
	"ps_ind_04_cat",  #        :  107.27 / shadow   37.43
	"ps_ind_18_bin",  #        :   77.42 / shadow   25.97
	"ps_ind_12_bin",  #        :   39.67 / shadow   15.52
	"ps_ind_14",  #            :   37.37 / shadow   16.65
]
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

In [22]:
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']
start = time.time()

for n_c, (f1, f2) in enumerate(combs):
  name1 = f1 + "_plus_" + f2
  print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
  print('\r' * 75, end='')

  train_df[name1] = train_df[f1].apply(lambda x:str(x)) + "_" + train_df[f2].apply(lambda x:str(x))
  test_df[name1] = test_df[f1].apply(lambda x:str(x)) + "_" + train_df[f2].apply(lambda x:str(x))

  lbl = LabelEncoder()
  lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))

  train_df[name1] = lbl.transform(list(train_df[name1].values))
  test_df[name1] = lbl.transform(list(test_df[name1].values))

  train_features.append(name1)

X = train_df[train_features]
test_df = test_df[train_features]

f_cats = [f for f in X.columns if "_cat" in f]



In [23]:
y_valid_pred = 0*y
y_test_pred = 0

In [25]:
train_df.head()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,ps_ind_09_bin,ps_ind_10_bin,ps_ind_11_bin,ps_ind_12_bin,ps_ind_13_bin,ps_ind_14,ps_ind_15,ps_ind_16_bin,ps_ind_17_bin,ps_ind_18_bin,ps_reg_01,ps_reg_02,ps_reg_03,ps_car_01_cat,ps_car_02_cat,ps_car_03_cat,ps_car_04_cat,ps_car_05_cat,ps_car_06_cat,ps_car_07_cat,ps_car_08_cat,ps_car_09_cat,ps_car_10_cat,ps_car_11_cat,ps_car_11,ps_car_12,ps_car_13,ps_car_14,ps_car_15,ps_calc_01,ps_calc_02,ps_calc_03,ps_calc_04,ps_calc_05,ps_calc_06,ps_calc_07,ps_calc_08,ps_calc_09,ps_calc_10,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin,ps_reg_01_plus_ps_car_02_cat,ps_reg_01_plus_ps_car_04_cat
0,7,0,2,2.0,5,1.0,0.0,0,1,0,0,0,0,0,0,0,11,0,1,0,0.7,0.2,0.71807,10.0,1.0,,0,1.0,4,1.0,0,0.0,1,12,2.0,0.4,0.883679,0.37081,3.605551,0.6,0.5,0.2,3,1,10,1,10,1,5,9,1,5,8,0,1,1,0,0,1,19,70
1,9,0,1,1.0,7,0.0,0.0,0,0,1,0,0,0,0,0,0,3,0,0,1,0.8,0.4,0.766078,11.0,1.0,,0,,11,1.0,1,2.0,1,19,3.0,0.316228,0.618817,0.388716,2.44949,0.3,0.1,0.3,2,1,9,5,8,1,7,3,1,1,9,0,1,1,0,1,0,21,80
2,13,0,5,4.0,9,1.0,0.0,0,0,1,0,0,0,0,0,0,12,1,0,0,0.0,0.0,,7.0,1.0,,0,,14,1.0,1,2.0,1,60,1.0,0.316228,0.641586,0.347275,3.316625,0.5,0.7,0.1,2,2,9,1,8,2,7,4,2,7,7,0,1,1,0,1,0,1,0
3,16,0,0,1.0,2,0.0,0.0,1,0,0,0,0,0,0,0,0,8,1,0,0,0.9,0.2,0.580948,7.0,1.0,0.0,0,1.0,11,1.0,1,3.0,1,104,1.0,0.374166,0.542949,0.294958,2.0,0.6,0.9,0.1,2,4,7,1,8,4,2,2,2,4,9,0,0,0,0,0,0,23,90
4,17,0,0,2.0,0,1.0,0.0,1,0,0,0,0,0,0,0,0,9,1,0,0,0.7,0.6,0.840759,11.0,1.0,,0,,14,1.0,1,2.0,1,82,3.0,0.31607,0.565832,0.365103,2.0,0.4,0.6,0.0,2,2,6,3,10,2,12,3,1,1,3,0,0,0,1,1,0,19,70


In [24]:
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(0)

In [26]:
model = XGBClassifier(
    n_estimators = MAX_ROUNDS,
    max_depth = 4,
    objective = "binary:logistic",
    learning_rate = LEARNING_RATE,
    subsample =.8,
    min_child_weight = 6,
    colsample_bytree = .8,
    scale_pos_weight = 1.6,
    gamma = 10,
    reg_alpha = 8,
    reg_lambda = 1.3,

)

In [29]:
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
  y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
  X_train ,X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index, :].copy()
  X_test = test_df.copy()
  print("\nFold ", i)

  for f in f_cats:
    X_train[f+"_avg"], X_valid[f+"_avg"], X_test[f+"_avg"] = target_encode(
        trn_series = X_train[f],
        val_series = X_valid[f],
        tst_series = X_test[f],
        target = y_train,
        min_samples_leaf = 200,
        smoothing = 10,
        noise_level = 0
    )

  if OPTIMIZE_ROUNDS:
    eval_set = [(X_valid, y_valid)]
    fit_model = model.fit(X_train, y_train,
                          eval_set = eval_set,
                          eval_metric = gini_xgb,
                          early_stopping_rounds = EARLY_STOPPING_ROUNDS,
                          verbose = False)
    print( "  Best N trees = ", model.best_ntree_limit )
    print( "  Best gini = ", model.best_score )
  else:
      fit_model = model.fit( X_train, y_train )

  pred = fit_model.predict_proba(X_valid)[:,1]
  print( "  Gini = ", eval_gini(y_valid, pred) )
  y_valid_pred.iloc[test_index] = pred

  y_test_pred += fit_model.predict_proba(X_test)[:,1]

  del X_test, X_train, X_valid, y_train

y_test_pred /=K

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)


Fold  0


Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: non-precise type pyobject
During: typing of argument at <ipython-input-15-df6620488e58> (3)

File "<ipython-input-15-df6620488e58>", line 3:
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  ^

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "<ipython-input-15-df6620488e58>", line 11:
def eval_gini(y_true, y_prob):
    <source elided>
  n = len(y_true)
  for i in range(n-1, -1,-1):
  ^

  @jit

File "<ipython-input-15-df6620488e58>", line 3:
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  ^

Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.readthe

  Gini =  0.2865950079341033

Fold  1
  Gini =  0.28296717249610914

Fold  2
  Gini =  0.27633743707830494

Fold  3
  Gini =  0.29983102530363126

Fold  4
  Gini =  0.2842156908077206

Gini for full training set:


Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: non-precise type pyobject
During: typing of argument at <ipython-input-15-df6620488e58> (3)

File "<ipython-input-15-df6620488e58>", line 3:
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  ^

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "<ipython-input-15-df6620488e58>", line 11:
def eval_gini(y_true, y_prob):
    <source elided>
  n = len(y_true)
  for i in range(n-1, -1,-1):
  ^

  @jit

File "<ipython-input-15-df6620488e58>", line 3:
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  ^

Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.readthe

0.2857881755343149

In [30]:
val = pd.DataFrame()
val['id'] = id_train
val['target'] = y_valid_pred.values
val.to_csv("xgb_valid.csv", float_format="%.6f", index = False)

In [31]:
sub = pd.DataFrame()
sub["id"] = id_test
sub["target"] = y_test_pred
sub.to_csv("xgb_submit.csv", float_format = "%.6f", index= False)
