In [2]:
!pip install kaggle
from google.colab import files
files.upload()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"kimbbaro","key":"0eee4ddbd0a80636c2a3f8146af07993"}'}

In [3]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

# Permission Warning 방지
!chmod 600 ~/.kaggle/kaggle.json

In [4]:
!kaggle competitions download -c porto-seguro-safe-driver-prediction

Downloading porto-seguro-safe-driver-prediction.zip to /content
 93% 71.0M/76.5M [00:00<00:00, 106MB/s] 
100% 76.5M/76.5M [00:00<00:00, 89.3MB/s]


In [5]:
!unzip /content/porto-seguro-safe-driver-prediction.zip

Archive:  /content/porto-seguro-safe-driver-prediction.zip
  inflating: sample_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


In [6]:
MAX_ROUNDS = 400
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50


In [7]:
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from numba import jit
import time
import gc

In [8]:
@jit
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  y_true = y_true[np.argsort(y_prob)]
  ntrue = 0
  gini = 0
  delta = 0
  n = len(y_true)
  for i in range(n-1,-1,-1):
    y_i = y_true[i]
    ntrue +=y_i
    gini +=y_i * delta
    delta +=1-y_i

  gini = 1 - 2 * gini / (ntrue * (n - ntrue))
  return gini

assert len(trn_series) == len(target)
assert trn_series.name == tst_series.name


함수의 인자로 들어온 데이터의 일관성을 검증하기 위한 코드입니다. 첫 번째 assert 문은 훈련 데이터와 타겟 데이터가 같은 길이인지 확인합니다. 두 번째 assert 문은 훈련 데이터와 검증 데이터가 같은 열 이름을 가지는지 확인합니다. 이러한 검증 과정은 데이터가 올바르게 처리되어 예상치 못한 결과가 발생하는 것을 방지하고, 코드의 안정성과 일관성을 유지하는 데 도움을 줍니다.

In [9]:
def gini_xgb(preds, dtrain):
  labels = dtrain.get_label()
  gini_score = -eval_gini(labels, preds)
  return [("gini", gini_score)]

def add_noise(series, noise_level):
  return series * (1 + noise_level * np.random.randn(len(series)))

def target_encode(trn_series=None,    # Revised to encode validation series
                  val_series=None,
                  tst_series=None,
                  target=None,
                  min_samples_leaf=1,
                  smoothing=1,
                  noise_level=0):
  assert len(trn_series) == len(target)
  assert trn_series.name == tst_series.name
  temp = pd.concat([trn_series, target], axis = 1)

  #훈련데이터의 이름으로 묶고, target값에 해당하는 평균을 재는거
  averages = temp.groupby(by = trn_series.name)[target.name].agg(["mean", "count"])

  smoothing = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
  #전체 데이터의 평균을 말함.
  prior = target.mean()

  averages[target.name] = prior * (1 - smoothing) + averages["mean"] * smoothing
  averages.drop(["mean", "count"], axis=1, inplace=True)


# 위 코드에서 pd.merge 함수는 두 개의 DataFrame을 병합합니다. 

# 첫 번째 인자는 trn_series.to_frame(trn_series.name)으로, trn_series의 값을 하나의 열로 가지는 새로운 DataFrame을 생성합니다. 

# 두 번째 인자는 averages.reset_index().rename(columns={'index': target.name, target.name: 'average'})으로, 

# trn_series.name 열에 대해 그룹화한 결과에 reset_index()를 적용하여 인덱스를 열로 변환한 DataFrame을 생성합니다. 

# 이 때, target.name 열의 이름을 target.name으로 변경하고, target.name 열에 해당하는 평균값 열의 이름을 'average'로 변경합니다.

# 이렇게 생성된 두 개의 DataFrame은 on=trn_series.name으로 trn_series.name 열을 기준으로 병합됩니다. 

# 이 때, how='left'로 지정되어 있으므로, trn_series.to_frame(trn_series.name)에는 모든 행이 포함되고, 

# averages.reset_index().rename(columns={'index': target.name, target.name: 'average'})에는 

# trn_series.name 열의 고유한 값만 포함됩니다.

# 마지막으로, 병합된 DataFrame에서 'average' 열만 선택하여 반환하고, fillna(prior)를 적용하여 결측값을 prior 값으로 채워줍니다. 

# 이후에 rename(trn_series.name + '_mean')을 적용하여 열 이름을 trn_series.name + '_mean'으로 변경합니다. 

# 결과적으로, ft_trn_series는 trn_series에 대한 target encoding 값을 가지는 Series 객체가 됩니다.


  ft_trn_series = pd.merge(
       trn_series.to_frame(trn_series.name),
       averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
       on=trn_series.name,
       how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)

  ft_trn_series.index = trn_series.index
  ft_val_series = pd.merge(
        val_series.to_frame(val_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=val_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
  ft_val_series.index = val_series.index
  ft_tst_series = pd.merge(
        tst_series.to_frame(tst_series.name),
        averages.reset_index().rename(columns={'index': target.name, target.name: 'average'}),
        on=tst_series.name,
        how='left')['average'].rename(trn_series.name + '_mean').fillna(prior)
    # pd.merge does not keep the index so restore it
  ft_tst_series.index = tst_series.index
  return add_noise(ft_trn_series, noise_level), add_noise(ft_val_series, noise_level), add_noise(ft_tst_series, noise_level)

In [12]:
# Read data
train_df = pd.read_csv('/content/train.csv', na_values="-1") # .iloc[0:200,:]
test_df = pd.read_csv('/content/test.csv', na_values="-1")

In [13]:
train_features = [
    "ps_car_13",  #            : 1571.65 / shadow  609.23
	"ps_reg_03",  #            : 1408.42 / shadow  511.15
	"ps_ind_05_cat",  #        : 1387.87 / shadow   84.72
	"ps_ind_03",  #            : 1219.47 / shadow  230.55
	"ps_ind_15",  #            :  922.18 / shadow  242.00
	"ps_reg_02",  #            :  920.65 / shadow  267.50
	"ps_car_14",  #            :  798.48 / shadow  549.58
	"ps_car_12",  #            :  731.93 / shadow  293.62
	"ps_car_01_cat",  #        :  698.07 / shadow  178.72
	"ps_car_07_cat",  #        :  694.53 / shadow   36.35
	"ps_ind_17_bin",  #        :  620.77 / shadow   23.15
	"ps_car_03_cat",  #        :  611.73 / shadow   50.67
	"ps_reg_01",  #            :  598.60 / shadow  178.57
	"ps_car_15",  #            :  593.35 / shadow  226.43
	"ps_ind_01",  #            :  547.32 / shadow  154.58
	"ps_ind_16_bin",  #        :  475.37 / shadow   34.17
	"ps_ind_07_bin",  #        :  435.28 / shadow   28.92
	"ps_car_06_cat",  #        :  398.02 / shadow  212.43
	"ps_car_04_cat",  #        :  376.87 / shadow   76.98
	"ps_ind_06_bin",  #        :  370.97 / shadow   36.13
	"ps_car_09_cat",  #        :  214.12 / shadow   81.38
	"ps_car_02_cat",  #        :  203.03 / shadow   26.67
	"ps_ind_02_cat",  #        :  189.47 / shadow   65.68
	"ps_car_11",  #            :  173.28 / shadow   76.45
	"ps_car_05_cat",  #        :  172.75 / shadow   62.92
	"ps_calc_09",  #           :  169.13 / shadow  129.72
	"ps_calc_05",  #           :  148.83 / shadow  120.68
	"ps_ind_08_bin",  #        :  140.73 / shadow   27.63
	"ps_car_08_cat",  #        :  120.87 / shadow   28.82
	"ps_ind_09_bin",  #        :  113.92 / shadow   27.05
	"ps_ind_04_cat",  #        :  107.27 / shadow   37.43
	"ps_ind_18_bin",  #        :   77.42 / shadow   25.97
	"ps_ind_12_bin",  #        :   39.67 / shadow   15.52
	"ps_ind_14",  #            :   37.37 / shadow   16.65
]
# add combinations
combs = [
    ('ps_reg_01', 'ps_car_02_cat'),  
    ('ps_reg_01', 'ps_car_04_cat'),
]

위 코드는 Scikit-learn의 LabelEncoder()를 사용하여 범주형 변수를 수치형 변수로 변환하는 코드입니다.

먼저, LabelEncoder()를 lbl 객체로 생성합니다. 이후, fit() 메소드를 사용하여 train 데이터와 test 데이터에서 변수 name1의 유니크한 값을 추출하고, 이를 알파벳순으로 정렬하여 각각의 클래스에 고유한 인덱스 번호를 부여합니다.

이후, transform() 메소드를 사용하여 train 데이터와 test 데이터에서 변수 name1의 값을 각각 인덱스 번호로 변환합니다. 이를 통해 범주형 변수를 수치형 변수로 변환하여 모델 학습에 사용할 수 있게 됩니다.

In [14]:
id_test = test_df['id'].values
id_train = train_df['id'].values
y = train_df['target']

start = time.time()
for n_c, (f1, f2) in enumerate(combs):
    name1 = f1 + "_plus_" + f2
    print('current feature %60s %4d in %5.1f'
          % (name1, n_c + 1, (time.time() - start) / 60), end='')
    print('\r' * 75, end='')
    
    #이게 정확히 뭔지 모르겠어
    #이걸 왜 하는거지?
    #값이 어디에 있는거지?
    train_df[name1] = train_df[f1].apply(lambda x: str(x)) + "_" + train_df[f2].apply(lambda x: str(x))
    test_df[name1] = test_df[f1].apply(lambda x: str(x)) + "_" + test_df[f2].apply(lambda x: str(x))
    # Label Encode
    lbl = LabelEncoder()
    lbl.fit(list(train_df[name1].values) + list(test_df[name1].values))
    train_df[name1] = lbl.transform(list(train_df[name1].values))
    test_df[name1] = lbl.transform(list(test_df[name1].values))

    train_features.append(name1)
    
X = train_df[train_features]
test_df = test_df[train_features]

f_cats = [f for f in X.columns if "_cat" in f]



In [20]:
#테스트
train_df[name1].values

array([70, 80,  0, ..., 90, 90, 10])

In [21]:
y_valid_pred = 0*y
y_test_pred = 0

In [22]:
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)
np.random.seed(0)

In [23]:
model = XGBClassifier(    
                        n_estimators=MAX_ROUNDS,
                        max_depth=4,
                        objective="binary:logistic",
                        learning_rate=LEARNING_RATE, 
                        subsample=.8,
                        min_child_weight=6,
                        colsample_bytree=.8,
                        scale_pos_weight=1.6,
                        gamma=10,
                        reg_alpha=8,
                        reg_lambda=1.3,
                     )

In [24]:
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:].copy(), X.iloc[test_index,:].copy()
    X_test = test_df.copy()
    print( "\nFold ", i)
    
    # Enocode data
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
                                                        trn_series=X_train[f],
                                                        val_series=X_valid[f],
                                                        tst_series=X_test[f],
                                                        target=y_train,
                                                        min_samples_leaf=200,
                                                        smoothing=10,
                                                        noise_level=0
                                                        )
    # Run model for this fold
    if OPTIMIZE_ROUNDS:
        eval_set=[(X_valid,y_valid)]
        fit_model = model.fit( X_train, y_train, 
                               eval_set=eval_set,
                               eval_metric=gini_xgb,
                               early_stopping_rounds=EARLY_STOPPING_ROUNDS,
                               verbose=False
                             )
        print( "  Best N trees = ", model.best_ntree_limit )
        print( "  Best gini = ", model.best_score )
    else:
        fit_model = model.fit( X_train, y_train )
        
    # Generate validation predictions for this fold
    pred = fit_model.predict_proba(X_valid)[:,1]
    print( "  Gini = ", eval_gini(y_valid, pred) )
    y_valid_pred.iloc[test_index] = pred
    
    # Accumulate test set predictions
    y_test_pred += fit_model.predict_proba(X_test)[:,1]
    
    del X_test, X_train, X_valid, y_train
    
y_test_pred /= K  # Average test set predictions

print( "\nGini for full training set:" )
eval_gini(y, y_valid_pred)


Fold  0


Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: non-precise type pyobject
During: typing of argument at <ipython-input-8-4f788ae9d301> (3)

File "<ipython-input-8-4f788ae9d301>", line 3:
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  ^

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "<ipython-input-8-4f788ae9d301>", line 9:
def eval_gini(y_true, y_prob):
    <source elided>
  n = len(y_true)
  for i in range(n-1,-1,-1):
  ^

  @jit

File "<ipython-input-8-4f788ae9d301>", line 3:
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  ^

Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.readthedocs.i

  Gini =  0.2865950079341033

Fold  1
  Gini =  0.28296717249610914

Fold  2
  Gini =  0.27633743707830494

Fold  3
  Gini =  0.29983102530363126

Fold  4
  Gini =  0.2842156908077206

Gini for full training set:


Compilation is falling back to object mode WITH looplifting enabled because Function "eval_gini" failed type inference due to: non-precise type pyobject
During: typing of argument at <ipython-input-8-4f788ae9d301> (3)

File "<ipython-input-8-4f788ae9d301>", line 3:
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  ^

  @jit
Compilation is falling back to object mode WITHOUT looplifting enabled because Function "eval_gini" failed type inference due to: Cannot determine Numba type of <class 'numba.core.dispatcher.LiftedLoop'>

File "<ipython-input-8-4f788ae9d301>", line 9:
def eval_gini(y_true, y_prob):
    <source elided>
  n = len(y_true)
  for i in range(n-1,-1,-1):
  ^

  @jit

File "<ipython-input-8-4f788ae9d301>", line 3:
def eval_gini(y_true, y_prob):
  y_true = np.asarray(y_true)
  ^

Fall-back from the nopython compilation path to the object mode compilation path has been detected, this is deprecated behaviour.

For more information visit https://numba.readthedocs.i

0.2857881755343149

In [25]:
val = pd.DataFrame()
val['id'] = id_train
val['target'] = y_valid_pred.values
val.to_csv('xgb_valid.csv', float_format='%.6f', index=False)

In [26]:
sub = pd.DataFrame()
sub['id'] = id_test
sub['target'] = y_test_pred
sub.to_csv('xgb_submit.csv', float_format='%.6f', index=False)