# Binary classification

- Porto Seguro's Safe Driver Prediction
- [자료1](https://www.kaggle.com/bertcarremans/data-preparation-exploration), [자료2](https://www.kaggle.com/arthurtok/interactive-porto-insights-a-plot-ly-tutorial), [자료3](https://www.kaggle.com/aharless/xgboost-cv-lb-284), [자료4](https://www.kaggle.com/gpreda/porto-seguro-exploratory-analysis-and-prediction)

## Loading

In [None]:
# 기본: 데이터 다루기
import numpy as np
import pandas as pd
from PIL import Image, ImageDraw, ImageFont
# 정규식
import re
import time

# Numba: python 연산을 더 빠르게 해주는 compiler를 이용할 수 있게 하는 library
from numba import jit
# compile 후 이용하듯 먼저 작은 값으로 연산시켜서 구성을 저장하고 실제 큰 값을 최적화시켜 이용하는 방식
import gc # Garbage Collector

from collections import Counter
import missingno as msno

# Sklearn package, 모델 적합에 이용
# sklearn.preprocessing.Imputer는 0.22 ver에서 삭제되었다.
from sklearn.utils import shuffle
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, LabelEncoder

from sklearn.feature_selection import VarianceThreshold, SelectFromModel, mutual_info_classif
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score

from sklearn.linear_model import LogisticRegression
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# plotting
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from IPython.display import Image as PImage

# 새로운 plotting package 등장
import plotly.offline as py
import plotly.graph_objs as go
import plotly.tools as tls
py.init_notebook_mode(connected=True)

# option
pd.set_option("display.max_columns", 100)
from subprocess import check_call
import warnings
warnings.filterwarnings("ignore")

In [None]:
train = pd.read_csv("../input/porto-seguros-safe-driver-prediction-dataset/train.csv")
test = pd.read_csv("../input/porto-seguros-safe-driver-prediction-dataset/test.csv")

In [None]:
train.head()

In [None]:
train.shape

In [None]:
test.shape

In [None]:
train.info()

In [None]:
# drop_duplicates() returns a new frame; rebind it so the de-duplication is
# actually kept instead of being thrown away.
train = train.drop_duplicates()
train.shape

내 이럴 줄 알았다. inplace 매개변수를 쓰던, output을 원 데이터로 해주던 초기화해주는 장치가 있어야 drop_duplicates()한 것을 저장할 수 있다.

## preprocessing: Metadata

- feature를 이용한 변수 분석, 시각화, 모델링 등에 도움이 되어 (좀 더 실정에 맞는 모델을 구성할 수 있음) 데이터 DataFrame 자체의 metadata를 아는 것은 중요하다.
- 그런데 여기서 얘기하는 metadata는 feature 데이터의 자체적 특성(or 분류)를 말하는 것 같다.

- **role**: input, ID, target
- **level**: nominal, interval, ordinal, binary
- **keep**: True or False
- **dtype**: int, float, str

In [None]:
# Substring -> category label, checked in this order (first hit wins).
category_tokens = (("ind", "individual"), ("reg", "registration"),
                   ("car", "car"), ("calc", "calculated"))

# One metadata record per column of `train`.
data = []
for f in train.columns:
    # The two special columns are their own role; everything else is an input.
    role = f if f in ("target", "id") else "input"

    # Measurement level: decided by the column name first, by dtype second.
    if f == "target" or "bin" in f:
        level = "binary"
    elif f == "id" or "cat" in f:
        level = "nominal"
    else:
        level = "interval" if train[f].dtype == float else "ordinal"

    category = next((label for token, label in category_tokens if token in f), "none")

    # Only the id column is excluded from further use.
    data.append({"varname": f, "role": role, "level": level, "keep": f != "id",
                 "dtype": train[f].dtype, "category": category})

In [None]:
meta = pd.DataFrame(data, columns=["varname", "role", "level", "keep", "dtype", "category"])
meta.set_index("varname", inplace=True)

In [None]:
meta

In [None]:
meta[(meta.level == "nominal") & (meta.keep)].index

In [None]:
pd.DataFrame({"count": meta.groupby(["category"])["category"].size()}).reset_index()

In [None]:
pd.DataFrame({"count": meta.groupby(["role", "level"])["role"].size()}).reset_index()

## Descriptive statistics

- explore the interval variables

In [None]:
v = meta[(meta.level == "interval") & (meta.keep)].index
train[v].describe()

reg variables

- -1 = NA이기 때문에 reg_03에서 결측값 존재 (설명에선 하나래)
- 다른 reg에 비해 (reg_03의) max 값이 꽤 큰 편이라 표준화시킬 예정인가봄

car variables

- car_12, car_14에 NA 존재
- 여기도 13, 15의 max가 큰 편이라 표준화시킬 생각인 듯하다.

calc variables

- no missing values
- 굳이 표준화시킬 필요 없음.

### Ordinal variables

In [None]:
v = meta[(meta.level == "ordinal") & (meta.keep)].index
train[v].describe()

car_11에만 missing values

### Binary variables

In [None]:
v = meta[(meta.level == "binary") & (meta.keep)].index
train[v].describe()

no missing values, and no scaling is needed

## Handling imbalanced classes

#### 작업하기 전에 target data 분포에 대해 확인 먼저 하기

In [None]:
data = [go.Bar(x=train["target"].value_counts().index.values,
               y=train["target"].value_counts().values,
               text="Distribution of target variable")]

layout = go.Layout(title="Target variable distribution")
fig = go.Figure(data=data, layout=layout)
py.iplot(fig, filename="basic-bar")

이를 통해 target data가 불균형하게 분포하고 있음을 알 수 있다. 불균형을 최소화하는 방향으로.

In [None]:
Counter(train.dtypes.values)

In [None]:
desired_apriori = 0.10

# Index labels of the rows in each class.
is_zero = train.target == 0
is_one = train.target == 1
idx_0 = train[is_zero].index
idx_1 = train[is_one].index

nb_0 = train.loc[idx_0].shape[0]
nb_1 = train.loc[idx_1].shape[0]

# Fraction of target == 0 rows to keep so that target == 1 rows make up
# `desired_apriori` of the resulting data.
undersampling_rate = ((1 - desired_apriori) * nb_1) / (nb_0 * desired_apriori)
undersampled_nb_0 = int(undersampling_rate * nb_0)
print("Rate to undersample records with target=0: {}".format(undersampling_rate))
print("Number of records with target=0 after undersampling: {}".format(undersampled_nb_0))

In [None]:
undersampled_idx = shuffle(idx_0, random_state=37, n_samples=undersampled_nb_0)
idx_list = list(undersampled_idx) + list(idx_1)
train = train.loc[idx_list].reset_index(drop=True)

## Data Quality Checks

결측값 찾기

In [None]:
train.isnull().any().any()

In [None]:
vars_with_missing = []

for f in train.columns:
    missings = train[train[f] == -1][f].count()
    if missings > 0:
        vars_with_missing.append(f)
        missings_perc = missings / train.shape[0]
        
        print("Variables {} has {} records ({:.2%}) with missing values"
             .format(f, missings, missings_perc))

print("In total, there are {} variables with missing values".format(len(vars_with_missing)))

In [None]:
# NaN-marked copy for the missingness plot. `replace` returns a new frame, so
# `train` keeps its -1 sentinels. `np.nan` is used because the `np.NaN` alias
# was removed in NumPy 2.0.
train_copy = train.replace(-1, np.nan)

In [None]:
msno.matrix(train_copy.iloc[:, 2:39], figsize=(20, 14), color=(0.42, 0.1, 0.05))

생각보다 결측값이 많네. car_02랑 car_11만 없는 것으로 보인다.

In [None]:
vars_to_drop = ["ps_car_03_cat", "ps_car_05_cat"]
train.drop(vars_to_drop, inplace=True, axis=1)
meta.loc[(vars_to_drop), "keep"] = False

In [None]:
# -1 is treated as the missing-value marker by both imputers.
mean_imp = SimpleImputer(missing_values=-1, strategy="mean")
mode_imp = SimpleImputer(missing_values=-1, strategy="most_frequent")

# Continuous features: fill gaps with the column mean.
for col in ("ps_reg_03", "ps_car_12", "ps_car_14"):
    train[col] = mean_imp.fit_transform(train[[col]]).ravel()

# ps_car_11 is classed as ordinal by the metadata, so a mean could produce a
# value that is not a valid level; fill with the most frequent value instead.
train["ps_car_11"] = mode_imp.fit_transform(train[["ps_car_11"]]).ravel()

### Checking the cardinality of the categorical variables

In [None]:
v = meta[(meta.level == "nominal") & (meta.keep)].index
for f in v:
    dist_values = train[f].value_counts().shape[0]
    print("Variable {} has {} distinct values".format(f, dist_values))

Script by https://www.kaggle.com/ogrellier<br />
Code: https://www.kaggle.com/ogrellier/python-target-encoding-for-categorical-features

In [None]:
def add_noise(series, noise_level):
    """Scale each element of ``series`` by ``1 + noise_level * z``.

    ``z`` is one standard-normal draw per element, taken from NumPy's global
    random state via ``np.random.randn``.
    """
    jitter = np.random.randn(len(series))
    return series * (noise_level * jitter + 1)

assert: 가정 설정문

- 형식: assert 조건, 메시지
- 조건을 만족하지 않으면 AssertionError를 발생시킨다.

In [None]:
def _apply_target_averages(series, lookup, prior, out_name):
    """Map each category in ``series`` to its smoothed average from ``lookup``."""
    encoded = pd.merge(
        series.to_frame(series.name), lookup, on=series.name, how="left"
    )["average"].rename(out_name).fillna(prior)
    # merge() hands back a fresh positional index; restore the caller's index.
    encoded.index = series.index
    return encoded


def target_encode(trn_series=None, tst_series=None, val_series=None, target=None,
                  min_samples_leaf=1, smoothing=1, noise_level=0):
    """Smoothed target (mean) encoding of one categorical feature.

    For every category seen in ``trn_series`` the encoded value is a blend of
    the category's target mean and the global target mean (the prior). The
    weight on the category mean is ``sigmoid((count - min_samples_leaf) /
    smoothing)``, so small categories are pulled towards the prior.

    Parameters
    ----------
    trn_series : pd.Series
        Training categorical feature; the averages are computed from it.
    tst_series : pd.Series
        Test categorical feature; must have the same name as ``trn_series``.
    val_series : pd.Series, optional
        Validation categorical feature, encoded with the training averages.
    target : pd.Series
        Target values, same length as ``trn_series``.
    min_samples_leaf : int
        Category count at which the blend weight is exactly 0.5.
    smoothing : int or float
        Scale of the sigmoid; larger values make the transition between
        prior and category mean more gradual.
    noise_level : float
        Multiplicative Gaussian noise level passed to ``add_noise``.

    Returns
    -------
    tuple of pd.Series
        ``(train, test)`` when ``val_series`` is None, otherwise
        ``(train, validation, test)``. Each result is named
        ``<trn_series.name>_mean`` and keeps the index of its input. Categories
        with no match in the training averages get the prior.
    """
    assert len(trn_series) == len(target)
    assert trn_series.name == tst_series.name

    temp = pd.concat([trn_series, target], axis=1)
    averages = temp.groupby(trn_series.name)[target.name].agg(["mean", "count"])
    # Kept under its own name so the `smoothing` parameter is not overwritten.
    weight = 1 / (1 + np.exp(-(averages["count"] - min_samples_leaf) / smoothing))
    prior = target.mean()

    averages[target.name] = prior * (1 - weight) + averages["mean"] * weight
    averages.drop(["mean", "count"], axis=1, inplace=True)

    # Build the category -> average lookup once and reuse it for every split.
    lookup = averages.reset_index().rename(
        columns={"index": target.name, target.name: "average"})
    out_name = trn_series.name + "_mean"

    ft_trn_series = _apply_target_averages(trn_series, lookup, prior, out_name)
    ft_tst_series = _apply_target_averages(tst_series, lookup, prior, out_name)

    if val_series is not None:
        ft_val_series = _apply_target_averages(val_series, lookup, prior, out_name)
        return (add_noise(ft_trn_series, noise_level),
                add_noise(ft_val_series, noise_level),
                add_noise(ft_tst_series, noise_level))

    return add_noise(ft_trn_series, noise_level), add_noise(ft_tst_series, noise_level)

In [None]:
train_encoded, test_encoded = target_encode(train["ps_car_11_cat"], test["ps_car_11_cat"],
                                            target=train.target, min_samples_leaf=100,
                                            smoothing=10, noise_level=0.01)

In [None]:
train["ps_car_11_cat_te"] = train_encoded
train.drop("ps_car_11_cat", axis=1, inplace=True)

meta.loc["ps_car_11_cat", "keep"] = False

test["ps_car_11_cat_te"] = test_encoded
test.drop("ps_car_11_cat", axis=1, inplace=True)

## Exploratory Data Visualization

### Categorical variables

- \_, ax = plt.subplots(3, 4)로 구제하고 싶었지만, sns라 그럴 수 없었다.

In [None]:
v = meta[(meta.level == "nominal") & (meta.keep)].index
for f in v:
    plt.figure(figsize=(20, 10))

    # Mean of target per category, highest first.
    cat_perc = train[[f, "target"]].groupby([f], as_index=False).mean()
    cat_perc.sort_values("target", ascending=False, inplace=True)

    # x / y are passed by keyword: seaborn >= 0.12 no longer accepts them
    # positionally.
    sns.barplot(x=f, y="target", data=cat_perc, order=cat_perc[f])

    plt.ylabel("% target", fontsize=18)
    plt.xlabel(f, fontsize=18)
    plt.tick_params(axis="both", which="major", labelsize=18)
    plt.show()

데이터 타입만 고려한 거라 다를 수 있지만, interval이 float를 대변하기에 int에 관한 것도 찍어봤다.

In [None]:
# Are the ordinal features really the integer-typed ones?
v = meta[(meta.dtype == int) & (meta.keep)].index
plotting_data = [go.Heatmap(
    z=train[v].corr().values, x=train[v].columns.values, y=train[v].columns.values,
    colorscale="Viridis", reversescale=False, opacity=1.0)]

layout = go.Layout(
    title="Pearson Correlation of Integer-type features",
    xaxis=dict(ticks='', nticks=36), yaxis=dict(ticks=''), width=900, height=700)
# Use the heatmap trace built above; `data` is a leftover from earlier cells.
fig = go.Figure(data=plotting_data, layout=layout)
py.iplot(fig, filename="labelled-heatmap")

### Interval variables

In [None]:
intervals = meta[(meta.level == "interval") & (meta.keep)].index
targets = meta[(meta.role == "target")].index

mf = mutual_info_classif(train[intervals].values, train[targets].values, n_neighbors=3,
                         random_state=17)
print(mf)

k-최근접 이웃(KNN) 거리 기반 엔트로피 추정으로 interval type input과 target 간의 상호 정보량(mutual information), 즉 의존도를 확인했다. 상관계수와 달리 비선형 관계도 포착하며, 값은 0 이상이고 클수록 의존도가 높다.

In [None]:
def corr_heatmap(v):
    """Show an annotated correlation heatmap for the columns ``v`` of ``train``."""
    palette = sns.diverging_palette(220, 10, as_cmap=True)
    corr_matrix = train[v].corr()

    plt.figure(figsize=(10, 10))
    heatmap_options = dict(cmap=palette, vmax=1.0, center=0, fmt=".2f", square=True,
                           linewidths=.5, annot=True, cbar_kws={"shrink": .75})
    sns.heatmap(corr_matrix, **heatmap_options)
    plt.show()

In [None]:
v = meta[(meta.level == "interval") & (meta.keep)].index
corr_heatmap(v)

고려해야 할 상관계수

- reg_02, reg_03: 0.7
- car_12, car_13: 0.67
- car_12, car_14: 0.58
- car_13, car_15: 0.53

컴퓨터 부담 덜 주겠다고 sampling하시겠답디다. 그리고 sampling했기 때문에 그림이 다를 수 있다.

In [None]:
s = train.sample(frac=0.1)

In [None]:
# plt.figure(figsize=(15, 15))
# x / y by keyword: on seaborn >= 0.12 the first positional argument is `data`.
sns.lmplot(x="ps_reg_02", y="ps_reg_03", data=s, hue="target", palette="Set1",
           scatter_kws={"alpha": 0.3})
plt.show()

In [None]:
# plt.figure(figsize=(15, 15))
# x / y by keyword: on seaborn >= 0.12 the first positional argument is `data`.
sns.lmplot(x="ps_car_12", y="ps_car_13", data=s, hue="target", palette="Set1",
           scatter_kws={"alpha": 0.3})
plt.show()

In [None]:
# plt.figure(figsize=(15, 15))
# x / y by keyword: on seaborn >= 0.12 the first positional argument is `data`.
sns.lmplot(x="ps_car_12", y="ps_car_14", data=s, hue="target", palette="Set1",
           scatter_kws={"alpha": 0.3})
plt.show()

In [None]:
# plt.figure(figsize=(15, 15))
# x / y by keyword: on seaborn >= 0.12 the first positional argument is `data`.
sns.lmplot(x="ps_car_13", y="ps_car_15", data=s, hue="target", palette="Set1",
           scatter_kws={"alpha": 0.3})
plt.show()

심심하니까 ps_car_12랑 ps_car_15도 확인해보자.

In [None]:
sample = s[["ps_car_12", "ps_car_15", "target"]]
sns.pairplot(sample, hue="target", palette="Set1", diag_kind="kde")
plt.show()

저것만 보기엔 그래프가 굉장히 많은 정보를 담고 있으니 상관계수가 높은 feature는 다 봐보자.

In [None]:
var = ["ps_reg_01", "ps_reg_02", "ps_reg_03", "ps_car_12", "ps_car_13", "ps_car_15", "target"]
sample = s[var]
sns.pairplot(sample, hue="target", palette="Set1", diag_kind="kde")
plt.show()

target에 대해 가우스 분포 쪽으로 한 번 더 확인해보자.

In [None]:
v = meta[(meta.level == "interval") & (meta.keep)].index
# Rows split by class; t1 / t0 are also read by the binary-feature density cell.
t1, t0 = train.loc[train["target"] != 0], train.loc[train["target"] == 0]

# plt.subplots() already creates the figure; a separate plt.figure() call
# would only leave an empty extra figure in the output.
_, _ = plt.subplots(3, 4, figsize=(16, 12))

for i, feature in enumerate(v, start=1):
    plt.subplot(3, 4, i)

    # `bw` is deprecated since seaborn 0.11; `bw_method` is its replacement.
    sns.kdeplot(t1[feature], bw_method=.5, label="target = 1")
    sns.kdeplot(t0[feature], bw_method=.5, label="target = 0")

    plt.ylabel("Density plot", fontsize=12)
    plt.xlabel(feature, fontsize=12)
    plt.tick_params(axis="both", which="major", labelsize=12)
    # Show the labels explicitly so the two curves can be told apart.
    plt.legend()
plt.show()

### Checking the Binary features inspection

0, 1 비율을 모든 binary에 대해 한 번에 나타내고자 함.

In [None]:
bin_col = meta[(meta.level == "binary") & (meta.keep)].index
bin_col = train[bin_col].columns

In [None]:
zero_list = []
one_list = []
for col in bin_col:
    zero_list.append((train[col] == 0).sum())
    one_list.append((train[col] == 1).sum())

In [None]:
trace1 = go.Bar(x=bin_col, y=zero_list, name="Zero count")
trace2 = go.Bar(x=bin_col, y=one_list, name="One count")

layout = go.Layout(barmode="stack", title="Count of 1 and 0 in binary variables")
fig = go.Figure(data=[trace1, trace2], layout=layout)
py.iplot(fig, filename="stacked-bar")

interval이 그랬던 것처럼 얘도 가우스분포(앞에도 그랬지만, 다차항이 아니고 정확히 정규분포도 아니라서 그냥 종모양 분포를 얘기하는 것)로 확인해보자.

In [None]:
# plt.subplots() already creates the figure; a separate plt.figure() call
# would only leave an empty extra figure in the output.
_, _ = plt.subplots(6, 3, figsize=(12, 24))

# t1 / t0 are the per-class row subsets built in the interval-variable cell.
for i, feature in enumerate(bin_col, start=1):
    plt.subplot(6, 3, i)

    # `bw` is deprecated since seaborn 0.11; `bw_method` is its replacement.
    sns.kdeplot(t1[feature], bw_method=.5, label="target = 1")
    sns.kdeplot(t0[feature], bw_method=.5, label="target = 0")

    plt.ylabel("Density plot", fontsize=12)
    plt.xlabel(feature, fontsize=12)
    plt.tick_params(axis="both", which="major", labelsize=12)
    # Show the labels explicitly so the two curves can be told apart.
    plt.legend()
plt.show()

### Checking the correlations between ordinal variables

In [None]:
v = meta[(meta.level == "ordinal") & (meta.keep)].index
corr_heatmap(v)

## Feature engineering

### creating dummy variables

In [None]:
v = meta[(meta.level == "nominal") & (meta.keep)].index
print("Before dummification we have {} variables in train".format(train.shape[1]))

In [None]:
train = pd.get_dummies(train, columns=v, drop_first=True)
print("After dummification we have {} variables in train".format(train.shape[1]))

### creating interaction variables

In [None]:
v = meta[(meta.level == "interval") & (meta.keep)].index
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly_values = poly.fit_transform(train[v])

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(poly, "get_feature_names_out"):
    poly_names = poly.get_feature_names_out(v)
else:
    poly_names = poly.get_feature_names(v)

# Reuse train's index so the column-wise concat that follows pairs rows by
# label rather than relying on both frames having a default index.
interactions = pd.DataFrame(data=poly_values, columns=poly_names, index=train.index)
# The degree-1 terms are the original columns, which train already has.
interactions.drop(v, axis=1, inplace=True)
print("Before creating interactions we have {} variables in train".format(train.shape[1]))
print("Before creating interactions we have {} variables in train".format(train.shape[1]))

In [None]:
train = pd.concat([train, interactions], axis=1)
print("After creating interactions we have {} variables in train".format(train.shape[1]))

## Feature selection

### Removing features with low or zero variance

In [None]:
selector = VarianceThreshold(threshold=.01)
selector.fit(train.drop(["id", "target"], axis=1))

In [None]:
f = np.vectorize(lambda x: not x)
v = train.drop(["id", "target"], axis=1).columns[f(selector.get_support())]
print("{} variables have too low variance".format(len(v)),
      "These variables are {}".format(list(v)), sep="\n")

### Feature importance: Random Forest

In [None]:
X_train = train.drop(["id", "target"], axis=1)
y_train = train["target"]
feat_labels = X_train.columns

rf에 estimator를 1000개 넣더니 제대로 돌아가지 않는다. 주의를 요한다.

In [None]:
# rf = RandomForestClassifier(n_estimators=1000, random_state=0, n_jobs=-1)
rf = RandomForestClassifier(n_estimators=150, max_depth=8, min_samples_leaf=4,
                            max_features=0.2, n_jobs=-1, random_state=0)
rf.fit(X_train, y_train)
importances = rf.feature_importances_
print("Random Forest Training Done!!")

In [None]:
indices = np.argsort(rf.feature_importances_)[::-1]
for f in range(X_train.shape[1]):
    print("%2d) %-*s %f" % (f + 1, 30, feat_labels[indices[f]], importances[indices[f]]))

당연히 숫자로만 보기엔 밋밋하니까 시각화!

In [None]:
trace = go.Scatter(
    y=importances, x=feat_labels, mode="markers",
    marker=dict(sizemode="diameter", sizeref=1, size=13, color=importances,
#                 size=importances, color=np.random.randn(500),
                colorscale="Portland", showscale=True), text=feat_labels)

layout = go.Layout(
    autosize=True, title="Random Forest Feature Importance", hovermode="closest",
    xaxis=dict(ticklen=5, showgrid=False, zeroline=False, showline=False),
    yaxis=dict(title="Feature Importance", ticklen=5, showgrid=False, zeroline=False, gridwidth=2),
    showlegend=False)
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename="scatter2010")

In [None]:
x, y = (list(x) for x in zip(*sorted(zip(importances, feat_labels), reverse=False)))
trace = go.Bar(x=x, y=y, marker=dict(color=x, colorscale="Viridis", reversescale=True),
               name="Random Forest Feature importance", orientation='h')

layout = dict(title="Barplot of Feature importances", width=900, height=2000,
              yaxis=dict(showgrid=False, showline=False, showticklabels=True))
fig = go.Figure(data=[trace])
fig["layout"].update(layout)
py.iplot(fig, filename="plots")

#### Selecting features with a Random Forest and SelectFromModel

In [None]:
sfm = SelectFromModel(rf, threshold="median", prefit=True)
print("Number of features before selection: {}".format(X_train.shape[1]))

In [None]:
n_features = sfm.transform(X_train).shape[1]
print("Number of features after selection: {}".format(n_features))

In [None]:
selected_vars = list(feat_labels[sfm.get_support()])
train = train[selected_vars + ["target"]]

### Feature importance: Gradient Boosting model

In [None]:
gb = GradientBoostingClassifier(n_estimators=100, max_depth=3, min_samples_leaf=4,
                                max_features=0.2, random_state=0)
gb.fit(X_train, y_train)
importances = gb.feature_importances_
print("Gradient Boosting model Training Done!!")

In [None]:
trace = go.Scatter(
    y=importances, x=feat_labels, mode="markers",
    marker=dict(sizemode="diameter", sizeref=1, size=13, color=importances,
#                 size=importances, color=np.random.randint(500),
                colorscale="Portland", showscale=True), text=feat_labels)

layout = go.Layout(
    autosize=True, title="Gradient Boosting Machine Feature Importance", hovermode="closest",
    xaxis=dict(ticklen=5, showgrid=False, zeroline=False, showline=False),
    yaxis=dict(title="Feature Importance", ticklen=5, showgrid=False, zeroline=False,
               gridwidth=2), showlegend=False)
fig = go.Figure(data=[trace], layout=layout)
py.iplot(fig, filename="scatter2010")

In [None]:
x, y = (list(x) for x in zip(*sorted(zip(importances, feat_labels), reverse=False)))
trace = go.Bar(x=x, y=y, marker=dict(color=x, colorscale="Viridis", reversescale=True),
               name="Gradient Boosting Classifier Feature importance", orientation='h')

layout = dict(title="Barplot of Feature importances", width=900, height=2000,
              yaxis=dict(showgrid=False, showline=False, showticklabels=True))
fig = go.Figure(data=[trace])
fig["layout"].update(layout)
py.iplot(fig, filename="plots")

### Decision Tree visualization

In [None]:
dt = tree.DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)

In [None]:
# with open("tree1.dot", 'w') as f:
#     f = tree.export_graphviz(dt, out_file=f, max_depth=4, impurity=False,
#                              feature_name=feat_labels, class_names=["No", "Yes"],
#                              rounded=True, filled=True)
# check_call(["dot", "-Tpng", "tree1.dot", "-o", "tree1.png"])

# img = Image.open("tree1.png")
# draw = ImageDraw(img)
# img.save("sample-out.png")
# PImage("sample-out.png",)

### XGBoost CV

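평가 지표로는 Gini 계수 값을 보자. 여기서의 Gini는 decision tree의 분할 기준으로 많이 이용하는 Gini impurity가 아니라, 예측값의 순위로 계산하는 normalized Gini 계수(동점이 없으면 2 × AUC − 1)다.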

In [None]:
MAX_ROUNDS = 400
OPTIMIZE_ROUNDS = False
LEARNING_RATE = 0.07
EARLY_STOPPING_ROUNDS = 50

In [None]:
try:
    from numba import jit
except ImportError:  # numba only speeds the loop up; plain Python works too
    def jit(func):
        return func


@jit
def _gini_from_sorted(y_sorted):
    """Gini score from labels already ordered by ascending predicted score."""
    ntrue = 0
    gini = 0
    delta = 0
    n = len(y_sorted)
    # Walk from the highest score down. `delta` counts the zeros seen so far,
    # so `gini` ends up as the number of (one, zero) pairs in which the zero
    # was scored higher than the one.
    for i in range(n - 1, -1, -1):
        yi = y_sorted[i]
        ntrue += yi
        gini += yi * delta
        delta += 1 - yi
    return 1 - 2 * gini / (ntrue * (n - ntrue))


def eval_gini(y_true, y_prob):
    """Return the Gini score of ``y_prob`` against the labels ``y_true``.

    The result is ``1 - 2 * m / (P * N)``, where ``m`` is the number of
    (positive, negative) pairs ranked the wrong way round and ``P`` / ``N``
    are the class counts. This equals ``2 * AUC - 1`` when no scores are tied.
    Assumes the labels are 0/1.

    Both inputs are converted to NumPy arrays here, outside the compiled
    kernel, so pandas Series and lists are accepted as well.
    """
    y_true = np.asarray(y_true)
    order = np.argsort(np.asarray(y_prob))
    return _gini_from_sorted(y_true[order])

In [None]:
def gini_xgb(preds, dtrain):
    """Evaluation callback: report the negated Gini of ``preds`` as "gini".

    ``dtrain`` only needs a ``get_label()`` method that returns the true labels.
    """
    score = eval_gini(dtrain.get_label(), preds)
    return [("gini", -score)]

In [None]:
combs = [("ps_reg_01", "ps_car_02_cat"), ("ps_reg_01", "ps_car_04_cat")]

In [None]:
# NOTE(review): the feature-selection cell rebuilds `train` from
# `selected_vars + ["target"]`, and `selected_vars` comes from a frame with
# "id" dropped, so `train.id` looks like it will fail if the cells are run in
# order. Confirm before running this section.
id_train = train.id.values
id_test = test.id.values
# Labels used for the cross-validation folds.
y = train["target"]

In [None]:
new_features = []
start = time.time()
for nc, (f1, f2) in enumerate(combs):
    name = f1 + "_plus_" + f2
    print("current feature %60s %4d in %5.1f" % (name, nc + 1, (time.time() - start) / 60))
    
    train[name] = train[f1].apply(lambda x: str(x)) + "_" + train[f2].apply(lambda x: str(x))
    test[name] = test[f1].apply(lambda x: str(x)) + "_" + test[f2].apply(lambda x: str(x))
    
    lbl = LabelEncoder()
    lbl.fit(list(train[name].values) + list(test[name].values))
    train[name] = lbl.transform(list(train[name].values))
    test[name] = lbl.transform(list(test[name].values))
    
    new_features.append(name)

In [None]:
# Columns to target-encode: the nominal features still marked as kept, plus
# any engineered combination whose name contains "_cat". Built as one flat
# list of column names, since each entry is later used as a column key.
nominal_kept = meta[(meta.level == "nominal") & (meta.keep)].index
f_cats = list(nominal_kept) + [f for f in new_features if "_cat" in f]

In [None]:
# Holder for out-of-fold predictions: zeros with y's index and name. Cast to
# float because predicted probabilities are written into it; if y is
# integer-typed, 0 * y would be an integer Series.
y_valid_pred = (0 * y).astype(float)
# Running sum of the per-fold test predictions.
y_test_pred = 0

In [None]:
K = 5
kf = KFold(n_splits=K, random_state=1, shuffle=True)
np.random.seed(0)

In [None]:
model = XGBClassifier(
    n_estimators=MAX_ROUNDS, max_depth=4, objective="binary:logistic",
    learning_rate=LEARNING_RATE, subsample=.8, min_child_weight=6, colsample_bytree=.8,
    scale_pos_weight=1.6, gamma=10, reg_alpha=8, reg_lambda=1.3)

CV할 거다, 파이팅 컴퓨터.

In [None]:
# NOTE(review): `X` is not defined in any earlier cell; it is built here from
# the non-id, non-target columns of `train`. Confirm this is the intended
# feature set.
X = train.drop(["id", "target"], axis=1, errors="ignore")

for i, (train_index, test_index) in enumerate(kf.split(train)):
    # Positional split of labels and features for this fold.
    y_train, y_valid = y.iloc[train_index].copy(), y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index, :].copy(), X.iloc[test_index, :].copy()
    # NOTE(review): assumes `test` carries the same feature columns as `X`;
    # verify, since the model is later asked to predict on it.
    X_test = test.copy()
    print("\nFold", i)

    # Encodings are fitted on this fold's training rows only, then applied to
    # the validation and test rows.
    for f in f_cats:
        X_train[f + "_avg"], X_valid[f + "_avg"], X_test[f + "_avg"] = target_encode(
            X_train[f], X_test[f], X_valid[f], y_train, 200, 10, 0)

    if OPTIMIZE_ROUNDS:
        eval_set = [(X_valid, y_valid)]
        fit_model = model.fit(X_train, y_train, eval_set=eval_set, eval_metric=gini_xgb,
                              early_stopping_rounds=EARLY_STOPPING_ROUNDS, verbose=False)
        print("\tBest N trees =", model.best_ntree_limit, "\tBest gini =", model.best_score)
    else:
        fit_model = model.fit(X_train, y_train)

    # Positive-class probability: predict() returns 1-D class labels, so it
    # cannot be indexed with [:, 1] and would not rank rows for the Gini score.
    pred = fit_model.predict_proba(X_valid)[:, 1]
    print("\tGini =", eval_gini(y_valid, pred))
    y_valid_pred.iloc[test_index] = pred
    y_test_pred += fit_model.predict_proba(X_test)[:, 1]

    # Free the per-fold copies before the next iteration.
    del X_test, X_train, X_valid, y_train

# Average the accumulated test predictions over the K folds.
y_test_pred /= K
print("\nGini for full training set:", eval_gini(y, y_valid_pred))

In [None]:
# val = pd.DataFrame()
# val["id"] = id_train
# val["target"] = y_valid_pred.values
# val.to_csv("xgb_valid.csv", float_format="%.6f", index=False)

In [None]:
# sub = pd.DataFrame()
# sub["id"] = id_test
# sub["target"] = y_test_pred
# sub.to_csv("xgb_submit.csv", float_format="%.6f", index=False)

## Feature scaling

# Fit a StandardScaler on every column except the label. The transformed
# array is only displayed as the cell output; it is not stored.
features = train.drop(columns=["target"])
scaler = StandardScaler()
scaler.fit_transform(features)