In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
# The unused sub-directory list is bound to `_` on purpose: that name stays a
# notebook-level variable after the loop.
for folder, _, names_in_folder in os.walk('/kaggle/input'):
    for entry in names_in_folder:
        full_path = os.path.join(folder, entry)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [19]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import gc
import datetime
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgbm
from lightgbm import early_stopping
import warnings
# Number of folds used by the stratified cross-validation splitter.
NUM_FOLDS = 5

# Suppress every warning for the rest of the session. Note that this also hides
# deprecation warnings raised by the libraries used below.
warnings.simplefilter("ignore")

In [6]:
%%time
# Load the training statements from the parquet dataset.
train_parquet_path = "/kaggle/input/amex-data-integer-dtypes-parquet-format/train.parquet"
df_train = pd.read_parquet(train_parquet_path)

In [8]:
# Convert S_2 to datetime.
df_train["S_2"] = pd.to_datetime(df_train["S_2"])

# "days": 1-based day offset of each row from the customer's earliest S_2.
first_date_per_customer = df_train.groupby(["customer_ID"])["S_2"].transform("min")
df_train["days"] = (df_train["S_2"] - first_date_per_customer).dt.days.astype("int16") + 1

# float32 -> float16 (halves the storage of those columns, at reduced precision)
float32_columns = df_train.columns[df_train.dtypes=="float32"]
for col in float32_columns:
    df_train[col] = df_train[col].astype("float16")

In [9]:
# Force a garbage-collection pass; as the last expression of the cell, the
# value returned by gc.collect() is shown as the cell output.
# NOTE(review): presumably meant to release the float32 columns replaced above.
gc.collect()

In [10]:
# Overall frame shape, then the number of rows per customer_ID
# (count of distinct customers first, then the per-customer row counts).
rows_per_customer = df_train['customer_ID'].value_counts()
print(df_train.shape)
print(rows_per_customer.shape)
print(rows_per_customer)
print()

In [11]:
# Keep only the last row of every customer and index the result by customer_ID.
df_train = (
    df_train
    .groupby(["customer_ID"])
    .tail(1)
    .set_index('customer_ID')
)

In [12]:
%%time
# Read the training labels and print the frame summary (dtypes, memory usage).
train_labels_path = "/kaggle/input/amex-default-prediction/train_labels.csv"
df_train_labels = pd.read_csv(train_labels_path)
df_train_labels.info()

In [13]:
# Store the label column as int8, then show the shape and the first rows.
df_train_labels = df_train_labels.astype({"target": "int8"})
print(df_train_labels.shape)
df_train_labels.head()

In [14]:
%%time
# Attach the label to every customer row. customer_ID is the index of df_train
# at this point (set in an earlier cell) and a regular column of the labels frame.
df_train = pd.merge(df_train, df_train_labels, how='left', on="customer_ID")
print(df_train.shape)
print(df_train.head())

# The labels frame is no longer needed once merged.
del df_train_labels
gc.collect()

In [15]:
# https://www.kaggle.com/code/cdeotte/xgboost-starter-0-793/notebook
# https://www.kaggle.com/kyakovlev
# https://www.kaggle.com/competitions/amex-default-prediction/discussion/327534
def amex_metric_mod(y_true, y_pred):
    """Compute the weighted capture-rate / Gini-ratio score used for validation.

    Rows with label 0 get weight 20 and all other rows weight 1. The score is
    the mean of two terms:

    * ``top_four``: share of the summed labels found among the rows that, when
      sorted by descending prediction, fit inside the first 4% of total weight.
    * the ratio of the weighted Gini sum obtained when sorting by prediction
      to the one obtained when sorting by the true label.

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of predicted scores, same length as ``y_true``.

    Returns
    -------
    tuple
        ``(score, None)``. The tuple shape is kept for existing callers that
        index ``[0]``; the second element is a placeholder. It used to be the
        bare name ``_``, which is not defined inside this function and raised
        ``NameError`` (or leaked unrelated interpreter state) depending on the
        environment.

    Notes
    -----
    If ``y_true`` contains no positive label the denominators are zero and the
    result is not a finite number.
    """
    # Column 0 holds the labels, column 1 the predictions.
    labels = np.transpose(np.array([y_true, y_pred]))

    # Share of positives captured within the top 4% of cumulative weight.
    by_pred    = labels[labels[:, 1].argsort()[::-1]]
    weights    = np.where(by_pred[:, 0] == 0, 20, 1)
    cut_vals   = by_pred[np.cumsum(weights) <= int(0.04 * np.sum(weights))]
    top_four   = np.sum(cut_vals[:, 0]) / np.sum(by_pred[:, 0])

    # gini[1]: rows ordered by prediction; gini[0]: rows ordered by true label.
    gini = [0, 0]
    for i in [1, 0]:
        ordered        = labels[labels[:, i].argsort()[::-1]]
        weight         = np.where(ordered[:, 0] == 0, 20, 1)
        weight_random  = np.cumsum(weight / np.sum(weight))
        total_pos      = np.sum(ordered[:, 0] * weight)
        cum_pos_found  = np.cumsum(ordered[:, 0] * weight)
        lorentz        = cum_pos_found / total_pos
        gini[i]        = np.sum((lorentz - weight_random) * weight)

    return 0.5 * (gini[1] / gini[0] + top_four), None

In [16]:
# Model inputs: every column except the label, the customer id and the date.
FEATURES = df_train.columns.drop(["target","customer_ID","S_2"])

# Columns to be handled as categorical by the model.
categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

# Positions of the categorical columns inside FEATURES (the model is fed plain
# arrays, so the columns are identified by index rather than by name).
cat_col = [
    position
    for position, feature_name in enumerate(FEATURES)
    if feature_name in categorical_cols
]
cat_col

In [17]:
# Empty containers for the cross-validation run; the training cell further
# down assigns all of them again before use.
params = dict()                 # model hyper-parameters
models = list()                 # fitted model of each fold
scores = list()                 # score of each fold
feature_importances = list()    # feature importance vector of each fold
yval = list()                   # out-of-fold labels
pred_val = list()               # out-of-fold predictions

In [20]:
# Cross-validation splitter: for each fold it yields the row positions of the
# training set and of the validation set.
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2022)

# Materialise every (fold number, (train positions, validation positions)) pair
# so the cell displays them.
fold_index_pairs = skf.split(df_train[FEATURES], df_train["target"])
list(enumerate(fold_index_pairs))

In [21]:
%%time
# Stratified K-fold training of LightGBM classifiers.
#
# Results left in the notebook namespace:
#   models              - fitted classifier of each fold
#   feature_importances - feature importance vector of each fold
#   scores              - validation score of each fold
#   pred_val / yval     - out-of-fold predictions and their labels
#   score               - metric over all out-of-fold predictions (also appended to score.txt)

# Hyper-parameters are identical for every fold, so build them once.
params = {
    "num_iterations": 10000,
    'learning_rate': 0.05,
}
feature_importances = []  # feature importance per fold
scores = []               # score per fold
models = []               # fitted model per fold
pred_val = []
yval = []

# Cross-validation splitter
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=2022)

# skf.split() yields row *positions*, so rows and columns are selected by
# position (.iloc / array indexing). Label-based .loc only gave the same rows
# as long as df_train carried a default 0..n-1 index.
feature_pos = df_train.columns.get_indexer(FEATURES)
assert (feature_pos >= 0).all(), "FEATURES contains columns missing from df_train"
target_values = df_train["target"].values

# Split the data fold by fold. The splitter only needs the number of rows of
# its first argument, so the frame is passed as is instead of a column copy.
for fold, (train_idx, val_idx) in enumerate(skf.split(df_train, target_values)):

    print('FOLD:', fold)

    # Split the data
    X_train = df_train.iloc[train_idx, feature_pos].values
    y_train = target_values[train_idx]
    X_val = df_train.iloc[val_idx, feature_pos].values
    y_val = target_values[val_idx]

    print("y_train t=0 count:", len(y_train[y_train==0]))
    print("y_train t=1 count:", len(y_train[y_train==1]))
    print("y_val t=0 count:", len(y_val[y_val==0]))
    print("y_val t=1 count:", len(y_val[y_val==1]))

    # LightGBM model. Evaluation logging goes through the log_evaluation
    # callback: the `verbose` argument of fit() is deprecated and no longer
    # accepted by recent LightGBM releases.
    model = lgbm.LGBMClassifier(**params).fit(
        X_train, y_train,
        eval_set=[(X_val, y_val), (X_train, y_train)],
        callbacks=[early_stopping(100), lgbm.log_evaluation(100)],
        categorical_feature=cat_col
    )

    # Feature importance and out-of-fold predictions
    feature_importances.append(model.feature_importances_)
    models.append(model)
    fold_pred = model.predict_proba(X_val)[:, 1]
    pred_val = np.append(pred_val, fold_pred)
    yval = np.append(yval, y_val)

    # Per-fold score (the `scores` list was declared for this but never filled).
    fold_score = amex_metric_mod(y_val, fold_pred)[0]
    scores.append(fold_score)
    print('fold score:', fold_score)

    del X_train, y_train, X_val, y_val, model, fold_pred
    gc.collect()


score = amex_metric_mod(yval, pred_val)[0]
print('score:', score)

# Append one score per line; the context manager closes the file even if the
# write fails, and the newline keeps successive runs from running together.
with open("score.txt", "a") as f:
    f.write(str(score) + "\n")

In [22]:
# Drop the training frame and the cross-validation leftovers, then collect.
del df_train
del train_idx, val_idx
del yval, pred_val
gc.collect()

In [23]:
# Feature importance table: one "imp<k>" column per trained fold plus the mean
# over folds, sorted from most to least important.
df_feat_imp = pd.DataFrame(index=FEATURES)

# One column per fold actually trained, instead of five hard-coded columns
# (which raised IndexError with fewer folds and ignored any extra ones).
for fold_no, fold_importance in enumerate(feature_importances):
    df_feat_imp[f"imp{fold_no}"] = fold_importance
df_feat_imp["mean_imp"] = df_feat_imp.mean(axis=1).values

df_feat_imp = df_feat_imp.sort_values(by="mean_imp",ascending=False)

df_feat_imp.to_csv("feat_imp.csv")

# Draw on the explicitly created axes rather than on pyplot's implicit state.
fig, ax = plt.subplots(figsize=(20,5))
sns.barplot(x=df_feat_imp.index, y=df_feat_imp["mean_imp"], ax=ax)
ax.set_xticks([])  # too many features for readable tick labels
ax.set_title("Mean feature importance across folds")
print(df_feat_imp)

In [24]:
# Prepare the test data with the same steps applied to the training data:
# float16 downcast, "days" offset, then the last row of every customer.
test_parquet_path = "/kaggle/input/amex-data-integer-dtypes-parquet-format/test.parquet"
df_test = pd.read_parquet(test_parquet_path)

print("convert float32 columns to float16")
test_float32_columns = df_test.columns[df_test.dtypes=="float32"]
for col in test_float32_columns:
    df_test[col] = df_test[col].astype("float16")

print("date and time")
df_test["S_2"] = pd.to_datetime(df_test["S_2"])
# 1-based day offset of each row from the customer's earliest S_2.
first_test_date = df_test.groupby(["customer_ID"])["S_2"].transform("min")
df_test["days"] = (df_test["S_2"] - first_test_date).dt.days.astype("int16") + 1

print("grouping")
df_test = (
    df_test
    .groupby(["customer_ID"])
    .tail(1)
    .set_index('customer_ID')
)

In [31]:
# Score the test customers with the fold-2 model.
# predict() returns hard class labels; the validation metric above was computed
# on predict_proba(...)[:, 1], so the positive-class probability is used here too.
# The features are passed as a plain array, matching how the model was fitted.
pred = models[2].predict_proba(df_test.loc[:, FEATURES].values)[:, 1]

In [32]:
# Build the submission file from the sample submission template.
subm = pd.read_csv("/kaggle/input/amex-default-prediction/sample_submission.csv")

# Match predictions to template rows by customer_ID (the index of df_test)
# instead of by row position, so a different row order cannot silently pair a
# prediction with the wrong customer.
# NOTE(review): assumes the template has a "customer_ID" column - confirm.
pred_by_customer = pd.Series(pred, index=df_test.index)
subm["prediction"] = subm["customer_ID"].map(pred_by_customer)
if subm["prediction"].isna().any():
    raise ValueError("sample_submission.csv contains customer_IDs without a prediction")

subm.to_csv("submission.csv", index=False)