In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from itertools import cycle

import warnings, gc
warnings.filterwarnings('ignore')

[<a href="#Table-of-Contents">Back to Top</a>]

In [3]:
def _last_row_per_customer(path):
    """Read a feather file and keep only the final row of every customer_ID group.

    The returned frame is indexed by customer_ID.
    """
    statements = pd.read_feather(path)
    return statements.groupby('customer_ID').tail(1).set_index('customer_ID')


# Train and test are reduced the same way: one row per customer.
df_train = _last_row_per_customer('../input/amexfeather/train_data.ftr')
df_test = _last_row_per_customer('../input/amexfeather/test_data.ftr')

# Submission template; its "prediction" column is filled in near the end of the notebook.
df_subm = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")

In [4]:
# Row and column counts of the training frame, kept as separate names.
df_train_row_count = df_train.shape[0]
df_train_column_count = df_train.shape[1]


In [5]:
# Row and column counts of the test frame, kept as separate names.
df_test_row_count = df_test.shape[0]
df_test_column_count = df_test.shape[1]


In [6]:
# Parse the S_2 column of both frames into pandas datetimes (assigned back in place).
for _frame in (df_train, df_test):
    _frame['S_2'] = pd.to_datetime(_frame['S_2'])

In [7]:
# Earliest and latest S_2 value in each split.
train_dates, test_dates = df_train['S_2'], df_test['S_2']
print('Duration of Train Date: ', train_dates.min(), train_dates.max())
print('Duration of Test Date: ', test_dates.min(), test_dates.max())

Duration of Train Date:  2018-03-01 00:00:00 2018-03-31 00:00:00
Duration of Test Date:  2019-04-01 00:00:00 2019-10-31 00:00:00


In [8]:
# Preview the first rows of the per-customer training frame (index is customer_ID).
df_train.head()

Unnamed: 0_level_0,S_2,P_2,D_39,B_1,B_2,R_1,S_3,D_41,B_3,D_42,...,D_137,D_138,D_139,D_140,D_141,D_142,D_143,D_144,D_145,target
customer_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a,2018-03-13,0.93457,0.009117,0.009384,1.007812,0.006104,0.13501,0.001604,0.007175,,...,,,0.007187,0.004234,0.005085,,0.00581,0.00297,0.00853,0
00000fd6641609c6ece5454664794f0340ad84dddce9a267a310b5ae68e9d8e5,2018-03-25,0.880371,0.178101,0.034698,1.003906,0.006912,0.165527,0.00555,0.00507,,...,,,0.002981,0.007481,0.007874,,0.003284,0.00317,0.008514,0
00001b22f846c82c51f6e3958ccd81970162bae8b007e80662ef27519fcc18c1,2018-03-12,0.880859,0.009705,0.004284,0.8125,0.006451,,0.003796,0.007195,,...,,,0.007381,0.006622,0.000965,,0.002201,0.000834,0.003445,0
000041bdba6ecadd89a52d11886e8eaaec9325906c9723355abb5ca523658edc,2018-03-29,0.621582,0.001082,0.012566,1.005859,0.007828,0.287842,0.004532,0.009941,,...,,,0.002705,0.006184,0.001899,,0.008186,0.005558,0.002983,0
00007889e4fcd2614b6cbe7f8f3d2e5c728eca32d9eb8ad51ca8b8c4a24cefed,2018-03-30,0.87207,0.005573,0.007679,0.815918,0.001247,,0.000231,0.005527,,...,,,0.002974,0.004162,0.005764,,0.008156,0.006943,0.000905,0


In [9]:
# Summarize index size, column count, dtypes and memory footprint of the training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 458913 entries, 0000099d6bd597052cdcda90ffabf56573fe9d7c79be5fbac11a8ed792feb62a to fffff1d38b785cef84adeace64f8f83db3a0c31e8d92eaba8b115f71cab04681
Columns: 190 entries, S_2 to target
dtypes: category(11), datetime64[ns](1), float16(177), int64(1)
memory usage: 170.2+ MB


In [10]:
# Columns handled as categorical: they are label-encoded before training and
# passed to the model as `cat_features`.
cat_cols = [
    'B_30', 'B_38',
    'D_114', 'D_116', 'D_117', 'D_120', 'D_126',
    'D_63', 'D_64', 'D_66', 'D_68',
]

In [11]:
def _columns_with_prefix(prefix):
    """Names of the df_train columns that start with `prefix`, in frame order."""
    return [name for name in df_train.columns if name.startswith(prefix)]


# Group the columns by the letter prefix of their name.
feat_Delinquency = _columns_with_prefix('D_')
feat_Spend = _columns_with_prefix('S_')
feat_Payment = _columns_with_prefix('P_')
feat_Balance = _columns_with_prefix('B_')
feat_Risk = _columns_with_prefix('R_')

# Report how many columns fall into each group.
print(f'Total amount of Delinquency variables: {len(feat_Delinquency)}')
print(f'Total amount of Expenses variables: {len(feat_Spend)}')
print(f'Total amount of Payment variables: {len(feat_Payment)}')
print(f'Total amount of Balance variables: {len(feat_Balance)}')
print(f'Total amount of Risk variables: {len(feat_Risk)}')

Total amount of Delinquency variables: 96
Total amount of Expenses variables: 22
Total amount of Payment variables: 3
Total amount of Balance variables: 40
Total amount of Risk variables: 28


[<a href="#Table-of-Contents">Back to Top</a>]

In [12]:
# Parallel lists: a display name per feature group and the size of that group.
labels = ['Delinquency', 'Spend', 'Payment', 'Balance', 'Risk']
values = [
    len(group)
    for group in (feat_Delinquency, feat_Spend, feat_Payment, feat_Balance, feat_Risk)
]

In [13]:
# Count of missing values in every column of the training frame.
df_train.isna().sum()

S_2            0
P_2         2969
D_39           0
B_1            0
B_2           31
           ...  
D_142     378598
D_143       2830
D_144          0
D_145       2830
target         0
Length: 190, dtype: int64

In [14]:
# Total number of missing cells: per-column NaN counts reduced to a single number.
df_train.isna().sum().sum()

12995954

In [15]:
# Class balance of the target: absolute count and share of all rows (in percent).
_target_counts = df_train['target'].value_counts()
target_class = pd.DataFrame({
    'count': _target_counts,
    'percentage': _target_counts / df_train.shape[0] * 100,
})

In [16]:
# Display the class-balance table built in the previous cell.
target_class 

Unnamed: 0,count,percentage
0,340085,74.106639
1,118828,25.893361


In [17]:
# Donut chart of the target class balance.
# `go` (plotly.graph_objects) is already imported in the notebook's import cell,
# so it is not imported again here; every trace option is set exactly once.
fig = go.Figure(
    go.Pie(
        values=target_class['count'],
        labels=target_class.index,
        hole=0.6,                       # inner radius fraction -> donut shape
        hoverinfo='label+percent',
        textinfo='label',
        textfont=dict(size=12),
        showlegend=False,
        marker=dict(colors=["#9acd32", "#87ceeb"]),
        title=dict(text='Distribution of a Target Variable'),
    )
)
fig.show()

In [18]:
# Number of distinct customers per S_2 date in the per-customer training frame.
stat_plot = df_train.reset_index().groupby('S_2')['customer_ID'].nunique().reset_index()

fig = go.Figure()
fig.add_trace(go.Scatter(x=stat_plot['S_2'], y=stat_plot['customer_ID'],
                         line=dict(color="#8338ec")))
fig.update_layout(
    title="Customer Presences", width=700, height=500,
    xaxis_title='Presence Date',
    # A transparent background needs the four-component rgba() form; the
    # previous 'rgb(0,0,0,0)' put four values in a three-component colour.
    paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)',
)
fig.show()

In [19]:
# Run a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

288

In [20]:
def _non_categorical_columns(prefix):
    """Columns starting with `prefix` or with 't', excluding those in cat_cols.

    The 't' prefix keeps any column whose name begins with 't' in every group
    (presumably just `target` - verify against the column list).
    """
    return [col for col in df_train.columns
            if col.startswith((prefix, 't')) and col not in cat_cols]


# One column list and one sub-frame per feature-name prefix.
del_cols = _non_categorical_columns('D')
df_del = df_train[del_cols]
spd_cols = _non_categorical_columns('S')
df_spd = df_train[spd_cols]
pay_cols = _non_categorical_columns('P')
df_pay = df_train[pay_cols]
bal_cols = _non_categorical_columns('B')
df_bal = df_train[bal_cols]
ris_cols = _non_categorical_columns('R')
df_ris = df_train[ris_cols]

In [22]:
# Every column whose name starts with 'S', and the matching slice of the training frame.
S_cols = df_train.columns[df_train.columns.str.startswith('S')].tolist()
df_S = df_train[S_cols]

In [27]:
# Every column whose name starts with 'P', and the matching slice of the training frame.
P_cols = df_train.columns[df_train.columns.str.startswith('P')].tolist()
df_P = df_train[P_cols]

In [28]:
# Every column whose name starts with 'B', and the matching slice of the training frame.
B_cols = df_train.columns[df_train.columns.str.startswith('B')].tolist()
df_B = df_train[B_cols]

In [29]:
# Every column whose name starts with 'R', and the matching slice of the training frame.
R_cols = df_train.columns[df_train.columns.str.startswith('R')].tolist()
df_R = df_train[R_cols]

In [30]:
# Run a garbage-collection pass; the displayed value is gc.collect()'s return value.
gc.collect()

391

[<a href="#Table-of-Contents">Back to Top</a>]

# 06.Model Training

In [31]:
from sklearn.preprocessing import LabelEncoder
# A single encoder object is reused: for each categorical column it is re-fitted
# on the training values, and that same fit is then applied to the test column,
# so both frames share one integer coding per column.
# Both frames are modified in place (the original category values are overwritten).
lab_enc = LabelEncoder()
for cat_feat in cat_cols:
    df_train[cat_feat] = lab_enc.fit_transform(df_train[cat_feat])
    # NOTE(review): the encoder only knows labels seen in df_train; confirm the
    # test column cannot contain a label absent from train, otherwise transform fails.
    df_test[cat_feat] = lab_enc.transform(df_test[cat_feat])

In [32]:
#Reference: https://www.kaggle.com/code/inversion/amex-competition-metric-python/notebook
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
    """Average of the normalized weighted Gini and the top-4% capture rate.

    Args:
        y_true: frame with a ``target`` column. Rows whose target equals 0 are
            given weight 20, every other row weight 1.
        y_pred: frame with a ``prediction`` column; rows are ranked by it in
            descending order.

    The two frames are joined column-wise with ``pd.concat``, so rows are
    matched by index label rather than by position.

    Returns:
        ``0.5 * (g + d)`` where ``g`` is the weighted Gini of the predictions
        divided by the weighted Gini obtained when the target itself is used as
        the prediction, and ``d`` is the share of all ``target == 1`` rows that
        lie within the first 4% of cumulative weight of the ranking.
    """
    negative_weight = 20

    def _ranked(truth: pd.DataFrame, scores: pd.DataFrame) -> pd.DataFrame:
        # Shared setup for both sub-metrics: join, rank by prediction, attach weights.
        ranked = (pd.concat([truth, scores], axis='columns')
                  .sort_values('prediction', ascending=False))
        # Vectorized weighting instead of a per-row Python lambda.
        ranked['weight'] = np.where(ranked['target'] == 0, negative_weight, 1)
        return ranked

    def top_four_percent_captured(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(truth, scores)
        four_pct_cutoff = int(0.04 * ranked['weight'].sum())
        within_cutoff = ranked['weight'].cumsum() <= four_pct_cutoff
        is_positive = ranked['target'] == 1
        return (within_cutoff & is_positive).sum() / is_positive.sum()

    def weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        ranked = _ranked(truth, scores)
        weight = ranked['weight']
        random = (weight / weight.sum()).cumsum()
        weighted_pos = ranked['target'] * weight
        lorentz = weighted_pos.cumsum() / weighted_pos.sum()
        return ((lorentz - random) * weight).sum()

    def normalized_weighted_gini(truth: pd.DataFrame, scores: pd.DataFrame) -> float:
        # Using the target as its own prediction gives the normalizing Gini.
        perfect = truth.rename(columns={'target': 'prediction'})
        return weighted_gini(truth, scores) / weighted_gini(truth, perfect)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [33]:
# define dataset: the label column, and every remaining column as features
# NOTE(review): S_2 stays in X as a feature, while the printed train and test
# date ranges above do not overlap - confirm that including it is intended.
y = df_train['target']
X = df_train.drop(columns='target')

In [34]:
from sklearn.model_selection import train_test_split

# creating dataset split for prediction (80-20 split, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42,
)

# Checking split
for _name, _part in (('X_train:', X_train), ('y_train:', y_train),
                     ('X_test:', X_test), ('y_test:', y_test)):
    print(_name, _part.shape)

X_train: (367130, 189)
y_train: (367130,)
X_test: (91783, 189)
y_test: (91783,)


In [35]:
from catboost import CatBoostClassifier

# Train on the 80% split; the 20% split is passed as eval_set and logged every 100 iterations.
clf = CatBoostClassifier(iterations = 3000, random_state = 42, nan_mode ='Min',task_type ="GPU")
clf.fit(X_train, y_train, eval_set = [(X_test, y_test)], cat_features=cat_cols,  verbose = 100)
preds = clf.predict_proba(X_test)[:, 1]

# Score the hold-out predictions with the competition metric defined earlier;
# previously `preds` was computed and never used. Both frames share
# y_test's index because amex_metric joins its inputs on index labels.
# NOTE(review): the same split also serves as eval_set during fitting, so this
# score may be somewhat optimistic - confirm whether a separate hold-out is wanted.
holdout_score = amex_metric(
    pd.DataFrame({'target': y_test}),
    pd.DataFrame({'prediction': preds}, index=y_test.index),
)
print(f'Hold-out AMEX metric: {holdout_score:.5f}')

Learning rate set to 0.027352
0:	learn: 0.6589316	test: 0.6588583	best: 0.6588583 (0)	total: 56.2ms	remaining: 2m 48s
100:	learn: 0.2443477	test: 0.2449993	best: 0.2449993 (100)	total: 5.48s	remaining: 2m 37s
200:	learn: 0.2330854	test: 0.2344294	best: 0.2344294 (200)	total: 10.4s	remaining: 2m 25s
300:	learn: 0.2285909	test: 0.2305575	best: 0.2305575 (300)	total: 15.7s	remaining: 2m 21s
400:	learn: 0.2257680	test: 0.2283929	best: 0.2283929 (400)	total: 20.7s	remaining: 2m 13s
500:	learn: 0.2236704	test: 0.2269468	best: 0.2269468 (500)	total: 26s	remaining: 2m 9s
600:	learn: 0.2219627	test: 0.2258888	best: 0.2258888 (600)	total: 31.8s	remaining: 2m 6s
700:	learn: 0.2205532	test: 0.2251693	best: 0.2251693 (700)	total: 37.3s	remaining: 2m 2s
800:	learn: 0.2193332	test: 0.2246047	best: 0.2246047 (800)	total: 42.2s	remaining: 1m 55s
900:	learn: 0.2182313	test: 0.2241846	best: 0.2241846 (900)	total: 47.6s	remaining: 1m 50s
1000:	learn: 0.2171808	test: 0.2238509	best: 0.2238509 (1000)	total:

In [36]:
# Class probabilities for every test row; column 1 is kept as the prediction.
test_proba = clf.predict_proba(df_test)
y_preds = test_proba[:, 1]
y_preds

array([0.02010863, 0.00235499, 0.05614063, ..., 0.45576455, 0.3040801 ,
       0.06703274])

In [37]:
from IPython.display import HTML
# Fill the submission template with the test predictions and write it to disk.
# NOTE(review): y_preds is an array, so it is assigned by position; this assumes
# the rows of df_subm are in the same customer order as df_test - confirm.
df_subm["prediction"] = y_preds
df_subm.to_csv('submission_catb.csv', index=False)


def create_download_link(title = "Download CSV file", filename = "data.csv"):  
    """Return an IPython ``HTML`` object holding an anchor that points at `filename`.

    Args:
        title: link text, inserted into the markup unchanged.
        filename: link target; it is HTML-escaped and placed in a quoted
            ``href`` so names containing spaces, quotes or ``&`` still give
            valid markup.

    Returns:
        ``HTML('<a href="...">title</a>')``.
    """
    # Imported locally under its own name so no variable here shadows the `html` module.
    from html import escape
    link_markup = '<a href="{filename}">{title}</a>'.format(
        title=title, filename=escape(filename, quote=True))
    return HTML(link_markup)

# Render a link to the submission CSV written by df_subm.to_csv in this cell;
# the link text falls back to the function's default title.
create_download_link(filename='submission_catb.csv')

In [38]:
#done

[<a href="#Table-of-Contents">Back to Top</a>]