In [34]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import gc

import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/amexfeather/test_data_f32.ftr
/kaggle/input/amexfeather/train_data.ftr
/kaggle/input/amexfeather/train_data_f32.ftr
/kaggle/input/amexfeather/test_data.ftr
/kaggle/input/amex-default-prediction/sample_submission.csv
/kaggle/input/amex-default-prediction/train_data.csv
/kaggle/input/amex-default-prediction/test_data.csv
/kaggle/input/amex-default-prediction/train_labels.csv


# Load Data

In [35]:
train_data = pd.read_feather('../input/amexfeather/train_data.ftr')
# Keep the latest statement features for each customer
train_data = train_data.groupby('customer_ID').tail(1).set_index('customer_ID', drop=True).sort_index()

test_data = pd.read_feather('../input/amexfeather/test_data.ftr')
# Keep the latest statement features for each customer
test_data = test_data.groupby('customer_ID').tail(1).set_index('customer_ID')

submission_data = pd.read_csv("../input/amex-default-prediction/sample_submission.csv")

In [36]:
train_data_row_count, train_data_column_count=train_data.shape
print('#Train rows :', train_data_row_count)
print('#Train columns :', train_data_column_count)

test_data_row_count, test_data_column_count = test_data.shape
print('#Test rows:', test_data_row_count)
print('#Test columns:', test_data_column_count)

#Train rows : 458913
#Train columns : 190
#Test rows: 924621
#Test columns: 189


In [37]:
categorical_cols = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']

num_cols = [col for col in train_data.columns if col not in categorical_cols + ["target"]]

print(f'Total number of features: {len(train_data.columns)-1}')
print(f'Total number of categorical features: {len(categorical_cols)}')
print(f'Total number of continuos features: {len(num_cols)}')

Total number of features: 189
Total number of categorical features: 11
Total number of continuos features: 178


# Data Preprocessing and Feature Engineering

## Handling missing values

In [38]:
train_missing_data = pd.DataFrame(train_data.isnull().sum()/len(train_data))
train_need_drop = train_missing_data.loc[train_missing_data[0] >= 0.6]
print('number of column w/ >= 40% missing value = ', len(train_need_drop))

number of column w/ >= 40% missing value =  25


In [39]:
#Drop unusefull colums

train_data = train_data.drop(list(train_need_drop.T.columns), axis=1)
test_data = test_data.drop(list(train_need_drop.T.columns), axis=1)

In [40]:
col_list = []
for col in train_data.columns:
    col_list.append(col)
col_list.remove('target')

In [41]:
categorical_cols = ['B_30','B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_68']

num_cols = [col for col in train_data.columns if col not in categorical_cols + ["target"]]

## Fill null values
* numerical - median
* categorical - mode

In [42]:
#Fill null values with median and mode

for col in num_cols:
    train_data[col] = train_data[col].fillna(train_data[col].median())
    
for col1 in categorical_cols:
    train_data[col1] = train_data[col1].fillna(train_data[col1].mode())
    
for col1 in num_cols:
    test_data[col1] = test_data[col1].fillna(test_data[col1].median())
    
for cols2 in categorical_cols:
    test_data[cols2] =  test_data[cols2].fillna(test_data[cols2].mode()[0])

In [43]:
#Check again null values

#print(train_data.isnull().sum().to_string())
#print(test_data.isnull().sum().to_string())

## Categorical Encoding 

In [44]:
from sklearn.preprocessing import OrdinalEncoder

enc = OrdinalEncoder()

train_data[categorical_cols] = enc.fit_transform(train_data[categorical_cols])
test_data[categorical_cols] = enc.transform(test_data[categorical_cols])

## Remove highly correlated fatures

In [45]:
train_data_without_target = train_data.drop(["target"],axis=1)

cor_matrix = train_data_without_target.corr()
col_core = set()

for i in range(len(cor_matrix.columns)):
    for j in range(i):
        if(cor_matrix.iloc[i, j] > 0.9):
            col_name = cor_matrix.columns[i]
            col_core.add(col_name)
col_core

{'B_11',
 'B_13',
 'B_15',
 'B_23',
 'B_33',
 'B_37',
 'D_104',
 'D_119',
 'D_141',
 'D_143',
 'D_74',
 'D_75',
 'D_77',
 'S_24',
 'S_7'}

In [46]:
train_data = train_data.drop(col_core, axis=1)
test_data = test_data.drop(col_core, axis=1)

In [47]:
train_data.shape

(458913, 150)

In [48]:
test_data.shape

(924621, 149)

In [49]:
train_data = train_data.drop(['S_2'], axis=1)
test_data = test_data.drop(['S_2'], axis=1)

## Split train dataset

In [50]:
num_columns = [col for col in train_data.columns if col not in ["target"]]
X = train_data[num_columns]
y = train_data['target']

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

x_train,x_test,y_train,y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

## Feature Scalling

In [52]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scale = scaler.fit_transform(x_train)
X_test_scale = scaler.transform(x_test)

# Model training

# SVM

In [53]:
#Import svm model
from sklearn import svm

#Create a svm Classifier
model = svm.SVC(kernel='linear') # Linear Kernel

#Train the model using the training sets
model.fit(x_train,y_train)

# KNN

In [54]:
from sklearn.neighbors import KNeighborsClassifier
#Create KNN classifier
model = KNeighborsClassifier(n_neighbors = 3)
# Fit the classifier to the data
model.fit(x_train,y_train)

# CatBoost

In [55]:
from catboost import CatBoostClassifier
model = CatBoostClassifier(iterations = 3000, random_state = 42, nan_mode ='Min',task_type ="GPU")
model.fit(x_train, y_train, eval_set = [(x_test, y_test)], cat_features=categorical_cols,  verbose = 100)

# Random Foreset

In [56]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(iterations = 3000, random_state = 42, nan_mode ='Min',task_type ="GPU")
model.fit(x_train, y_train)

## LGBM

In [57]:
import lightgbm as lgb

d_train = lgb.Dataset(x_train, label=y_train, categorical_feature = categorical_cols)

params = {'objective': 'binary','n_estimators': 1500,'metric': 'binary_logloss','boosting': 'gbdt','num_leaves': 90,'reg_lambda' : 50,'colsample_bytree': 0.19,'learning_rate': 0.03,'min_child_samples': 2400,'max_bins': 600,'seed': 42,'verbose': -1}

#trained model with 100 iterations
model = lgb.train(params, d_train, 100)

# XGB

In [58]:
import xgboost as xgb

model = xgb.XGBClassifier(learning_rate =0.01,n_estimators=1000,max_depth=4,min_child_weight=6,gamma=0,subsample=0.8,colsample_bytree=0.8,reg_alpha=0.005,objective= 'binary:logistic',nthread=4,scale_pos_weight=1,seed=27)
model.fit(x_train, y_train)

# Predictions

In [59]:
predictions = model.predict(test_data[num_columns])
predictions

array([0.01449198, 0.00251177, 0.03211666, ..., 0.49430904, 0.17036849,
       0.03143368])

In [60]:
y_pred = model.predict(x_test)

# Evaluation

## Mean Absolute Error

In [61]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, roc_auc_score

mean_absolute_error(y_test,y_pred)

0.1369969383338288

In [62]:
roc_auc_score(y_test, y_pred)

0.9610777627456796

## AMEX Metric

In [63]:
# AMEX Metric
def amex_metric(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:

    def top_four_percent_captured(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        four_pct_cutoff = int(0.04 * df['weight'].sum())
        df['weight_cumsum'] = df['weight'].cumsum()
        df_cutoff = df.loc[df['weight_cumsum'] <= four_pct_cutoff]
        return (df_cutoff['target'] == 1).sum() / (df['target'] == 1).sum()
        
    def weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        df = (pd.concat([y_true, y_pred], axis='columns')
              .sort_values('prediction', ascending=False))
        df['weight'] = df['target'].apply(lambda x: 20 if x==0 else 1)
        df['random'] = (df['weight'] / df['weight'].sum()).cumsum()
        total_pos = (df['target'] * df['weight']).sum()
        df['cum_pos_found'] = (df['target'] * df['weight']).cumsum()
        df['lorentz'] = df['cum_pos_found'] / total_pos
        df['gini'] = (df['lorentz'] - df['random']) * df['weight']
        return df['gini'].sum()

    def normalized_weighted_gini(y_true: pd.DataFrame, y_pred: pd.DataFrame) -> float:
        y_true_pred = y_true.rename(columns={'target': 'prediction'})
        return weighted_gini(y_true, y_pred) / weighted_gini(y_true, y_true_pred)

    g = normalized_weighted_gini(y_true, y_pred)
    d = top_four_percent_captured(y_true, y_pred)

    return 0.5 * (g + d)

In [64]:
x_test_new = x_test[['P_2']].rename(columns={'P_2': 'prediction'})
x_test_new["prediction"] = y_pred #Change this
print(amex_metric(y_test.to_frame() , x_test_new))

0.7912758314886099


# Generate Output

In [65]:
output = pd.DataFrame({'customer_ID': submission_data.customer_ID, 'prediction': predictions})
output.to_csv('submission-15.csv', index=False)