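# Zindi Online Challenge
Competition page: <https://zindi.africa/hackathons/dsn-pre-bootcamp-hackathon-expresso-churn-prediction-challenge/submissions>

This notebook trains XGBoost and CatBoost classifiers with 5-fold stratified cross-validation to predict `default_status`, then writes the fold-averaged CatBoost test predictions to a submission file.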

## Problem Preparation

#### 1. Libraries


In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier

from sklearn.model_selection import (GridSearchCV, cross_val_score,cross_val_predict,StratifiedKFold,learning_curve)
import xgboost as xgb



In [2]:
import warnings

# Silence all library warnings so the cell outputs stay readable.
# NOTE(review): this also hides pandas/sklearn deprecation and chained-assignment
# warnings; consider narrowing the filter to specific categories.
warnings.filterwarnings("ignore")

# `%matplotlib inline` is already enabled in the imports cell, so it is not repeated here.
# `sns.set()` re-applies seaborn's default style, so a separate
# `sns.set_style("whitegrid")` issued before it is discarded. Passing the style and
# the figure size in one call makes the white grid actually take effect.
sns.set(style="whitegrid", rc={"figure.figsize": (10, 6)})

#### 2. Dataset

In [3]:
# Read the training data, the test data and the sample submission file
# from the current working directory.
train, test, SampleSubmission = (
    pd.read_csv(csv_name)
    for csv_name in ('Train.csv', 'Test.csv', 'SampleSubmission.csv')
)

## Data summary

#### 1. Descriptive Statistics

In [4]:
# Dimensions of the training frame as (rows, columns).
train.shape

(56000, 52)

In [5]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,Applicant_ID,form_field1,form_field2,form_field3,form_field4,form_field5,form_field6,form_field7,form_field8,form_field9,...,form_field42,form_field43,form_field44,form_field45,form_field46,form_field47,form_field48,form_field49,form_field50,default_status
0,Apcnt_1000000,3436.0,0.28505,1.656,0.0,0.0,0.0,10689720.0,252072.0,4272776.0,...,0.392854,2.02,0.711632,0.0,0.0,charge,,1.129518,0.044335,no
1,Apcnt_1000004,3456.0,0.674,0.2342,0.0,0.0,0.0,898979.0,497531.0,9073814.0,...,0.314281,8.08,0.183584,,0.0,charge,349.80573,1.620483,0.322436,no
2,Apcnt_1000008,3276.0,0.53845,3.151,0.0,6.282,,956940.0,,192944.0,...,0.162965,18.18,0.791136,0.0,0.0,charge,,1.51337,0.01164,yes
3,Apcnt_1000012,3372.0,0.17005,0.505,0.0,0.0,192166.0,3044703.0,385499.0,3986472.0,...,0.488884,2.02,0.685168,,0.0,charge,89.9401,0.664452,0.082729,no
4,Apcnt_1000016,3370.0,0.7727,1.101,0.0,0.0,1556.0,214728.0,214728.0,1284089.0,...,0.275,12.12,0.438168,0.0,0.0,charge,97.887502,1.427891,0.04563,no


In [6]:
# Column dtypes and non-null counts. `DataFrame.info()` prints its report itself
# and returns None, so wrapping it in `print()` only appends a stray "None".
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 56000 entries, 0 to 55999
Data columns (total 52 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Applicant_ID    56000 non-null  object 
 1   form_field1     53471 non-null  float64
 2   form_field2     52156 non-null  float64
 3   form_field3     55645 non-null  float64
 4   form_field4     55645 non-null  float64
 5   form_field5     55645 non-null  float64
 6   form_field6     42640 non-null  float64
 7   form_field7     50837 non-null  float64
 8   form_field8     42640 non-null  float64
 9   form_field9     47992 non-null  float64
 10  form_field10    55645 non-null  float64
 11  form_field11    24579 non-null  float64
 12  form_field12    46105 non-null  float64
 13  form_field13    50111 non-null  float64
 14  form_field14    56000 non-null  int64  
 15  form_field15    33525 non-null  float64
 16  form_field16    42964 non-null  float64
 17  form_field17    44849 non-null 

In [7]:
# Number of distinct target labels, followed by the labels themselves.
train['default_status'].nunique(), train['default_status'].unique().tolist()

(2, ['no', 'yes'])

In [8]:
# Number of distinct values in the only string-valued feature, followed by the values.
train['form_field47'].nunique(), train['form_field47'].unique().tolist()

(2, ['charge', 'lending'])

In [9]:
# Encode the two string-valued columns as 0/1.
#
# The result is assigned back to the column instead of using `inplace=True` on a
# column selection: under pandas Copy-on-Write an in-place replace on
# `train.default_status` / `train['form_field47']` acts on a temporary object and
# leaves the frame unchanged (and the warning about it is silenced by the global
# warning filter). `pd.to_numeric` guarantees a numeric dtype, which the later
# `select_dtypes(exclude=object)` feature selection relies on, and raises if an
# unexpected category was left unmapped. Re-running the cell is a no-op.
target_codes = {'yes': 1, 'no': 0}
field47_codes = {'charge': 1, 'lending': 0}

train['default_status'] = pd.to_numeric(train['default_status'].replace(target_codes))
train['form_field47'] = pd.to_numeric(train['form_field47'].replace(field47_codes))
test['form_field47'] = pd.to_numeric(test['form_field47'].replace(field47_codes))

In [10]:
#train_df = train.replace(np.nan, 0)
#train_df.head()

In [11]:
#test = test.replace(np.nan, 0)
#test.head()

## Data Preparation

#### 1. Feature Selection

In [12]:
# Model inputs: every non-object column of the training frame except the target.
features = (
    train
    .select_dtypes(exclude=object)
    .columns
    .drop(['default_status'])
)

In [13]:
# Display the selected feature column names.
features

Index(['form_field1', 'form_field2', 'form_field3', 'form_field4',
       'form_field5', 'form_field6', 'form_field7', 'form_field8',
       'form_field9', 'form_field10', 'form_field11', 'form_field12',
       'form_field13', 'form_field14', 'form_field15', 'form_field16',
       'form_field17', 'form_field18', 'form_field19', 'form_field20',
       'form_field21', 'form_field22', 'form_field23', 'form_field24',
       'form_field25', 'form_field26', 'form_field27', 'form_field28',
       'form_field29', 'form_field30', 'form_field31', 'form_field32',
       'form_field33', 'form_field34', 'form_field35', 'form_field36',
       'form_field37', 'form_field38', 'form_field39', 'form_field40',
       'form_field41', 'form_field42', 'form_field43', 'form_field44',
       'form_field45', 'form_field46', 'form_field47', 'form_field48',
       'form_field49', 'form_field50'],
      dtype='object')

## Algorithms Evaluation

#### 1. Test options and evaluation metric

In [14]:
# Design matrix (selected feature columns) and target vector.
X, y = train[features], train['default_status']

In [15]:
def performance_metric(y, pred):
    """Return the ROC AUC of the scores `pred` against the true labels `y`.

    Delegates to `roc_auc_score`, passing `labels=[0, 1]`.
    """
    auc = roc_auc_score(y, pred, labels=[0, 1])
    return auc

In [16]:
# Number of cross-validation folds; shared by the splitter and the score averaging.
num_of_folds = 5
kfold = StratifiedKFold(n_splits=num_of_folds)

In [17]:

# Stratified k-fold cross-validation of an XGBoost baseline, scored with ROC AUC.
result_list = []

for i, (train_split, test_split) in enumerate(kfold.split(X, y)):

    # `kfold.split` yields positional row indices, so rows are selected with `.iloc`.
    # Label-based `.loc` is only equivalent while the frame keeps its default
    # RangeIndex. `X` already holds exactly the feature columns, so no column
    # re-selection is needed.
    x_train, y_train = X.iloc[train_split], y.iloc[train_split]
    x_test, y_test = X.iloc[test_split], y.iloc[test_split]

    xgb_Model = xgb.XGBClassifier(
        objective='binary:logistic',
        colsample_bytree=0.3,
        learning_rate=0.4,
        max_depth=5,
        alpha=10,
        n_estimators=35,
    )
    xgb_Model.fit(x_train, y_train)

    # Score the held-out fold on the predicted probability of class 1.
    y_test_predicted = xgb_Model.predict_proba(x_test)[:, 1]
    roc_auc = performance_metric(y_test, y_test_predicted)
    result_list.append(roc_auc)

    # Test-set probabilities from this fold's model. The variable is overwritten on
    # every iteration, so only the last fold's predictions remain after the loop.
    predicted_Y = xgb_Model.predict_proba(test[features])[:, 1]

    print('{} Folds : {}'.format(i, roc_auc))

# Mean ROC AUC over all folds.
result = float(np.mean(result_list))

print()
print()
print('Average_result : ',result )

0 Folds : 0.8354863407779555
1 Folds : 0.8325957770283595
2 Folds : 0.8377030618447612
3 Folds : 0.8259184773427161
4 Folds : 0.8440022181651847


Average_result :  0.8351411750317953


In [28]:
# Stratified k-fold cross-validation of a CatBoost model, scored with ROC AUC.
# Each fold's model also predicts the competition test set; those per-fold
# predictions are collected in `values`.
parameters = {
    'n_estimators': 4000,
    'learning_rate': 0.01,
    'objective': 'CrossEntropy',
    'eval_metric': 'AUC',
    'random_seed': 2254,
    'early_stopping_rounds': 200,
    'use_best_model': True,
}
values = []
result_list1 = []

for i, (train_split, test_split) in enumerate(kfold.split(X, y)):

    # `kfold.split` yields positional row indices, so rows are selected with `.iloc`.
    # Label-based `.loc` is only equivalent while the frame keeps its default
    # RangeIndex. `X` already holds exactly the feature columns.
    x_train, y_train = X.iloc[train_split], y.iloc[train_split]
    x_test, y_test = X.iloc[test_split], y.iloc[test_split]

    catB_Model = CatBoostClassifier(**parameters)

    # NOTE(review): the held-out fold serves both as the early-stopping `eval_set`
    # (with `use_best_model`) and as the data the reported AUC is computed on, so the
    # per-fold scores are optimistically biased.
    catB_Model.fit(x_train, y_train, eval_set=[(x_test, y_test)], verbose=100)

    # Score the held-out fold on the predicted probability of class 1.
    y_test_predicted1 = catB_Model.predict_proba(x_test)[:, 1]
    roc_auc1 = performance_metric(y_test, y_test_predicted1)
    result_list1.append(roc_auc1)

    # Keep this fold's test-set probabilities.
    predicted_Y1 = catB_Model.predict_proba(test[features])[:, 1]
    values.append(predicted_Y1)
    print('Fold {} : {}'.format(i, roc_auc1))

# Mean ROC AUC over all folds.
result1 = float(np.mean(result_list1))

print()
print()
print('Average_result : ',result1 )

0:	test: 0.7985595	best: 0.7985595 (0)	total: 34ms	remaining: 2m 16s
100:	test: 0.8254345	best: 0.8254345 (100)	total: 3.43s	remaining: 2m 12s
200:	test: 0.8293706	best: 0.8293706 (200)	total: 7.03s	remaining: 2m 12s
300:	test: 0.8316713	best: 0.8316713 (300)	total: 10.6s	remaining: 2m 10s
400:	test: 0.8332517	best: 0.8332517 (400)	total: 13.8s	remaining: 2m 4s
500:	test: 0.8342362	best: 0.8342455 (497)	total: 17.4s	remaining: 2m 1s
600:	test: 0.8352527	best: 0.8352604 (599)	total: 20.9s	remaining: 1m 58s
700:	test: 0.8360072	best: 0.8360072 (700)	total: 24.7s	remaining: 1m 56s
800:	test: 0.8364889	best: 0.8364898 (799)	total: 27.7s	remaining: 1m 50s
900:	test: 0.8369889	best: 0.8369889 (900)	total: 30.7s	remaining: 1m 45s
1000:	test: 0.8373658	best: 0.8373658 (1000)	total: 33.8s	remaining: 1m 41s
1100:	test: 0.8377140	best: 0.8377140 (1100)	total: 37.1s	remaining: 1m 37s
1200:	test: 0.8379048	best: 0.8379272 (1186)	total: 40.6s	remaining: 1m 34s
1300:	test: 0.8381059	best: 0.8381121 (

## Model Finalization

#### 1. Cross-validation score and predictions on the test dataset

In [19]:
# Summarise the CatBoost run: the fold count and the mean ROC AUC held in `result1`.
"{}fold cross_validation,result:{}".format(num_of_folds,result1)

'5fold cross_validation,result:0.8408121271077278'

In [20]:
# `values` holds one prediction array per fold; transposing gives one column per fold.
predicted_Y_df = pd.DataFrame(values).transpose()

In [21]:
# Label the columns fold_1 ... fold_<num_of_folds>.
predicted_Y_df.columns = [
    'fold_' + str(fold_idx + 1)
    for fold_idx in range(num_of_folds)
]

In [22]:
# Preview the per-fold test-set probabilities.
predicted_Y_df.head()

Unnamed: 0,fold_1,fold_2,fold_3,fold_4,fold_5
0,0.33923,0.281797,0.324865,0.285715,0.263713
1,0.392309,0.363056,0.461989,0.356928,0.389761
2,0.328001,0.389073,0.387181,0.408335,0.332204
3,0.709937,0.754311,0.717963,0.769209,0.718489
4,0.157444,0.185968,0.16452,0.155695,0.181729


In [30]:
# Final prediction: the element-wise mean of the per-fold test-set probabilities.
SampleSubmission['default_status'] = np.asarray(values).mean(axis=0)

In [31]:
# Write the submission frame to CSV without the index column.
SampleSubmission.to_csv('sample_submissionsH.csv', index=False)