In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
import xgboost as xgb

# Evaluation functions

In [8]:
#evaluation
def myxval(model,train_df,bemeno_valtozok,fold_num=10,target_col='TARGET',id_col='ID'):
    """Estimate a classifier's ROC AUC with deterministic, ID-based cross-validation.

    A row belongs to fold ``train_df[id_col] % fold_num``. For each fold the
    estimator is fit on all other folds and scored on the held-out fold with
    ``roc_auc_score``; the mean of the per-fold scores is returned.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``predict_proba(X)``.
        It is fit in place, so when this function returns it holds the fit
        from the final fold only (trained without that fold's rows).
    train_df : pandas.DataFrame
        Must contain ``id_col``, ``target_col`` and every column listed in
        ``bemeno_valtozok``. It is not modified.
    bemeno_valtozok : list of str
        Input (feature) column names; the name is Hungarian for "input variables".
    fold_num : int, default 10
        Number of folds; must be at least 2.
    target_col : str, default 'TARGET'
        Name of the label column.
    id_col : str, default 'ID'
        Name of the numeric column used to assign rows to folds.

    Returns
    -------
    float
        Mean ROC AUC over the ``fold_num`` folds.

    Raises
    ------
    ValueError
        If ``fold_num`` is smaller than 2.
    """
    if fold_num < 2:
        raise ValueError("fold_num must be at least 2, got %r" % (fold_num,))

    # Fold labels live in a standalone Series so the caller's frame does not
    # silently gain a helper column.
    fold_of_row = train_df[id_col] % fold_num

    auc_list = []
    for i in range(fold_num):
        in_holdout = fold_of_row == i
        mini_train = train_df[~in_holdout]
        mini_test = train_df[in_holdout]
        model.fit(mini_train[bemeno_valtozok], mini_train[target_col])
        # Column 1 of predict_proba is the score of the second class (the positive label).
        p1 = model.predict_proba(mini_test[bemeno_valtozok])[:, 1]
        auc_list.append(roc_auc_score(mini_test[target_col], p1))
    return np.mean(auc_list)

In [10]:
# Read the feature table from disk; the trailing expression displays its (rows, columns) shape.
alldf = pd.read_csv(filepath_or_buffer='../data/selected_df.csv')
alldf.shape

(50000, 79)

In [11]:
# Name of the label column.
target = 'TARGET'
# Model inputs: every column from position 3 onwards.
# NOTE(review): presumably the three skipped columns are ID / TARGET / train_or_test — confirm against the CSV header.
in_attr = alldf.columns[3:].tolist()
len(in_attr)

76

In [12]:
# Split on the 'train_or_test' flag; .copy() gives independent frames so later
# column assignments do not touch alldf.
train_df = alldf.loc[alldf['train_or_test'].eq("train")].copy()
test_df = alldf.loc[alldf['train_or_test'].eq("test")].copy()

In [None]:
# Remark
# The parameters of the different models were optimised by trying out the candidate values one by one.
# The tuning code is omitted here to keep the notebook's running time acceptable.


In [13]:
# NOTE(review): `seed` and `silent` look like legacy spellings of `random_state` / `verbosity`
# in recent xgboost releases — confirm against the installed version before changing them.
xgb_model = xgb.XGBClassifier(colsample_bytree=1,
                               gamma=5, learning_rate=0.1,
                                reg_alpha=0.05, reg_lambda=0.3,
                               max_depth=50, min_child_weight=5,
                               n_estimators=100,
                               seed=42,
                               silent= 1,
                               subsample=0.8,
                               random_state=42)
# 10-fold cross-validated ROC AUC on the training rows.
print(myxval(xgb_model,train_df,in_attr,10))
# myxval fits the estimator in place fold by fold, so at this point it has only
# seen the last fold's training split. Refit on every training row before
# reading importances and scoring the test set.
xgb_model.fit(train_df[in_attr], train_df[target])
# Feature names ordered from most to least important.
feature_importancesXGB = pd.DataFrame({'imp':xgb_model.feature_importances_,'att':in_attr}).sort_values('imp', ascending=False)['att'].to_list()
# Score of the second class (column 1 of predict_proba) for each test row.
test_df['predXGB'] = xgb_model.predict_proba(test_df[in_attr])[:,1]
#print(feature_importancesXGB)

0.7517778376865629


In [14]:
RFmodel = RandomForestClassifier(random_state=42,n_estimators=100,max_depth=20) 
# 10-fold cross-validated ROC AUC on the training rows.
print(myxval(RFmodel,train_df,in_attr,10))
# myxval fits the estimator in place fold by fold, so at this point it has only
# seen the last fold's training split. Refit on every training row before
# reading importances and scoring the test set.
RFmodel.fit(train_df[in_attr], train_df[target])
# Feature names ordered from most to least important.
feature_importancesRF = pd.DataFrame({'imp':RFmodel.feature_importances_,'att':in_attr}).sort_values('imp', ascending=False)['att'].to_list()
# Score of the second class (column 1 of predict_proba) for each test row.
test_df['predRF'] = RFmodel.predict_proba(test_df[in_attr])[:,1]
#print(feature_importancesRF)

0.7491957662836348


In [16]:
GBmodel = GradientBoostingClassifier(random_state=42,n_estimators=100,max_depth=20,learning_rate=0.3 )
# 10-fold cross-validated ROC AUC on the training rows.
print(myxval(GBmodel,train_df,in_attr,10))
# myxval fits the estimator in place fold by fold, so at this point it has only
# seen the last fold's training split. Refit on every training row before
# reading importances and scoring the test set.
GBmodel.fit(train_df[in_attr], train_df[target])
# Feature names ordered from most to least important.
feature_importancesGB = pd.DataFrame({'imp':GBmodel.feature_importances_,'att':in_attr}).sort_values('imp', ascending=False)['att'].to_list()
# Score of the second class (column 1 of predict_proba) for each test row.
test_df['predGB'] = GBmodel.predict_proba(test_df[in_attr])[:,1]
#print(feature_importancesGB)

0.744038702891068


## Submission

In [20]:
# Blend the per-model test scores into candidate submission columns.
# p1: product of the XGB and RF scores.
test_df['p1'] = test_df['predXGB'].mul(test_df['predRF'])
#test_df['p2'] = (test_df['predXGB']+test_df['predRF'])/2 #not selected for final scoring
# p3: product of all three model scores (same left-to-right order as before).
test_df['p3'] = test_df['predXGB'].mul(test_df['predRF']).mul(test_df['predGB'])
# p4: arithmetic mean of all three model scores.
test_df['p4'] = test_df['predXGB'].add(test_df['predRF']).add(test_df['predGB']).div(3)

In [21]:
#submission
# One CSV per blended prediction column. 'p2' / 'Sub2.csv' are left out on purpose:
# the line that would create test_df['p2'] is commented out, so selecting it
# raised KeyError on a fresh run.
predictions = ['p1','p3','p4']
submissions = ['Sub1.csv','Sub3.csv','Sub4.csv']
for pred_col, out_file in zip(predictions, submissions):
    submission_df = test_df[['ID',pred_col]].copy()
    # Rename to the two-column header used by the submission files.
    submission_df.columns=['Id', 'Predicted']
    submission_df.to_csv(out_file,index=False)