### Importing necessary packages

In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import model_selection, preprocessing
from xgboost.sklearn import XGBClassifier as XGBC
from xgboost.sklearn import XGBRegressor as XGBReg

### Importing train and test data 

In [2]:
app_train = pd.read_csv('~/Desktop/kaggle_default_risk/Default_data/application_train.csv')
app_test = pd.read_csv('~/Desktop/kaggle_default_risk/Default_data/application_test.csv')

In [3]:
# Checking size of both data files
print('Training set size:', app_train.shape)
print('Test set size:', app_test.shape)

Training set size: (307511, 122)
Test set size: (48744, 121)


In [4]:
print(app_train.describe)

<bound method NDFrame.describe of         SK_ID_CURR  TARGET NAME_CONTRACT_TYPE CODE_GENDER FLAG_OWN_CAR  \
0           100002       1         Cash loans           M            N   
1           100003       0         Cash loans           F            N   
2           100004       0    Revolving loans           M            Y   
3           100006       0         Cash loans           F            N   
4           100007       0         Cash loans           M            N   
5           100008       0         Cash loans           M            N   
6           100009       0         Cash loans           F            Y   
7           100010       0         Cash loans           M            Y   
8           100011       0         Cash loans           F            N   
9           100012       0    Revolving loans           M            N   
10          100014       0         Cash loans           F            N   
11          100015       0         Cash loans           F            N   
12  

In [5]:
# How many people repaid:0, and how many didn't:1 (target column)
app_train['TARGET'].value_counts()

0    282686
1     24825
Name: TARGET, dtype: int64

In [6]:
# What types of data are in our set
app_train.dtypes.value_counts()

float64    65
int64      41
object     16
dtype: int64

In [7]:
# Finding the number of unique classes in each column
app_train.select_dtypes('object').apply(pd.Series.nunique, axis = 0)

NAME_CONTRACT_TYPE             2
CODE_GENDER                    3
FLAG_OWN_CAR                   2
FLAG_OWN_REALTY                2
NAME_TYPE_SUITE                7
NAME_INCOME_TYPE               8
NAME_EDUCATION_TYPE            5
NAME_FAMILY_STATUS             6
NAME_HOUSING_TYPE              6
OCCUPATION_TYPE               18
WEEKDAY_APPR_PROCESS_START     7
ORGANIZATION_TYPE             58
FONDKAPREMONT_MODE             4
HOUSETYPE_MODE                 3
WALLSMATERIAL_MODE             7
EMERGENCYSTATE_MODE            2
dtype: int64

In [8]:
# Appending training and test data
df = app_train.append(app_test).reset_index()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [9]:
# One hot encoding of categorical variables
df = pd.get_dummies(df)

In [10]:
print('Shape of training features: ', app_train.shape)
print('Shape of testing features: ', app_test.shape)
print('Shape of testing features: ', df.shape)

Shape of training features:  (307511, 122)
Shape of testing features:  (48744, 121)
Shape of testing features:  (356255, 247)


In [11]:
#Spitting data into train and test
train_df = df[df['TARGET'].notnull()]

#slice into train and test
y = train_df['TARGET']
test_df = df[df['TARGET'].isnull()]

In [12]:
# Don't want to train on target column
del train_df['TARGET']
del test_df['TARGET']

In [14]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(train_df, y, random_state = 0)

In [15]:
X_train.isnull().sum().sort_values(ascending = False)[:20]

COMMONAREA_MODE             161161
COMMONAREA_MEDI             161161
COMMONAREA_AVG              161161
NONLIVINGAPARTMENTS_MEDI    160182
NONLIVINGAPARTMENTS_MODE    160182
NONLIVINGAPARTMENTS_AVG     160182
LIVINGAPARTMENTS_MEDI       157700
LIVINGAPARTMENTS_MODE       157700
LIVINGAPARTMENTS_AVG        157700
FLOORSMIN_MEDI              156535
FLOORSMIN_MODE              156535
FLOORSMIN_AVG               156535
YEARS_BUILD_MODE            153432
YEARS_BUILD_MEDI            153432
YEARS_BUILD_AVG             153432
OWN_CAR_AGE                 152228
LANDAREA_MEDI               136933
LANDAREA_MODE               136933
LANDAREA_AVG                136933
BASEMENTAREA_AVG            135037
dtype: int64

In [16]:
## Filling NaN values with mean
X_train = X_train.fillna(X_train.mean())

In [17]:
# Double checking NaN values have been taken care of
X_train.isnull().sum().sort_values(ascending = False)[:20]

WEEKDAY_APPR_PROCESS_START_WEDNESDAY    0
REGION_POPULATION_RELATIVE              0
OBS_60_CNT_SOCIAL_CIRCLE                0
OBS_30_CNT_SOCIAL_CIRCLE                0
NONLIVINGAREA_MODE                      0
NONLIVINGAREA_MEDI                      0
NONLIVINGAREA_AVG                       0
NONLIVINGAPARTMENTS_MODE                0
NONLIVINGAPARTMENTS_MEDI                0
NONLIVINGAPARTMENTS_AVG                 0
LIVINGAREA_MODE                         0
LIVINGAREA_MEDI                         0
LIVINGAREA_AVG                          0
LIVINGAPARTMENTS_MODE                   0
LIVINGAPARTMENTS_MEDI                   0
LIVINGAPARTMENTS_AVG                    0
LIVE_REGION_NOT_WORK_REGION             0
LIVE_CITY_NOT_WORK_CITY                 0
LANDAREA_MODE                           0
LANDAREA_MEDI                           0
dtype: int64

In [18]:
## XGBoosting of data
clf = XGBC(silent = False,verbose = True)

clf.fit(X_train, y_train, eval_set = [(X_test, y_test)], eval_metric = 'auc', 
        verbose = True, early_stopping_rounds = 10)

[0]	validation_0-auc:0.666857
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.668446
[2]	validation_0-auc:0.67506
[3]	validation_0-auc:0.678174
[4]	validation_0-auc:0.677953
[5]	validation_0-auc:0.682149
[6]	validation_0-auc:0.683095
[7]	validation_0-auc:0.684302
[8]	validation_0-auc:0.685592
[9]	validation_0-auc:0.684744
[10]	validation_0-auc:0.685222
[11]	validation_0-auc:0.688237
[12]	validation_0-auc:0.688743
[13]	validation_0-auc:0.689709
[14]	validation_0-auc:0.690195
[15]	validation_0-auc:0.691333
[16]	validation_0-auc:0.689875
[17]	validation_0-auc:0.690377
[18]	validation_0-auc:0.690393
[19]	validation_0-auc:0.690721
[20]	validation_0-auc:0.691927
[21]	validation_0-auc:0.692055
[22]	validation_0-auc:0.693879
[23]	validation_0-auc:0.695087
[24]	validation_0-auc:0.695805
[25]	validation_0-auc:0.696222
[26]	validation_0-auc:0.696964
[27]	validation_0-auc:0.697122
[28]	validation_0-auc:0.698042
[29]	validation_0-auc:0.698286
[30]	validation_0

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=False, subsample=1, verbose=True)

In [19]:
## Predict probabilities
preds = clf.predict_proba(test_df)[:,1]

In [20]:
## Array of predictions from model
preds

array([0.05642974, 0.08812372, 0.07856593, ..., 0.06928429, 0.06262127,
       0.21844716], dtype=float32)

In [21]:
test_df.head(10)

Unnamed: 0,index,AMT_ANNUITY,AMT_CREDIT,AMT_GOODS_PRICE,AMT_INCOME_TOTAL,AMT_REQ_CREDIT_BUREAU_DAY,AMT_REQ_CREDIT_BUREAU_HOUR,AMT_REQ_CREDIT_BUREAU_MON,AMT_REQ_CREDIT_BUREAU_QRT,AMT_REQ_CREDIT_BUREAU_WEEK,...,WALLSMATERIAL_MODE_Panel,"WALLSMATERIAL_MODE_Stone, brick",WALLSMATERIAL_MODE_Wooden,WEEKDAY_APPR_PROCESS_START_FRIDAY,WEEKDAY_APPR_PROCESS_START_MONDAY,WEEKDAY_APPR_PROCESS_START_SATURDAY,WEEKDAY_APPR_PROCESS_START_SUNDAY,WEEKDAY_APPR_PROCESS_START_THURSDAY,WEEKDAY_APPR_PROCESS_START_TUESDAY,WEEKDAY_APPR_PROCESS_START_WEDNESDAY
307511,0,20560.5,568800.0,450000.0,135000.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,0,1,0
307512,1,17370.0,222768.0,180000.0,99000.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
307513,2,69777.0,663264.0,630000.0,202500.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0
307514,3,49018.5,1575000.0,1575000.0,315000.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,1
307515,4,32067.0,625500.0,625500.0,180000.0,,,,,,...,0,0,0,1,0,0,0,0,0,0
307516,5,34600.5,959688.0,810000.0,270000.0,0.0,0.0,0.0,1.0,0.0,...,0,0,0,0,1,0,0,0,0,0
307517,6,22117.5,499221.0,373500.0,180000.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
307518,7,14220.0,180000.0,180000.0,166500.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,1,0,0,0,0,0,0
307519,8,28957.5,364896.0,315000.0,315000.0,0.0,0.0,0.0,0.0,0.0,...,0,1,0,0,0,0,0,1,0,0
307520,9,5337.0,45000.0,45000.0,162000.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,1,0


In [22]:
test_df['TARGET'] = preds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [23]:
test_df[['SK_ID_CURR', 'TARGET']]

Unnamed: 0,SK_ID_CURR,TARGET
307511,100001,0.056430
307512,100005,0.088124
307513,100013,0.078566
307514,100028,0.046820
307515,100038,0.257659
307516,100042,0.138790
307517,100057,0.032321
307518,100065,0.063049
307519,100066,0.023039
307520,100067,0.141701


In [24]:
## Create Submission File
test_df[['SK_ID_CURR', 'TARGET']].to_csv("xgb_default.csv", index = False)