In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm, tree
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

import xgboost

from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

# Load the training features and the matching labels.
train = pd.read_csv("train_values.csv")
y_labels = pd.read_csv("train_labels.csv")

# Attach the target by joining on row_id instead of relying on both files
# being stored in the same row order. validate= raises if row_id is duplicated
# on either side, and how='left' keeps the row order of train_values.csv.
train = train.merge(
    y_labels[['row_id', 'accepted']],
    on='row_id',
    how='left',
    validate='one_to_one',
)

# An unlabelled row would otherwise be silently median-imputed further down.
assert train['accepted'].notna().all(), "some training rows have no matching label"

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


#### Handle Missing Values

In [2]:
# Replace every missing value with the median of its own column, then
# summarise the imputed frame.
column_medians = train.median()
train = train.fillna(column_medians)
train.describe()

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,applicant_sex,applicant_income,population,minority_population_pct,ffiecmedian_family_income,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,accepted
count,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,...,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0,500000.0
mean,249999.5,1.366276,1.04765,2.06681,1.10959,221.753158,2.764722,181.606972,23.726924,144.542062,...,1.462374,100.121312,5396.982356,31.225669,69158.876302,92.200385,1423.172866,1880.147458,3720.121344,0.500228
std,144337.711634,0.690555,0.231404,0.948371,0.326092,590.641648,0.543061,138.464169,15.982768,100.243612,...,0.677685,147.47444,2667.723303,25.798784,14478.232811,13.990187,721.027517,893.717989,1838.313175,0.5
min,0.0,1.0,1.0,1.0,1.0,1.0,1.0,-1.0,-1.0,-1.0,...,1.0,1.0,14.0,0.534,17858.0,3.981,4.0,1.0,0.0,0.0
25%,124999.75,1.0,1.0,1.0,1.0,93.0,3.0,25.0,6.0,57.0,...,1.0,49.0,3805.0,11.191,60071.0,89.145,963.0,1323.0,2442.0,0.0
50%,249999.5,1.0,1.0,2.0,1.0,162.0,3.0,192.0,26.0,131.0,...,1.0,74.0,4975.0,22.901,67526.0,100.0,1327.0,1753.0,3731.0,1.0
75%,374999.25,2.0,1.0,3.0,1.0,266.0,3.0,314.0,37.0,246.0,...,2.0,112.0,6379.0,44.486,74714.25,100.0,1754.0,2275.0,5436.0,1.0
max,499999.0,4.0,3.0,3.0,3.0,100878.0,3.0,408.0,52.0,324.0,...,4.0,10139.0,37097.0,100.0,125248.0,100.0,8771.0,13623.0,6508.0,1.0


#### Feature Engineering

In [3]:
# loan_income_ratio: requested loan amount relative to the applicant's income
train['loan_income_ratio'] = train['loan_amount'].div(train['applicant_income'])

# location_code: product of state_code and county_code
train['location_code'] = train['state_code'].mul(train['county_code'])

# applicant_race_eth: product of applicant_race and applicant_ethnicity
train['applicant_race_eth'] = train['applicant_race'].mul(train['applicant_ethnicity'])

# family_income: bucket ffiecmedian_family_income into low / medium / high
# (<= 30000 -> low, <= 90000 -> medium, everything else -> high)
median_family_income = train['ffiecmedian_family_income']
income_band = np.where(
    median_family_income <= 30000.00,
    'low',
    np.where(median_family_income <= 90000.00, 'medium', 'high'),
)
train['family_income'] = pd.Categorical(income_band, categories=['low', 'medium', 'high'])

train.head(3)

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant,accepted,loan_income_ratio,location_code,applicant_race_eth,family_income
0,0,3,1,1,1,70.0,3,18,37,246,...,50.933,716.0,2642.0,4536,False,1,2.916667,9102,10,medium
1,1,1,1,3,1,178.0,3,369,52,299,...,100.0,1622.0,2108.0,2458,False,0,3.122807,15548,5,medium
2,2,2,1,3,1,163.0,3,16,10,306,...,100.0,760.0,1048.0,5710,False,1,2.432836,3060,10,medium


#### Encoding

In [4]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One encoder instance is re-fitted for each column in turn, so after this
# cell `le` holds the mapping of the last column only (family_income).
le = LabelEncoder()
for column_name in ('co_applicant', 'family_income'):
    train[column_name] = le.fit_transform(train[column_name])

train.head()

Unnamed: 0,row_id,loan_type,property_type,loan_purpose,occupancy,loan_amount,preapproval,msa_md,state_code,county_code,...,tract_to_msa_md_income_pct,number_of_owner-occupied_units,number_of_1_to_4_family_units,lender,co_applicant,accepted,loan_income_ratio,location_code,applicant_race_eth,family_income
0,0,3,1,1,1,70.0,3,18,37,246,...,50.933,716.0,2642.0,4536,0,1,2.916667,9102,10,2
1,1,1,1,3,1,178.0,3,369,52,299,...,100.0,1622.0,2108.0,2458,0,0,3.122807,15548,5,2
2,2,2,1,3,1,163.0,3,16,10,306,...,100.0,760.0,1048.0,5710,0,1,2.432836,3060,10,2
3,3,1,1,1,1,155.0,1,305,47,180,...,100.0,2025.0,2299.0,5888,1,1,1.47619,8460,10,2
4,4,1,1,1,1,305.0,3,24,37,20,...,82.2,1464.0,1847.0,289,0,1,4.295775,740,6,2


#### Train Test Split

In [5]:
from sklearn.model_selection import train_test_split

# target -- read the column instead of popping it: pop() removes 'accepted'
# from `train`, so re-running this cell would fail with a KeyError
y = train['accepted']

# model inputs: everything except the target, the row identifier and the
# columns listed below
features = train.drop(columns=['accepted',
                               'row_id', 'number_of_1_to_4_family_units',
                               'population', 'tract_to_msa_md_income_pct', 'applicant_ethnicity',
                               'number_of_owner-occupied_units', 'msa_md', 'state_code', 'county_code',
                               'ffiecmedian_family_income', 'minority_population_pct', 'applicant_race'
                               ])

# features as a plain numpy array
X = features.values

In [6]:
# hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=7,
)

In [7]:
# (rows, columns) of the training and hold-out partitions
(X_train.shape, X_test.shape)

((400000, 14), (100000, 14))

#### Scaling

In [8]:
from sklearn.preprocessing import StandardScaler

# learn the per-column mean / standard deviation from the training partition
# only, and standardise that partition in the same step
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)

# apply the training-set statistics to the hold-out partition
X_test = scaler.transform(X_test)

In [9]:
# first training row after standardisation
first_scaled_row = X_train[0]
print(first_scaled_row)

[-0.53006789 -0.20639017 -1.12519645  2.73471192  0.16107647  0.43341641
 -0.68216568  1.2777048   1.37043465  1.2245599  -0.18104255  0.06533304
 -1.45017223 -3.16584101]


#### XGBoost

In [10]:
# build and fit xgb (10 mins)
#
# Only parameters that XGBoost actually understands are passed:
#   * min_samples_split / min_samples_leaf are scikit-learn tree parameters,
#     not XGBoost ones; min_child_weight is the XGBoost leaf-size control.
#   * `nthred` was a misspelling of `nthread`; parallelism is already
#     requested via n_jobs=-1.
xgb = xgboost.XGBClassifier(
    objective='binary:hinge',   # hinge loss: predictions are hard 0/1 labels
    eval_metric='error',
    max_depth=11,
    min_child_weight=5,
    n_estimators=600,
    learning_rate=0.1,
    gamma=0,
    reg_alpha=0.005,
    subsample=0.8,
    scale_pos_weight=1,
    n_jobs=-1,
    random_state=50,
)

xgb = xgb.fit(X_train, y_train)

# print the xgb score (accuracy on the data the model was trained on)
print(xgb.score(X_train, y_train))

0.7663


  if diff:


In [None]:
# accuracy of the fitted model on the held-out partition
holdout_predictions = xgb.predict(X_test)
accuracy_score(y_test, holdout_predictions)

In [None]:
# xgb feature importances, labelled with the training column names
xgb_feature_importances = xgb.feature_importances_
feature_importances = pd.Series(data=xgb_feature_importances, index=features.columns)

# number of features with a non-zero importance
n_features = feature_importances.gt(0).sum()

# horizontal bar chart of the (up to) 30 largest importances, smallest at the bottom
top_importances = feature_importances.sort_values().tail(30)
top_importances.plot.barh(edgecolor="black", color="#1F77B4", figsize=(12, 5));

print("Feature importances out of " + str(n_features) + " total features")

#### Prepare Test Dataset

In [None]:
# features of the rows whose outcome has to be predicted
test = pd.read_csv(filepath_or_buffer="test_values.csv")

In [None]:
# fill missing values with the medians of the TRAINING frame, so the test set
# is imputed with the same statistics as the data the model was fitted on
# (median-filling the training frame earlier did not move its medians).
# numeric_only=True keeps this working if a non-numeric column is present.
training_medians = train.median(numeric_only=True)
test = test.fillna(training_medians)
test.head(3)

In [None]:
# add feature engineering (same derived columns as for the training frame)

# loan_income_ratio: requested loan amount relative to the applicant's income
test['loan_income_ratio'] = test['loan_amount'].div(test['applicant_income'])

# location_code: product of state_code and county_code
test['location_code'] = test['state_code'].mul(test['county_code'])

# applicant_race_eth: product of applicant_race and applicant_ethnicity
test['applicant_race_eth'] = test['applicant_race'].mul(test['applicant_ethnicity'])

# family_income: bucket ffiecmedian_family_income into low / medium / high
# (<= 30000 -> low, <= 90000 -> medium, everything else -> high)
test_median_family_income = test['ffiecmedian_family_income']
test_income_band = np.where(
    test_median_family_income <= 30000.00,
    'low',
    np.where(test_median_family_income <= 90000.00, 'medium', 'high'),
)
test['family_income'] = pd.Categorical(test_income_band, categories=['low', 'medium', 'high'])

test.head(3)

In [None]:
# encode test features with the SAME integer codes the training frame received.
# Re-fitting a LabelEncoder on the test data would renumber the classes
# whenever a class is absent from the test set.

# co_applicant: False -> 0, True -> 1
# (assumes a boolean column, as in the training preview -- TODO confirm)
test['co_applicant'] = test['co_applicant'].astype(int)

# family_income: fit on the full, fixed label set. LabelEncoder sorts its
# classes, so this always yields high -> 0, low -> 1, medium -> 2, which is
# what fitting on training data containing all three bands produces.
family_income_encoder = LabelEncoder().fit(['low', 'medium', 'high'])
test['family_income'] = family_income_encoder.transform(test['family_income'].astype(str))

test.head()

In [None]:
# trim the test set features
# Select exactly the columns the model was trained on, in the same order,
# instead of repeating the hard-coded drop list: the scaler and the model were
# fitted on a plain array, so column order is the only thing tying a value to
# its feature. A missing column raises a KeyError here rather than producing
# silently misaligned inputs.
test_features = test[features.columns]

In [None]:
# quick look at the first two rows of the trimmed test features
test_features.head(n=2)

In [None]:
# standardise the submission features with the scaler that was fitted on the
# training partition (no re-fitting here)
scaled_test_features = scaler.transform(test_features)
test_features = scaled_test_features

In [None]:
# first entry of test_features after the scaler.transform call in the previous cell
test_features[0]

#### Prediction

In [None]:
# run the fitted model on the "test" dataset, whose true outcomes are unknown,
# and show the predicted labels
predict = xgb.predict(test_features)
print(predict)

In [None]:
# Build the submission frame. The row ids must come from the TEST set that was
# just scored; the training labels (y_labels) describe different rows and need
# not even have the same length as the predictions.
assert len(predict) == len(test), "number of predictions does not match the test set"

output = pd.DataFrame({
    'row_id': test['row_id'].values,
    'accepted': predict,
})

In [None]:
# write the submission file without the DataFrame index column
output.to_csv(path_or_buf="submission2.csv", index=False)