# Import libraries and data

In [140]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [141]:
# Training features and their labels live in two separate CSV files.
Xtrain = pd.read_csv(filepath_or_buffer='Xtrain.csv')
Ytrain = pd.read_csv(filepath_or_buffer='Ytrain.csv')

In [142]:
Xtrain.head()

Unnamed: 0,Id,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementDate,DisbursementGross,BalanceGross,GrAppv,SBA_Appv
0,0,"NPE Realty, LLC",Pembroke Pines,FL,33027,"TD BANK, NATIONAL ASSOCIATION",DE,334516,16-Apr-10,2010,...,0,0,1,N,N,1-May-10,"$2,000,000.00",$0.00,"$2,000,000.00","$1,500,000.00"
1,1,"KEVCO CONSTRUCTION, LLC",SANDY,UT,84094,ZIONS FIRST NATIONAL BANK,UT,236115,30-Jan-10,2010,...,3,0,1,N,N,1-Feb-10,"$17,000.00",$0.00,"$17,000.00","$17,000.00"
2,2,EAST L A SHOE REPAIR,LOS ANGELES,CA,90022,BANK OF AMERICA NATL ASSOC,NC,811430,29-Jul-02,2002,...,1,1,1,Y,N,31-Aug-02,"$17,000.00",$0.00,"$17,000.00","$8,500.00"
3,3,"RUSSLER, DANIEL C, MD",LODI,WI,53555,ASSOCIATED BANK NATL ASSOC,WI,0,10-Jun-94,1994,...,0,1,0,N,N,31-Jul-94,"$137,000.00",$0.00,"$137,000.00","$123,300.00"
4,4,AFFORDABLE FAMILY DENISTRY LLC,LEBANON,MO,65536,COMMERCE BANK,MO,621210,6-Feb-06,2006,...,2,1,1,T,N,31-Mar-06,"$46,665.00",$0.00,"$25,000.00","$12,500.00"


In [143]:
Xtrain.columns

Index(['Id', 'Name', 'City', 'State', 'Zip', 'Bank', 'BankState', 'NAICS',
       'ApprovalDate', 'ApprovalFY', 'Term', 'NoEmp', 'NewExist', 'CreateJob',
       'RetainedJob', 'FranchiseCode', 'UrbanRural', 'RevLineCr', 'LowDoc',
       'DisbursementDate', 'DisbursementGross', 'BalanceGross', 'GrAppv',
       'SBA_Appv'],
      dtype='object')

In [144]:
# Print how many distinct values each training column holds
# (dropna=False so a missing value is counted as one distinct value).
for column, series in Xtrain.items():
    print(column, series.nunique(dropna=False))

Id 50000
Name 48757
City 9488
State 51
Zip 13794
Bank 2784
BankState 53
NAICS 1118
ApprovalDate 6934
ApprovalFY 47
Term 325
NoEmp 207
NewExist 4
CreateJob 98
RetainedJob 144
FranchiseCode 849
UrbanRural 3
RevLineCr 5
LowDoc 8
DisbursementDate 2434
DisbursementGross 14120
BalanceGross 1
GrAppv 3951
SBA_Appv 6048


In [145]:
Xtrain['UrbanRural'].unique()

array([1, 0, 2])

In [146]:
Ytrain.head()

Unnamed: 0,Id,ChargeOff
0,0,0
1,1,1
2,2,0
3,3,0
4,4,0


# Data Preprocessing

In [147]:
def currency_to_decimal(amount):
    """Convert a dollar-formatted amount such as ``'$2,000,000.00'`` to whole dollars.

    The value is converted to text, surrounding whitespace is stripped, and the
    ``$`` sign and thousands separators are removed. Everything after the
    decimal point is discarded (cents are truncated, not rounded).

    Parsing is done on the characters themselves rather than by slicing fixed
    positions, so the result does not depend on whether the text carries a
    trailing space after the cents.

    Parameters
    ----------
    amount : str
        Currency text, e.g. ``'$17,000.00'`` or ``'$17,000.00 '``.

    Returns
    -------
    int
        The whole-dollar part of the amount.

    Raises
    ------
    ValueError
        If the amount is empty or its whole-dollar part is not an integer.
    """
    cleaned = str(amount).strip().replace('$', '').replace(',', '')
    if not cleaned:
        raise ValueError('empty currency amount: {!r}'.format(amount))
    # Keep only what precedes the decimal point; '.50' has no whole part -> 0.
    whole_dollars = cleaned.partition('.')[0]
    return int(whole_dollars or '0')

In [148]:
def preprocessing(pd, categories=None):
    """One-hot encode the ``NewExist``, ``UrbanRural`` and ``LowDoc`` columns.

    Every value is first converted to its string form (so a missing value
    becomes the category ``'nan'``). Each column is then expanded into one 0/1
    indicator column per category, with categories in sorted order, and the
    per-column blocks are concatenated left to right in the order listed above.

    The encoding is done with numpy only, so it does not depend on which
    keyword scikit-learn's ``OneHotEncoder`` accepts for dense output.

    Parameters
    ----------
    pd : pandas.DataFrame
        Frame holding at least the three columns named above. NOTE: this
        parameter name hides the usual ``pd`` pandas alias inside the function;
        it is kept so existing callers keep working.
    categories : sequence of array-like, optional
        One collection of category labels per encoded column, in the column
        order above. Labels are compared after conversion to ``str``. When
        given, the output layout is fixed by these labels and a value that is
        not among them is encoded as an all-zero block for that column. When
        omitted (the default), the categories are the distinct values found in
        ``pd`` itself -- so two frames with different distinct values yield
        differently laid-out matrices. Pass the training categories when
        encoding a test set to keep the columns aligned.

    Returns
    -------
    numpy.ndarray
        ``float64`` array of shape ``(n_rows, total_number_of_categories)``.

    Raises
    ------
    ValueError
        If ``categories`` does not contain exactly one entry per encoded column.
    """
    columns = ['NewExist', 'UrbanRural', 'LowDoc']
    X = pd.loc[:, columns].values.astype(str)
    if categories is not None and len(categories) != len(columns):
        raise ValueError(
            'categories must have {} entries, got {}'.format(len(columns), len(categories))
        )

    n_rows = X.shape[0]
    blocks = []
    for i in range(X.shape[1]):
        column = X[:, i]
        if categories is None:
            # Sorted distinct values and, for each row, the index of its value.
            labels, codes = np.unique(column, return_inverse=True)
            codes = codes.reshape(-1)
            known = np.ones(n_rows, dtype=bool)
        else:
            labels = np.unique(np.asarray(categories[i]).astype(str))
            codes = np.searchsorted(labels, column)
            # searchsorted returns an insertion point; a value is only "known"
            # if that position exists and actually holds the same label.
            known = codes < labels.size
            known[known] = labels[codes[known]] == column[known]
        block = np.zeros((n_rows, labels.size))
        block[np.flatnonzero(known), codes[known]] = 1.0
        blocks.append(block)
    return np.concatenate(blocks, axis=1)

# Training

In [149]:
# Features and labels are paired purely by row position, so verify that both
# files list the same loans in the same order before fitting.
assert np.array_equal(Xtrain['Id'].values, Ytrain['Id'].values), \
    "Xtrain and Ytrain rows are not aligned on 'Id'"

# Fixed random_state keeps the fitted model reproducible across runs.
xgb_model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
xgb_model.fit(preprocessing(Xtrain), Ytrain['ChargeOff'].values)

X shape: (50000, 15)


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=0, num_parallel_tree=1,
              objective='binary:logistic', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Import test data 

In [150]:
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which can otherwise leave a column with mixed types.
Xtest = pd.read_csv('Xtest.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [151]:
Xtest.head()

Unnamed: 0,Id,Name,City,State,Zip,Bank,BankState,NAICS,ApprovalDate,ApprovalFY,...,RetainedJob,FranchiseCode,UrbanRural,RevLineCr,LowDoc,DisbursementDate,DisbursementGross,BalanceGross,GrAppv,SBA_Appv
0,0,Bayridge Grocery Inc.,BROOKLYN,NY,11209,BANK OF AMERICA NATL ASSOC,NC,445110,15-Dec-05,2006,...,6,0,1,Y,N,31-Dec-05,"$120,000.00",$0.00,"$100,000.00","$50,000.00"
1,1,SUBWAY,GILBERT,AZ,85297,INDEPENDENCE BANK,RI,722211,1-May-03,2003,...,0,78760,1,0,Y,30-Jun-03,"$130,000.00",$0.00,"$130,000.00","$110,500.00"
2,2,1-800 RADIATOR OF KATY,KATY,TX,77450,BANCO POPULAR NORTH AMERICA,NY,423120,17-Oct-06,2007,...,1,407,1,0,N,31-Dec-06,"$184,000.00",$0.00,"$184,000.00","$138,000.00"
3,3,"DAUGHTERY'S FOOD&FUEL CTR, INC",JEFFERSONVILLE,KY,40337,"COMMUNITY TRUST BANK, INC.",KY,447110,8-Nov-00,2001,...,0,1,1,N,N,30-Apr-01,"$80,000.00",$0.00,"$135,000.00","$101,250.00"
4,4,"TGGP ENTERPRISES, LLC",TWINSBURG,OH,44087,CAPITAL ONE NATL ASSOC,VA,722110,27-Oct-05,2006,...,9,0,1,N,N,31-Dec-05,"$50,000.00",$0.00,"$50,000.00","$25,000.00"


In [152]:
# Print how many distinct values each test column holds
# (dropna=False so a missing value is counted as one distinct value).
for column, series in Xtest.items():
    print(column, series.nunique(dropna=False))

Id 100000
Name 96410
City 13109
State 52
Zip 17763
Bank 3547
BankState 53
NAICS 1197
ApprovalDate 7907
ApprovalFY 86
Term 348
NoEmp 256
NewExist 4
CreateJob 118
RetainedJob 186
FranchiseCode 1230
UrbanRural 3
RevLineCr 9
LowDoc 8
DisbursementDate 3675
DisbursementGross 25160
BalanceGross 1
GrAppv 5811
SBA_Appv 9309


# Testing & Output

In [183]:
# NOTE: preprocessing() derives its one-hot categories from the frame it is
# given, so this matrix only lines up with the training columns if Xtest has
# the same distinct NewExist / UrbanRural / LowDoc values as Xtrain.
test_features = preprocessing(Xtest)
y_pred = xgb_model.predict(test_features)

X shape: (100000, 15)


In [184]:
y_pred

array([1, 0, 1, ..., 1, 0, 1])

In [185]:
# Take the ids from the test set itself rather than assuming they are 0..n-1,
# so every prediction stays attached to the loan it was made for.
out = pd.DataFrame({'Id': Xtest['Id'].values, 'ChargeOff': y_pred})

In [186]:
# index=False keeps the DataFrame's row index out of the written file.
out.to_csv(path_or_buf='out.csv', index=False)