In [1]:
%%javascript
IPython.OutputArea.prototype._should_scroll = function(lines) {
    return false;
}

<IPython.core.display.Javascript object>

In [2]:
#import main packages
import os
import glob
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

import matplotlib.pyplot as plt
import scikitplot as skplt

In [3]:
from sklearn.preprocessing import StandardScaler

In [4]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [5]:
from collections import Counter
import pprint
import pickle

In [6]:
#import the test dataset
test = pd.read_csv("../data/test.csv")

In [7]:
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 367 entries, 0 to 366
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Loan_ID            367 non-null    object 
 1   Gender             356 non-null    object 
 2   Married            367 non-null    object 
 3   Dependents         357 non-null    object 
 4   Education          367 non-null    object 
 5   Self_Employed      344 non-null    object 
 6   ApplicantIncome    367 non-null    int64  
 7   CoapplicantIncome  367 non-null    int64  
 8   LoanAmount         362 non-null    float64
 9   Loan_Amount_Term   361 non-null    float64
 10  Credit_History     338 non-null    float64
 11  Property_Area      367 non-null    object 
dtypes: float64(3), int64(2), object(7)
memory usage: 34.5+ KB


In [9]:
#change the columns to their correct types
test['Credit_History'] = test['Credit_History'].astype('object')

### Engineer features

In [10]:
#other features

#total income
test['TotalIncome'] = test['ApplicantIncome'] + test['CoapplicantIncome']

#monthly income
test['MonthlyIncome'] = test['TotalIncome'] / 12

#amount is in thousands so for comparisons to income, put it back up to thousands
test['LoanAmount'] = test['LoanAmount']  * 1000

#monthly installment
test['MonthlyInstallment'] = test['LoanAmount'] / test['Loan_Amount_Term']

#affordability
test['Affordability'] = (test['MonthlyInstallment'] < test['MonthlyIncome'])

#make it a Yes or No only
test['Affordability'] = np.where(test['Affordability'] == True, 'Yes', 'No')

In [11]:
#create additional features - logarithms
test['LogApplicantIncome'] = np.log(test['ApplicantIncome'] + 1)
test['LogCoApplicantIncome'] = np.log(test['CoapplicantIncome'] + 1)
test['LogLoanAmount'] = np.log(test['LoanAmount'] + 1)
test['LogMonthlyInstallment'] = np.log(test['MonthlyInstallment'] + 1)
test['LogMonthlyIncome'] = np.log(test['MonthlyIncome'] + 1)

In [12]:
#replace missing values
test['Gender'] = test['Gender'].fillna('Male')
test['Married'] = test['Married'].fillna('Yes')
test['Dependents'] = test['Dependents'].fillna('0')
test['Self_Employed'] = test['Self_Employed'].fillna('No')
test['LoanAmount'] = test['LoanAmount'].fillna(np.nanmedian(test['LoanAmount']))
test['Loan_Amount_Term'] = test['Loan_Amount_Term'].fillna(np.nanmedian(test['Loan_Amount_Term']))
test['Credit_History'] = test['Credit_History'].fillna(str(test['Credit_History'].mode()[0]))
test['LogApplicantIncome'] = test['LogApplicantIncome'].fillna(np.nanmedian(test['LogApplicantIncome']))
test['LogCoApplicantIncome'] = test['LogCoApplicantIncome'].fillna(np.nanmedian(test['LogCoApplicantIncome']))
test['LogLoanAmount'] = test['LogLoanAmount'].fillna(np.nanmedian(test['LogLoanAmount']))
test['LogMonthlyInstallment'] = test['LogMonthlyInstallment'].fillna(np.nanmedian(test['LogMonthlyInstallment']))
test['LogMonthlyIncome'] = test['LogMonthlyIncome'].fillna(np.nanmedian(test['LogMonthlyIncome']))

In [13]:
#create new features - brackets
test['ApplicantIncomeBracket'] = np.where(test['ApplicantIncome'] < 2800, 'Low_Applicant_Income',
                                          np.where(test['ApplicantIncome'] < 3800, 'Medium_Applicant_Income',
                                          np.where(test['ApplicantIncome'] < 5800, 'High_Applicant_Income', 'Affluent_Applicant_Income')))

test['CoapplicantIncomeBracket'] = np.where(test['CoapplicantIncome'] < 500, 'Low_CoApplicant_Income',
                                          np.where(test['CoapplicantIncome'] < 1100, 'Medium_CoApplicant_Income',
                                          np.where(test['CoapplicantIncome'] < 2200, 'High_CoApplicant_Income', 'Affluent_CoApplicant_Income')))



test['TotalIncomeBracket'] = np.where(test['TotalIncome'] < 4000, 'Low_Total_Income',
                                          np.where(test['TotalIncome'] < 5500, 'Medium_Total_Income',
                                          np.where(test['TotalIncome'] < 7500, 'High_Total_Income', 'Affluent_Total_Income')))

test['LoanAmountBracket'] = np.where(test['LoanAmount'] < 100000, 'Low_Loan_Value',
                                        np.where(test['LoanAmount'] < 130000, 'Medium_Loan_Value',
                                        np.where(test['LoanAmount'] < 170000, 'High_Loan_Value', 'Affluent_Loan_Value')))


In [14]:
#feature types
quantitative = [f for f in test.columns if test.dtypes[f] != 'object']

qualitative = [f for f in test.columns if test.dtypes[f] == 'object']
qualitative.remove('Loan_ID')

In [15]:
quantitative

['ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'TotalIncome',
 'MonthlyIncome',
 'MonthlyInstallment',
 'LogApplicantIncome',
 'LogCoApplicantIncome',
 'LogLoanAmount',
 'LogMonthlyInstallment',
 'LogMonthlyIncome']

In [16]:
qualitative

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'Credit_History',
 'Property_Area',
 'Affordability',
 'ApplicantIncomeBracket',
 'CoapplicantIncomeBracket',
 'TotalIncomeBracket',
 'LoanAmountBracket']

In [17]:
test.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,TotalIncome,MonthlyIncome,MonthlyInstallment,Affordability,LogApplicantIncome,LogCoApplicantIncome,LogLoanAmount,LogMonthlyInstallment,LogMonthlyIncome,ApplicantIncomeBracket,CoapplicantIncomeBracket,TotalIncomeBracket,LoanAmountBracket
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110000.0,360.0,1.0,Urban,5720,476.666667,305.555556,Yes,8.651899,0.0,11.608245,5.725399,6.168913,High_Applicant_Income,Low_CoApplicant_Income,High_Total_Income,Medium_Loan_Value
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126000.0,360.0,1.0,Urban,4576,381.333333,350.0,Yes,8.03171,7.313887,11.744045,5.860786,5.946293,Medium_Applicant_Income,High_CoApplicant_Income,Medium_Total_Income,Medium_Loan_Value
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208000.0,360.0,1.0,Urban,6800,566.666667,577.777778,No,8.517393,7.496097,12.245298,6.360919,6.341534,High_Applicant_Income,High_CoApplicant_Income,High_Total_Income,Affluent_Loan_Value
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100000.0,360.0,1.0,Urban,4886,407.166667,277.777778,Yes,7.758333,7.842671,11.512935,5.630415,6.011676,Low_Applicant_Income,Affluent_CoApplicant_Income,Medium_Total_Income,Medium_Loan_Value
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78000.0,360.0,1.0,Urban,3276,273.0,216.666667,Yes,8.094684,0.0,11.264477,5.382965,5.613128,Medium_Applicant_Income,Low_CoApplicant_Income,Low_Total_Income,Low_Loan_Value


In [18]:
#scale the dataset
#scaler = StandardScaler()
#test_clean[quantitative] = scaler.fit_transform(test_clean[quantitative])

In [19]:
test.dtypes

Loan_ID                      object
Gender                       object
Married                      object
Dependents                   object
Education                    object
Self_Employed                object
ApplicantIncome               int64
CoapplicantIncome             int64
LoanAmount                  float64
Loan_Amount_Term            float64
Credit_History               object
Property_Area                object
TotalIncome                   int64
MonthlyIncome               float64
MonthlyInstallment          float64
Affordability                object
LogApplicantIncome          float64
LogCoApplicantIncome        float64
LogLoanAmount               float64
LogMonthlyInstallment       float64
LogMonthlyIncome            float64
ApplicantIncomeBracket       object
CoapplicantIncomeBracket     object
TotalIncomeBracket           object
LoanAmountBracket            object
dtype: object

In [20]:
#get dummies
#test_clean = pd.get_dummies(test_clean)

In [21]:
#encoding the categorical variables
def encode(frame, feature):
    ordering = pd.DataFrame()
    ordering['val'] = frame[feature].unique()
    ordering.index = ordering.val
    ordering['ordering'] = range(1, ordering.shape[0]+1)
    ordering = ordering['ordering'].to_dict()
    
    for cat, o in ordering.items():
        frame.loc[frame[feature] == cat, feature+'_E'] = o
    
qual_encoded = []
for q in qualitative:  
    encode(test, q)
    qual_encoded.append(q+'_E')
print(qual_encoded)

['Gender_E', 'Married_E', 'Dependents_E', 'Education_E', 'Self_Employed_E', 'Credit_History_E', 'Property_Area_E', 'Affordability_E', 'ApplicantIncomeBracket_E', 'CoapplicantIncomeBracket_E', 'TotalIncomeBracket_E', 'LoanAmountBracket_E']


In [22]:
#drop unnecessary columns
submission = test[['Loan_ID']]

test_clean = test.drop(['Loan_ID', 'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History',
                          'Property_Area', 'ApplicantIncomeBracket', 'CoapplicantIncomeBracket', 'TotalIncomeBracket',
                          'LoanAmountBracket', 'Affordability', 'ApplicantIncome', 'CoapplicantIncome',
                         'LoanAmount', 'Loan_Amount_Term', 'TotalIncome', 'MonthlyIncome', 'MonthlyInstallment'], axis = 1)

In [23]:
train_columns = ['Loan_Status', 'LogApplicantIncome', 'LogCoApplicantIncome',
       'LogMonthlyIncome', 'LogLoanAmount', 'LogMonthlyInstallment',
       'Gender_E', 'Married_E', 'Dependents_E', 'Education_E',
       'Self_Employed_E', 'Credit_History_E', 'Property_Area_E',
       'ApplicantIncomeBracket_E', 'CoapplicantIncomeBracket_E',
       'TotalIncomeBracket_E', 'LoanAmountBracket_E', 'Affordability_E']

In [24]:
expected_columns = [i for i in train_columns  if i not in test_clean.columns]
expected_columns

['Loan_Status']

In [25]:
test_clean.isnull().sum()

LogApplicantIncome            0
LogCoApplicantIncome          0
LogLoanAmount                 0
LogMonthlyInstallment         0
LogMonthlyIncome              0
Gender_E                      0
Married_E                     0
Dependents_E                  0
Education_E                   0
Self_Employed_E               0
Credit_History_E              0
Property_Area_E               0
Affordability_E               0
ApplicantIncomeBracket_E      0
CoapplicantIncomeBracket_E    0
TotalIncomeBracket_E          0
LoanAmountBracket_E           0
dtype: int64

In [26]:
test_clean.shape

(367, 17)

In [27]:
submission.head()

Unnamed: 0,Loan_ID
0,LP001015
1,LP001022
2,LP001031
3,LP001035
4,LP001051


### Apply models

In [28]:
#get the pickle model files
os.chdir("../models")

#model number counter
i = 1

for file in glob.glob("*.pkl"):
    
    with open("../models/" + file, "rb") as file:
        model = pickle.load(file)
        
    #get the model name
    model_name = str(file).split("/")[2].split(".")[0]
    
    #for now just run the logistic regression model
    if model_name == 'RF':
    
        #do the prediction with this model
        prediction = model.predict(test_clean)
        
        #replace the 1s and 0s predicted with Y and N
        prediction = np.where(prediction == 1, 'N', 'Y')

        #create a column for this model's run and its predictions
        submission[model_name] = prediction
    
    file.close()

Feature names must be in the same order as they were in fit.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission[model_name] = prediction


In [29]:
#change column names for submission
submission.columns = ['Loan_ID', 'Loan_Status']

In [30]:
submission.head()

Unnamed: 0,Loan_ID,Loan_Status
0,LP001015,Y
1,LP001022,Y
2,LP001031,Y
3,LP001035,N
4,LP001051,Y


In [31]:
#write a csv for this submission
submission.to_csv("../data/submission.csv", index = False)