In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
import warnings
warnings.filterwarnings("ignore")

### Load Data

In [None]:
# CSV paths (relative to the working directory) for the training data and
# the validation data used later in the notebook.
trainCsv, valCsv = "cleanLoanDataTrain.csv", "cleanLoanDataValidation.csv"

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
trainDf = pd.read_csv(filepath_or_buffer=trainCsv)
trainDf.head(n=5)

In [None]:
# Read the validation CSV into a DataFrame and preview its first five rows.
valDf = pd.read_csv(filepath_or_buffer=valCsv)
valDf.head(n=5)

### Logistic Regression

In [None]:
# Baseline model: logistic regression with scikit-learn's default
# hyperparameters. The bare name on the last line displays the estimator.
classifier = LogisticRegression()
classifier

In [None]:
# Features are every column except the target; the target is "Loan_Status".
X = trainDf.drop("Loan_Status", axis="columns")
y = trainDf.loc[:, "Loan_Status"]

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the
# shuffle deterministic, so the reported scores are reproducible across
# kernel restarts and comparable with other models split using this seed.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

In [None]:
# Fit the baseline model on the training portion of the split.
classifier.fit(X=x_train, y=y_train)

In [None]:
# Mean accuracy of the baseline model on the training and held-out rows.
trainAccuracy = classifier.score(x_train, y_train)
testAccuracy = classifier.score(x_test, y_test)

print(f"Training Data Score: {trainAccuracy}")
print(f"Testing Data Score: {testAccuracy}")

### Logistic Regression - after log-transform
* After inspecting the histograms, the income and loan amount distributions are visibly skewed
* Apply a log transformation to these columns to reduce the effect of outliers
* The model's accuracy increases after the transformation

In [None]:
# Columns whose raw and log-transformed distributions are compared.
colList = ["LoanAmount", "TotalIncome"]

# The seaborn theme is global state, so set it once instead of once per figure.
sns.set(font_scale=1.5)

for col in colList:
    # Raw distribution, normalised to a density over 30 bins.
    fig1, ax1 = plt.subplots(figsize=(12, 7))
    ax1.hist(trainDf[col], density=True, bins=30)
    ax1.set_title('Histogram of ' + col)
    ax1.set_xlabel(col)
    ax1.set_ylabel('Density')

    # Same column after a natural-log transform.
    # NOTE(review): np.log yields -inf/NaN for zero or negative values;
    # this assumes both columns are strictly positive -- confirm.
    fig2, ax2 = plt.subplots(figsize=(12, 7))
    ax2.hist(np.log(trainDf[col]), density=True, bins=30)
    # The original title read "Log-tranformed of<col>": typo and missing space.
    ax2.set_title('Histogram of Log-transformed ' + col)
    ax2.set_xlabel('log(' + col + ')')
    ax2.set_ylabel('Density')


In [None]:
# Append natural-log versions of the two skewed columns to the training
# frame; the raw columns are kept alongside them at this point.
trainDf = trainDf.assign(
    Log_TotalIncome=np.log(trainDf["TotalIncome"]),
    Log_LoanAmount=np.log(trainDf["LoanAmount"]),
)

In [None]:
# Rebuild the feature matrix without the target and without the raw
# LoanAmount/TotalIncome columns, leaving only their log versions.
X = trainDf.drop(["Loan_Status", "LoanAmount", "TotalIncome"], axis="columns")
y = trainDf.loc[:, "Loan_Status"]

In [None]:
# Second logistic regression (default hyperparameters), to be trained on
# the log-transformed features. The bare name displays the estimator.
classifierLog = LogisticRegression()
classifierLog

In [None]:
# Hold out 25% of the rows for testing. A fixed random_state makes the
# shuffle deterministic, so the reported scores are reproducible across
# kernel restarts and comparable with other models split using this seed.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Fit the log-feature model on the training portion.
classifierLog.fit(x_train, y_train)

In [None]:
# Mean accuracy of the log-feature model on the training and held-out rows.
logTrainAccuracy = classifierLog.score(x_train, y_train)
logTestAccuracy = classifierLog.score(x_test, y_test)

print(f"Training Data Score: {logTrainAccuracy}")
print(f"Testing Data Score: {logTestAccuracy}")

### Save the model 
* use pickle
* use joblib

In [None]:
# Give the validation frame the same feature layout used for classifierLog:
# add the two log columns, then drop their raw counterparts.
# NOTE(review): this rebinds valDf, so re-running the cell fails with a
# KeyError because the raw columns are already gone.
valDf = valDf.assign(
    Log_TotalIncome=np.log(valDf["TotalIncome"]),
    Log_LoanAmount=np.log(valDf["LoanAmount"]),
).drop(columns=["LoanAmount", "TotalIncome"])

In [None]:
# Round-trip the trained model through pickle entirely in memory, then
# predict with the restored copy.
import pickle 
  
# Serialize the trained model to a pickle byte string (nothing is written to disk).
saved_model = pickle.dumps(classifierLog) 
  
# Rebuild a model object from those bytes.
# Only unpickle data you trust: loading a pickle can execute arbitrary code.
classifier_from_pickle = pickle.loads(saved_model) 
  
# Predict on the validation frame with the restored model.
classifier_from_pickle.predict(valDf) 

In [None]:
import os

import joblib

# Single source of truth for the on-disk location of the model.
modelPath = 'data/logisticRegression.pkl'

# joblib.dump does not create missing parent directories, so make sure
# "data/" exists; otherwise a fresh checkout fails with FileNotFoundError.
os.makedirs(os.path.dirname(modelPath), exist_ok=True)

# Persist the trained model to disk.
joblib.dump(classifierLog, modelPath)

# Load the model back from the file.
# Only load files you trust: joblib files are pickle-based.
classifier_from_joblib = joblib.load(modelPath)

# Predict on the validation frame with the reloaded model.
classifier_from_joblib.predict(valDf)

In [None]:
# Build a single hand-written applicant row to smoke-test the saved model.
colNames = [
    'Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
    'LoanAmount', 'Loan_Amount_Term', 'Credit_History', 'Property_Area',
    'TotalIncome',
]
colValues = [[1], [1], [1], [0], [0], [400000], [360], [0], [1], [60000]]

# Each name maps to a one-element list, giving a frame with exactly one row.
testDummyDf = pd.DataFrame(dict(zip(colNames, colValues)))

# Same feature engineering as the other frames: append the log columns,
# then remove the raw LoanAmount/TotalIncome columns.
testDummyDf = testDummyDf.assign(
    Log_TotalIncome=lambda frame: np.log(frame["TotalIncome"]),
    Log_LoanAmount=lambda frame: np.log(frame["LoanAmount"]),
).drop(columns=["LoanAmount", "TotalIncome"])

testDummyDf

In [None]:
# Predict on the hand-built dummy row with the joblib-restored model.
classifier_from_joblib.predict(X=testDummyDf)