In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion

from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest

from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay # want the ones with false neg not positives
from sklearn.metrics import recall_score, precision_score, roc_auc_score, plot_roc_curve, roc_curve, auc, RocCurveDisplay

from matplotlib import pyplot as plt
# import seaborn as sns
from sklearn import set_config # for plotting pipeline



In [2]:
# Load data
df = pd.read_csv("data.csv") 

# Separate out target, and drop id column
X = df.drop(columns=['Loan_Status','Loan_ID'])
y = df['Loan_Status'].replace({'Y':1, 'N':0})

# Test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=88)



In [3]:
# Split into cat_feats and num_feats
cat_feats = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Credit_History', 'Property_Area']
num_feats = ['ApplicantIncome', 'CoapplicantIncome', 'LoanAmount', 'Loan_Amount_Term']

def numFeat(data):
    return data[num_feats]

def catFeat(data):
    return data[cat_feats]

keep_num = FunctionTransformer(numFeat)
keep_cat = FunctionTransformer(catFeat)

# Note: Loan amount term is really more categorical, but leaving as numeric so can use in calculations - and will scale
# Credit history will need to be converted to categorical

In [4]:
# Add new columns
def replace_income_with_total_income_log(data):
    data['Total_Income_Log'] = np.log(data['ApplicantIncome'] + data['CoapplicantIncome'])
    data.drop(labels=['ApplicantIncome','CoapplicantIncome'], axis=1, inplace=True)
    return data

def add_LoanAmt_Term_Ratio_Log(data):
    data['LoanAmt_Term_Ratio_Log']=  np.log(data.LoanAmount/data.Loan_Amount_Term)
    return data

def replace_loanamount_with_loanamount_log(X):
    data['LoanAmount_Log'] = np.log(data.LoanAmount)
    data.drop(labels=['LoanAmount'], axis=1, inplace=True)
    return data


def inject_features(data):
    data['Total_Income_Log'] = 1 #np.log(data['ApplicantIncome'] + data['CoapplicantIncome'])
    data['LoanAmt_Term_Ratio_Log']=  1 #np.log(data['LoanAmount']/data['Loan_Amount_Term'])
    data['LoanAmount_Log'] = 1 #np.log(data['LoanAmount'])
    data.drop(labels=['ApplicantIncome','CoapplicantIncome', 'LoanAmount'], axis=1, inplace=True)
    data.reset_index(inplace=True)
    return data



add_total_income_log_object = FunctionTransformer(replace_income_with_total_income_log)
add_loanamt_term_ratio_log_object = FunctionTransformer(add_LoanAmt_Term_Ratio_Log)
add_loanamount_log_object = FunctionTransformer(replace_loanamount_with_loanamount_log)

injected = FunctionTransformer(inject_features)



In [5]:

enc = OneHotEncoder(sparse=False)

pca = PCA(n_components=3)

class ToDenseTransformer():

    # here you define the operation it should perform
    def transform(self, X, y=None, **fit_params):
        return X.todense()

    # just return self
    def fit(self, X, y=None, **fit_params):
        return self
    
to_dense = ToDenseTransformer()


selection = SelectKBest(k=3)

from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

base_model = Ridge()

In [6]:
def fill_null(data):
    fill_NaN = SimpleImputer(missing_values=np.nan, strategy='mean')
    imputed = pd.DataFrame(fill_NaN.fit_transform(data))
    imputed.columns = data.columns
    imputed.index = data.index
    return imputed

fill_null = FunctionTransformer(fill_null)    

In [7]:

numeric_transform = Pipeline([('keep_num', keep_num),
                            ('impute_median', fill_null),
                            ('injected', injected),
                            ('scaling', StandardScaler()),
                            ("kbest", selection)]) 


categorical_transform = Pipeline([('keep_cat', keep_cat),
                                ('impute_mode', SimpleImputer(strategy='most_frequent')), 
                                  ('one-hot-encode', OneHotEncoder(sparse=False)),
                                  #("to_dense", to_dense),
                                 ("pca", pca)])

all_features = FeatureUnion([('numeric_features', numeric_transform),
                            ('categorical_features', categorical_transform)])

In [8]:
main_pipeline = Pipeline([('all_features', all_features),
                     ("model", LogisticRegression())])

model = main_pipeline.fit(X_train, y_train)

# X_test = X_test.dropna()

y_pred = model.predict(X_test)

acc = accuracy_score(y_test, y_pred)
recall = recall_score(y_test, y_pred, average='macro')
precision =  precision_score(y_test, y_pred, average='micro')

print(f'Test set accuracy: {acc}')
print(f'Test set recall: {recall}')
print(f'Precision: {precision}')





Test set accuracy: 0.6910569105691057
Test set recall: 0.5487804878048781
Precision: 0.6910569105691057


  f = msb / msw
