In [16]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
loan_sanction_train = pd.read_csv('loan_sanction_train.csv')
loan_sanction_test = pd.read_csv('loan_sanction_test.csv')


In [17]:
df = loan_sanction_train
df = df.dropna(subset=df.columns.values)

In [None]:
import pandas as pd
import numpy as np

# Assuming df is your DataFrame
# Replace the column names as per your actual DataFrame
df['Gender'] = np.where(df['Gender'] == 'Male', 0, 1)
df['Married'] = np.where(df['Married'] == 'Yes', 0, 1)
df['Education'] = np.where(df['Education'] == 'Graduate', 0, 1)
df['Self_Employed'] = np.where(df['Self_Employed'] == 'Yes', 0, 1)

# Using map for Dependents and Property_Area
dependents_mapping = {'0': 0, '1': 1, '2': 2, '3+': 3}
property_area_mapping = {'Urban': 0, 'Rural': 1, 'Semiurban': 2}

df['Dependents'] = df['Dependents'].map(dependents_mapping)
df['Property_Area'] = df['Property_Area'].map(property_area_mapping)

# For Loan_Status
loan_status_mapping = {'Y': 0, 'N': 1}
df['Loan_Status'] = df['Loan_Status'].map(loan_status_mapping)



In [19]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001003,0,0,1,0,1,4583,1508.0,128.0,360.0,1.0,1,1
1,LP001005,0,0,0,0,0,3000,0.0,66.0,360.0,1.0,0,0
2,LP001006,0,0,0,1,1,2583,2358.0,120.0,360.0,1.0,0,0
3,LP001008,0,1,0,0,1,6000,0.0,141.0,360.0,1.0,0,0
4,LP001011,0,0,2,0,0,5417,4196.0,267.0,360.0,1.0,0,0


In [20]:
FEATURES = list(df.columns[1:12])
FEATURES

['Gender',
 'Married',
 'Dependents',
 'Education',
 'Self_Employed',
 'ApplicantIncome',
 'CoapplicantIncome',
 'LoanAmount',
 'Loan_Amount_Term',
 'Credit_History',
 'Property_Area']

In [21]:
result_dict = {}

In [22]:
def summarize_classification(y_test, y_pred):
    
    acc = accuracy_score(y_test, y_pred, normalize=True)
    num_acc = accuracy_score(y_test, y_pred, normalize=False)

    prec = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    
    return {'accuracy': acc, 
            'precision': prec,
            'recall':recall, 
            'accuracy_count':num_acc}

In [23]:
def build_model(classifier_fn,                
                name_of_y_col, 
                names_of_x_cols, 
                dataset, 
                test_frac=0.2):
    
    X = dataset[names_of_x_cols]
    Y = dataset[name_of_y_col]

    x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_frac)
       
    model = classifier_fn(x_train, y_train)
    
    y_pred = model.predict(x_test)

    y_pred_train = model.predict(x_train)
    
    train_summary = summarize_classification(y_train, y_pred_train)
    test_summary = summarize_classification(y_test, y_pred)
    
    pred_results = pd.DataFrame({'y_test': y_test,
                                 'y_pred': y_pred})
    
    model_crosstab = pd.crosstab(pred_results.y_pred, pred_results.y_test)
    
    return {'training': train_summary, 
            'test': test_summary,
            'confusion_matrix': model_crosstab}

In [24]:
def compare_results():
    for key in result_dict:
        print('Classification: ', key)

        print()
        print('Training data')
        for score in result_dict[key]['training']:
            print(score, result_dict[key]['training'][score])

        print()
        print('Test data')
        for score in result_dict[key]['test']:
            print(score, result_dict[key]['test'][score])
       
        print()

In [25]:
def decision_tree_fn(x_train, y_train, max_depth=None, max_features=None): 
    
    model = DecisionTreeClassifier(max_depth=max_depth, max_features=max_features)
    model.fit(x_train, y_train)
    
    return model

In [26]:
result_dict['Loan_Status ~ decision_tree'] = build_model(decision_tree_fn,
                                                 'Loan_Status',
                                                  FEATURES,
                                                  df)

compare_results()

Classification:  Loan_Status ~ decision_tree

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 384.0

Test data
accuracy 0.5729166666666666
precision 0.3939393939393939
recall 0.38235294117647056
accuracy_count 55.0



In [27]:
def logistic_fn(x_train, y_train):
    
    model = LogisticRegression(solver='liblinear')
    model.fit(x_train, y_train)
    
    return model

In [29]:
result_dict['Loan_Status ~ logistic'] = build_model(logistic_fn,
                                              'Loan_Status',
                                               FEATURES,
                                               df)

compare_results()

Classification:  Loan_Status ~ decision_tree

Training data
accuracy 1.0
precision 1.0
recall 1.0
accuracy_count 384.0

Test data
accuracy 0.5729166666666666
precision 0.3939393939393939
recall 0.38235294117647056
accuracy_count 55.0

Classification:  Loan_Status ~ logistic

Training data
accuracy 0.8151041666666666
precision 0.8679245283018868
recall 0.41818181818181815
accuracy_count 313.0

Test data
accuracy 0.7708333333333334
precision 0.9444444444444444
recall 0.4473684210526316
accuracy_count 74.0

