In [12]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline


from sklearn.model_selection import train_test_split
#from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier



from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
#from sklearn.inspection import DecisionBoundaryDisplay


#import optuna

In [13]:
train_data= pd.read_csv("train_data.csv")
train_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [25]:

train_data["Dependents"].head()

0    0
1    1
2    0
3    0
4    0
Name: Dependents, dtype: object

In [15]:
train_data['Loan_Status'] = train_data['Loan_Status'].replace(('Y', 'N'), (1, 0))

X = train_data.iloc[:,1:12] # Leave out the Loan ID and the target column 
y= train_data.iloc[:,-1]
X_train, X_val , y_train , y_val = train_test_split(X,y, test_size=0.2,random_state = 30, stratify=y)

print(X.shape)
print(X_train.shape)

(614, 11)
(491, 11)


In [36]:
print(X_train.columns)

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')


## Feature Engineering

In [26]:
# define the feature engineering funtion

#Add applicant and coapplicant's income to get the total income
def get_total_income(df):
    df['TotalIncome'] = df['ApplicantIncome'] + df['CoapplicantIncome'] 
    df['SquaredTotalIncome'] = np.sqrt(df['ApplicantIncome'] * df['CoapplicantIncome'])
    return df

def binarize_dependants(df):
    df['Dependents'] = np.where( (df['Dependents']=="0") | (df['Dependents']=="1"),0,1)
    return df

In [28]:
# Feature Engineering Pipeline

feature_engineering = ColumnTransformer([
    ('total_income', FunctionTransformer(get_total_income, validate=False),
      ['ApplicantIncome', 'CoapplicantIncome']),
    ('Dependents', FunctionTransformer(binarize_dependants, validate=False),
      ["Dependents"] ),
    #add more feature engineering functions as you like 
   
])

## Pipeline for Categorical and Numerical features

In [37]:
categorical_columns = list(X_train.select_dtypes(include=['object']).columns.values.tolist())
numeric_columns = list(X_train.select_dtypes(exclude=['object']).columns.values.tolist())
all_columns = categorical_columns + numeric_columns

In [38]:
categorical_transformer = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ])

numeric_transformer = Pipeline(
    steps= [
        ("imputer", SimpleImputer(strategy='constant')),
        ("normalise",StandardScaler()),
])

For the numerical data, we’ll use the SimpleImputer class to fill in any missing values with the mean of the column. 
For the categorical columns we’ll use the SimpleImputer class to fill in any missing values with the most frequent value in the column. We’ll then use the OneHotEncoder class to one-hot encode the categorical columns.

## Preprocessor 

In [39]:
preprocessor = ColumnTransformer(
    transformers=[
        ('feature_engineering', feature_engineering, all_columns),
        ('numeric_transformers', numeric_transformer, numeric_columns),
        ('categorical_transformers', categorical_transformer, categorical_columns),
    ])

## Model Building

In [40]:
model = RandomForestClassifier(n_estimators = 100)
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', model)])

#print(pipeline)


In [41]:
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('preprocessor',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('feature_engineering',
                                                  ColumnTransformer(n_jobs=None,
                                                                    remainder='drop',
                                                                    sparse_threshold=0.3,
                                                                    transformer_weights=None,
                                                                    transformers=[('total_income',
                                                                                   FunctionTransformer(accept_sparse=False,
                                                                                                       check_inver

In [42]:
predictions = pipeline.predict(X_val)
print('Accuracy: ', accuracy_score(y_val, predictions))
print('AUC: ', roc_auc_score(y_val, predictions))

Accuracy:  0.7317073170731707
AUC:  0.6458204334365325


## Range of Models

In [43]:
names = [
    "Nearest Neighbors",
    "Linear SVM",
    "RBF SVM",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "AdaBoost",
    "Naive Bayes",
    "QDA",
    "XGBoost Classifier",

]

classifiers = [
    KNeighborsClassifier(3),
    SVC(kernel="linear", C=0.025),
    SVC(gamma=2, C=1),
    GaussianProcessClassifier(1.0 * RBF(1.0)),
    DecisionTreeClassifier(max_depth=5),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1),
    MLPClassifier(alpha=1, max_iter=1000),
    AdaBoostClassifier(),
    GaussianNB(),
    QuadraticDiscriminantAnalysis(),
    #XGBClassifier(random_state=0, eval_metric='mlogloss', tree_method='gpu_hist'),
]

for cnt in range(len(classifiers)):

    pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('cls',classifiers[cnt])
           ])
    model = pipeline.fit(X_train, y_train)
    print (cnt, names[cnt])
    predictions = pipeline.predict(X_val)
    print('Accuracy: ', accuracy_score(y_val, predictions))
    print('AUC: ', roc_auc_score(y_val, predictions))

0 Nearest Neighbors
Accuracy:  0.6097560975609756
AUC:  0.5139318885448916
1 Linear SVM
Accuracy:  0.7642276422764228
AUC:  0.676625386996904
2 RBF SVM
Accuracy:  0.6991869918699187
AUC:  0.513157894736842
3 Gaussian Process
Accuracy:  0.6910569105691057
AUC:  0.5
4 Decision Tree
Accuracy:  0.7398373983739838
AUC:  0.651702786377709
5 Random Forest
Accuracy:  0.7154471544715447
AUC:  0.5540247678018576
6 Neural Net
Accuracy:  0.6991869918699187
AUC:  0.5713622291021672
7 AdaBoost
Accuracy:  0.7804878048780488
AUC:  0.6956656346749226
8 Naive Bayes
Accuracy:  0.7317073170731707
AUC:  0.6458204334365325
9 QDA
Accuracy:  0.3089430894308943
AUC:  0.5




In [44]:
#Selecting the best model from above

pipeline = Pipeline(steps = [
               ('preprocessor', preprocessor)
              ,('cls',classifiers[7])
           ])
final_model = pipeline.fit(X, y)

In [45]:
from sklearn.inspection import permutation_importance
r = permutation_importance(final_model, X, y,
                            n_repeats=30,
                            random_state=0)

for i in r.importances_mean.argsort()[::-1]:
     if r.importances_mean[i] - 2 * r.importances_std[i] > 0:
         print(f"{X.columns[i]:<8}"
               f"{r.importances_mean[i]:.3f}"
               f" +/- {r.importances_std[i]:.3f}")


Credit_History0.142 +/- 0.011
ApplicantIncome0.046 +/- 0.008
CoapplicantIncome0.045 +/- 0.008
LoanAmount0.027 +/- 0.007
Loan_Amount_Term0.015 +/- 0.004


## Prediction on Test Set

In [46]:
# Import the test data

test_data = pd.read_csv('test_data.csv')
test_data.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [47]:
test_data_to_pass = test_data.copy()
test_data_to_pass.drop("Loan_ID", axis = 1)
test_data_to_pass.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
0,LP001015,Male,Yes,0,Graduate,No,5720,0,110.0,360.0,1.0,Urban
1,LP001022,Male,Yes,1,Graduate,No,3076,1500,126.0,360.0,1.0,Urban
2,LP001031,Male,Yes,2,Graduate,No,5000,1800,208.0,360.0,1.0,Urban
3,LP001035,Male,Yes,2,Graduate,No,2340,2546,100.0,360.0,,Urban
4,LP001051,Male,No,0,Not Graduate,No,3276,0,78.0,360.0,1.0,Urban


In [48]:
test_predictions = pipeline.predict(test_data_to_pass)




In [49]:
test_data["Loan_Status"] = pd.DataFrame(test_predictions)

In [50]:
test_data["Loan_Status"] = np.where(test_data["Loan_Status"]==1,"Y","N")
results=test_data[["Loan_ID","Loan_Status"]]
results.to_csv("results.csv", index=False)
