In [35]:
import numpy as np
import pandas as pd

In [36]:
df = pd.read_csv('/content/Loan_Train.csv')

In [37]:
df.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [38]:
#look at the shape 
df.shape

(614, 13)

# 1.__Preprocessing__

In [39]:
#counting the nan values in the dataset
df.isna().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

## 1.1 __Dealing with missing values__

In [40]:
#dealing with missing values 

#replacing the categorical values with the mean
for column in ['Gender', 'Married', 'Self_Employed']:
    df[column].fillna(df[column].mode()[0], inplace=True)
    
#replacing the numerical values with the mode
for column in ['Dependents', 'Loan_Amount_Term', 'LoanAmount','Credit_History']:
    df[column].fillna(df[column].mode()[0], inplace=True)
    
#check for missing values 
df.isna().sum()

Loan_ID              0
Gender               0
Married              0
Dependents           0
Education            0
Self_Employed        0
ApplicantIncome      0
CoapplicantIncome    0
LoanAmount           0
Loan_Amount_Term     0
Credit_History       0
Property_Area        0
Loan_Status          0
dtype: int64

In [41]:
df.drop('Loan_ID', axis=1, inplace=True)
df.drop('Gender',axis = 1, inplace = True)

In [42]:
df.head(5)

Unnamed: 0,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,No,0,Graduate,No,5849,0.0,120.0,360.0,1.0,Urban,Y
1,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


## 1.2 __Encoding Dummy Variable__

In [43]:
df = pd.get_dummies(df,drop_first=True)
df.head()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Married_Yes,Dependents_1,Dependents_2,Dependents_3+,Education_Not Graduate,Self_Employed_Yes,Property_Area_Semiurban,Property_Area_Urban,Loan_Status_Y
0,5849,0.0,120.0,360.0,1.0,0,0,0,0,0,0,0,1,1
1,4583,1508.0,128.0,360.0,1.0,1,1,0,0,0,0,0,0,0
2,3000,0.0,66.0,360.0,1.0,1,0,0,0,0,1,0,1,1
3,2583,2358.0,120.0,360.0,1.0,1,0,0,0,1,0,0,1,1
4,6000,0.0,141.0,360.0,1.0,0,0,0,0,0,0,0,1,1


## __1.3 Seperate features and labels__

In [45]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import scale 

In [46]:
#seperate the features and labesl
y = df['Loan_Status_Y']
X = df.drop('Loan_Status_Y',axis = 1)

#Scale the values 
X_scale = scale(X)

#split 
X_train, X_test , y_train, y_test = train_test_split(X_scale, y,test_size=0.33, random_state=42)

# 2.__Implementing the model__

## 2.1 __Decision Tree Classifier__

In [49]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score,f1_score


#define the object
tree_clf = DecisionTreeClassifier()

#fit the model
tree_clf.fit(X_train,y_train)

#make prediction
y_pred = tree_clf.predict(X_test)
print("Test Data Set Accuracy: ", accuracy_score(y_test,y_pred))
print("Training Data F1 Score ", f1_score(y_test,y_pred))

print("Validation Mean F1 Score: ",cross_val_score(tree_clf,X_test,y_pred,cv=5,scoring='f1_macro').mean())
print("Validation Mean Accuracy: ",cross_val_score(tree_clf,X_test,y_pred,cv=5,scoring='accuracy').mean())

Test Data Set Accuracy:  0.6798029556650246
Training Data F1 Score  0.7601476014760148
Validation Mean F1 Score:  0.637893255398793
Validation Mean Accuracy:  0.7098780487804878


## 2.2 __RandomForest Classifier__

In [50]:
from sklearn.ensemble import RandomForestClassifier


rf_clf = RandomForestClassifier(n_estimators=100,max_depth=3,min_samples_leaf = 10)
rf_clf.fit(X_train,y_train)
y_pred = rf_clf.predict(X_test)
print("Test F1 Score ", f1_score(y_test,y_pred))
print("Test Accuracy ", accuracy_score(y_test,y_pred))

print("Validation Mean F1 Score: ",cross_val_score(tree_clf,X_test,y_pred,cv=5,scoring='f1_macro').mean())
print("Validation Mean Accuracy: ",cross_val_score(tree_clf,X_test,y_pred,cv=5,scoring='accuracy').mean())

Train F1 Score  0.8571428571428571
Train Accuracy  0.7881773399014779
Validation Mean F1 Score:  0.9538326004516791
Validation Mean Accuracy:  0.9852439024390243


## 2.3 __Logistic Regression__

In [53]:
###
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_predict

log_ = LogisticRegression()
log_.fit(X_train,y_train)
y_pred = log_.predict(X_test)

print("Test F1 Score ", f1_score(y_test,y_pred))
print("Test Accuracy ", accuracy_score(y_test,y_pred))

print("Validation Mean F1 Score: ",cross_val_score(tree_clf,X_test,y_pred,cv=5,scoring='f1_macro').mean())
print("Validation Mean Accuracy: ",cross_val_score(tree_clf,X_test,y_pred,cv=5,scoring='accuracy').mean())


Test F1 Score  0.8628762541806019
Test Accuracy  0.7980295566502463
Validation Mean F1 Score:  1.0
Validation Mean Accuracy:  1.0


have to specify that these are just initial models and on the final run I'll have to be more specific on the parameters that i chose.