In [576]:
import pandas as pd


In [577]:
import matplotlib.pyplot as plt
import seaborn as sns

In [578]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import confusion_matrix,accuracy_score,precision_score,recall_score,f1_score
from sklearn.preprocessing import PowerTransformer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import TargetEncoder



In [579]:
# Load the raw loan-approval data.
# The path is written as a raw string so the Windows backslashes stay literal:
# in a normal string "\P", "\F" and "\l" are invalid escape sequences, which
# Python warns about and which would silently change meaning if a folder name
# ever started with e.g. "n" or "t".
df = pd.read_csv(r"C:\PrimeBatch\FeatureEnginnering\loan_approval_raw_data.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 20 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Applicant_ID        950 non-null    float64
 1   Applicant_Income    950 non-null    float64
 2   Coapplicant_Income  950 non-null    float64
 3   Employment_Status   950 non-null    object 
 4   Age                 950 non-null    float64
 5   Marital_Status      950 non-null    object 
 6   Dependents          950 non-null    float64
 7   Credit_Score        950 non-null    float64
 8   Existing_Loans      950 non-null    float64
 9   DTI_Ratio           950 non-null    float64
 10  Savings             950 non-null    float64
 11  Collateral_Value    950 non-null    float64
 12  Loan_Amount         950 non-null    float64
 13  Loan_Term           950 non-null    float64
 14  Loan_Purpose        950 non-null    object 
 15  Property_Area       950 non-null    object 
 16  Educati

In [580]:
# Rows without a label cannot be used for supervised training, so drop them first.
df = df.dropna(subset=["Loan_Approved"])

# Separate the target column from the feature columns.
y = df["Loan_Approved"]
X = df.drop(columns=["Loan_Approved"])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [581]:
# Columns treated as ordered categories; every other text column is nominal.
ordinal_col = ["Property_Area", "Education_Level"]

# All text-valued (object dtype) feature columns in the training frame.
cat_col = list(X_train.select_dtypes(include=["object"]).columns)
nominal_col = [name for name in cat_col if name not in ordinal_col]
ordinal_col

['Property_Area', 'Education_Level']

In [582]:
# Names of every numeric feature column in the training frame.
# NOTE(review): the df.info() listing shows Applicant_ID as float64, so it ends
# up in this list as well — confirm that an identifier should be a model feature.
num_col = list(X_train.select_dtypes(include="number").columns)

In [583]:
# Nominal columns with at most `threshold` distinct values go to the
# low-cardinality list; the rest go to the high-cardinality list.
threshold = 10

# Distinct (non-null) value count per nominal column, computed once.
nominal_cardinality = X_train[nominal_col].nunique()

low_card_col = [name for name in nominal_col if nominal_cardinality[name] <= threshold]
high_card_col = [name for name in nominal_col if nominal_cardinality[name] > threshold]


In [584]:
# Numeric branch: fill missing values with the column mean, then standardise.
# NOTE(review): the scaling step is registered under the name 'model'.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="mean")),
        ('model', StandardScaler()),
    ]
)

# Frequency of each Property_Area category in the training split.
X_train["Property_Area"].value_counts()

Property_Area
Urban        357
Rural        220
Semiurban    148
Name: count, dtype: int64

In [585]:
# Explicit low-to-high ordering for each ordinal column.
property_order = ["Rural", "Semiurban", "Urban"]
education_order = ["Not Graduate", "Graduate"]

# Ordinal branch: fill missing values with the most frequent category, then map
# categories to integers. `categories` is positional, one list per input column,
# so it is given in the order Property_Area, Education_Level.
ordinal_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('ordinal', OrdinalEncoder(categories=[property_order, education_order])),
    ]
)


In [586]:
# Low-cardinality nominal branch: fill missing values with the most frequent
# category, then one-hot encode. Categories unseen during fit are ignored
# (encoded as all zeros) instead of raising at predict time.
low_card_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# High-cardinality nominal branch: fill missing values with the most frequent
# category, then target-encode. TargetEncoder shuffles the rows for its internal
# cross-fitting during fit_transform, so the seed is pinned to keep the encoded
# values (and every downstream metric) reproducible across runs.
high_card_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy="most_frequent")),
        ('target', TargetEncoder(random_state=42)),
    ]
)


In [587]:
# Route each group of columns to its own preprocessing branch.
# Columns not listed in any branch are dropped (ColumnTransformer's default).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_col),
        ('ordinal', ordinal_pipeline, ordinal_col),
        ('low', low_card_pipeline, low_card_col),
        ('high', high_card_pipeline, high_card_col),
    ]
)

In [588]:
# End-to-end model: shared preprocessing followed by logistic regression.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', LogisticRegression(max_iter=1000)),
    ]
)

In [589]:
# Fit on the training split, then predict labels for the held-out split.
# Pipeline.fit returns the fitted pipeline, so the two calls can be chained.
y_pred = pipe.fit(X_train, y_train).predict(X_test)

In [590]:
# Missing-value count per feature column in the training split.
X_train.isna().sum()

Applicant_ID          40
Applicant_Income      40
Coapplicant_Income    37
Employment_Status     32
Age                   42
Marital_Status        42
Dependents            41
Credit_Score          39
Existing_Loans        38
DTI_Ratio             37
Savings               41
Collateral_Value      36
Loan_Amount           39
Loan_Term             36
Loan_Purpose          38
Property_Area         35
Education_Level       36
Gender                35
Employer_Category     33
dtype: int64

In [591]:
# Evaluation of the logistic-regression predictions on the held-out split.
print("confusion_matrix :", confusion_matrix(y_test, y_pred))
print("accuracy_score :", accuracy_score(y_test, y_pred) * 100, "%")

# Precision, recall and F1 are all reported for the 'Yes' class, as percentages.
for label, scorer in (
    ("precision_score :", precision_score),
    ("recall_score :", recall_score),
    ("f1_score :", f1_score),
):
    print(label, scorer(y_test, y_pred, pos_label='Yes') * 100, "%")

confusion_matrix : [[121   8]
 [ 14  47]]
accuracy_score : 88.42105263157895 %
precision_score : 85.45454545454545 %
recall_score : 77.04918032786885 %
f1_score : 81.03448275862068 %


In [592]:
# Same preprocessing, with Gaussian Naive Bayes as the classifier.
# NOTE: this rebinds `pipe`, replacing the previously fitted pipeline.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', GaussianNB()),
    ]
)
gn_y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Held-out evaluation; class-specific scores are reported for the 'Yes' class.
print("confusion_matrix :", confusion_matrix(y_test, gn_y_pred))
print("accuracy_score :", accuracy_score(y_test, gn_y_pred) * 100, "%")
for label, scorer in (
    ("precision_score :", precision_score),
    ("recall_score :", recall_score),
    ("f1_score :", f1_score),
):
    print(label, scorer(y_test, gn_y_pred, pos_label='Yes') * 100, "%")

confusion_matrix : [[125   4]
 [ 19  42]]
accuracy_score : 87.89473684210526 %
precision_score : 91.30434782608695 %
recall_score : 68.85245901639344 %
f1_score : 78.50467289719626 %


In [593]:
# Same preprocessing, with a 7-nearest-neighbours classifier.
# NOTE: this rebinds `pipe`, replacing the previously fitted pipeline.
pipe = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', KNeighborsClassifier(n_neighbors=7)),
    ]
)
kn_y_pred = pipe.fit(X_train, y_train).predict(X_test)

# Held-out evaluation; class-specific scores are reported for the 'Yes' class.
print("confusion_matrix :", confusion_matrix(y_test, kn_y_pred))
print("accuracy_score :", accuracy_score(y_test, kn_y_pred) * 100, "%")
for label, scorer in (
    ("precision_score :", precision_score),
    ("recall_score :", recall_score),
    ("f1_score :", f1_score),
):
    print(label, scorer(y_test, kn_y_pred, pos_label='Yes') * 100, "%")

confusion_matrix : [[122   7]
 [ 27  34]]
accuracy_score : 82.10526315789474 %
precision_score : 82.92682926829268 %
recall_score : 55.73770491803278 %
f1_score : 66.66666666666666 %
