In [46]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score , GridSearchCV
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer,RobustScaler,MinMaxScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier

In [47]:
data=pd.read_csv('loan_dataset.csv')

In [48]:
# droping unrelated columns

data.drop("loan_id",axis=True,inplace=True)

In [49]:
df=data.copy()
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,,No,9600000,29900000,12,778,2400000.0,17600000.0,,8000000.0,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,,2200000.0,8800000.0,3300000.0,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000.0,,33300000.0,12800000.0,Rejected
3,3,,No,8200000,30700000,8,467,18200000.0,3300000.0,23300000.0,7900000.0,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000.0,8200000.0,29400000.0,5000000.0,Rejected


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   no_of_dependents          4269 non-null   int64  
 1   education                 3629 non-null   object 
 2   self_employed             3928 non-null   object 
 3   income_annum              4269 non-null   int64  
 4   loan_amount               4269 non-null   int64  
 5   loan_term                 4269 non-null   int64  
 6   cibil_score               4269 non-null   int64  
 7   residential_assets_value  3885 non-null   float64
 8   commercial_assets_value   4141 non-null   float64
 9   luxury_assets_value       4013 non-null   float64
 10  bank_asset_value          3843 non-null   float64
 11  loan_status               4269 non-null   object 
dtypes: float64(4), int64(5), object(3)
memory usage: 400.3+ KB


In [51]:
categorical=[]
numerical=[]

for col in df:
    if df[col].dtype=='O':
        categorical.append(col)
        
    else:
        numerical.append(col)

In [52]:
# removing leading and trailing spaces from cols

df[categorical]=df[categorical].apply(lambda x:x.str.strip())

In [53]:
X=df.drop('loan_status',axis=True)
y=df['loan_status']

In [54]:
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.7)

In [None]:
X_train.shape , y_train.shape

((2988, 11), (2988,))

In [None]:
# Function for categorical missing values

def randomvalueimpuation_cat(x):
    x = x.copy()
    for col in x.columns:
        non_missing = x[col].dropna().values
        x[col] = x[col].apply(lambda val: np.random.choice(non_missing) if pd.isnull(val) else val)
    return x

In [57]:
random_imputer = FunctionTransformer(randomvalueimpuation_cat,validate=False)

# <center>Encoding training data<center>

In [None]:
# For categorical values

cat_pipe = Pipeline([
    ('rvi',random_imputer), # missing values
    ('encode', OneHotEncoder(drop='first', handle_unknown='ignore')) # encoding
])

In [59]:
# For Handling Missing Values

tf1 = ColumnTransformer([
    ('cat_selfed', cat_pipe,[1,2]), # cat columns 
    ('knn', KNNImputer(n_neighbors=5), [7,8,9,10]), # num columns 
], remainder='passthrough')

In [None]:
# For Scaling 

tf2 = ColumnTransformer([
    ('scaler', RobustScaler(), [3,4,5,6,7,8,9,10])
], remainder='passthrough')

In [61]:
# Base models

rf = RandomForestClassifier(n_estimators=100, random_state=42)
gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, use_label_encoder=False, eval_metric="logloss",
                    
                     random_state=42)


In [62]:
# Voting Classifier (hard voting for classification)

tf3 = VotingClassifier(
    estimators=[("rf", rf), ("gb", gb), ("xgb", xgb)],
    voting="soft"   # 'soft' uses predicted probabilities
)

In [63]:
pipe =Pipeline([('missing_vals', tf1),('scaling',tf2),('ensemble',tf3)])

In [64]:
# Fitting the data 

X_train_trans = pipe.fit(X_train,y_train)

In [65]:
X_train_trans

In [66]:

y_pred = pipe.predict(X_test)

In [67]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.985167837626854

Classification Report:
               precision    recall  f1-score   support

    Approved       0.98      0.99      0.99       784
    Rejected       0.99      0.97      0.98       497

    accuracy                           0.99      1281
   macro avg       0.99      0.98      0.98      1281
weighted avg       0.99      0.99      0.99      1281

