In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score , GridSearchCV
from sklearn.preprocessing import OneHotEncoder,OrdinalEncoder,LabelEncoder,MinMaxScaler
from sklearn.impute import SimpleImputer,KNNImputer
from sklearn.pipeline import Pipeline,make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import FunctionTransformer,RobustScaler,MinMaxScaler


In [2]:
# Read the raw loan records from the CSV file in the working directory.
data = pd.read_csv('loan_dataset.csv')

In [3]:
# Dropping the unrelated identifier column.
# `columns=` names the axis explicitly (the original passed the boolean True as
# the axis number, which only works because True == 1), and rebinding the name
# replaces the in-place mutation of the frame that was just loaded.
data = data.drop(columns="loan_id")

In [4]:
# Work on an independent copy so later edits to `df` do not touch `data`.
df = data.copy()
# Preview the first five rows.
df.head()

Unnamed: 0,no_of_dependents,education,self_employed,income_annum,loan_amount,loan_term,cibil_score,residential_assets_value,commercial_assets_value,luxury_assets_value,bank_asset_value,loan_status
0,2,,No,9600000,29900000,12,778,2400000.0,17600000.0,,8000000.0,Approved
1,0,Not Graduate,Yes,4100000,12200000,8,417,,2200000.0,8800000.0,3300000.0,Rejected
2,3,Graduate,No,9100000,29700000,20,506,7100000.0,,33300000.0,12800000.0,Rejected
3,3,,No,8200000,30700000,8,467,18200000.0,3300000.0,23300000.0,7900000.0,Rejected
4,5,Not Graduate,Yes,9800000,24200000,20,382,12400000.0,8200000.0,29400000.0,5000000.0,Rejected


In [5]:
# Print each column's dtype and non-null count to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4269 entries, 0 to 4268
Data columns (total 12 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   no_of_dependents          4269 non-null   int64  
 1   education                 3629 non-null   object 
 2   self_employed             3928 non-null   object 
 3   income_annum              4269 non-null   int64  
 4   loan_amount               4269 non-null   int64  
 5   loan_term                 4269 non-null   int64  
 6   cibil_score               4269 non-null   int64  
 7   residential_assets_value  3885 non-null   float64
 8   commercial_assets_value   4141 non-null   float64
 9   luxury_assets_value       4013 non-null   float64
 10  bank_asset_value          3843 non-null   float64
 11  loan_status               4269 non-null   object 
dtypes: float64(4), int64(5), object(3)
memory usage: 400.3+ KB


In [6]:
# Partition the column labels of `df` by dtype: object-dtype columns are
# treated as categorical, every other column as numerical.
categorical = [name for name in df if df[name].dtype == 'O']
numerical = [name for name in df if not (df[name].dtype == 'O')]

In [7]:
# Strip leading and trailing whitespace from the *values* of every column
# listed in `categorical` (the column names themselves are left as they are).
df[categorical] = df[categorical].apply(lambda column: column.str.strip())

In [8]:
# Separate the predictors from the target column.
# `columns=` states the axis explicitly; the original passed the boolean True
# as the axis number, which only works because True == 1.
X = df.drop(columns='loan_status')
y = df['loan_status']

In [9]:
# Keep 70% of the rows for training and hold out the rest for testing.
# A fixed `random_state` makes the split, and everything fitted on it,
# reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.7, random_state=42
)

In [10]:
# Sanity check: training features and labels must have the same number of rows.
X_train.shape , y_train.shape

((2988, 11), (2988,))

In [11]:
def randomvalueimpuation_cat(x, random_state=None):
    """Fill missing cells of each column with values sampled from that column.

    For every column of ``x``, each missing cell is replaced by a value drawn
    uniformly at random, with replacement, from the column's own non-missing
    values.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame whose columns should be imputed. It is not modified.
    random_state : int, numpy.random.Generator or None, default None
        Seed or generator used for the draws. With ``None`` the global NumPy
        random state is used, exactly as before, so ``np.random.seed`` still
        controls the result.

    Returns
    -------
    pandas.DataFrame
        A copy of ``x`` with missing cells filled. A column that has no
        observed value at all is returned unchanged (still entirely missing),
        because there is nothing to sample from.
    """
    x = x.copy()
    rng = np.random if random_state is None else np.random.default_rng(random_state)
    for col in x.columns:
        missing = x[col].isna()
        n_missing = int(missing.sum())
        if n_missing == 0:
            continue
        observed = x[col].dropna().to_numpy()
        if observed.size == 0:
            # Sampling from an empty pool raises ValueError; leave the column as is.
            continue
        # One vectorised draw per column instead of one call per missing cell.
        x.loc[missing, col] = rng.choice(observed, size=n_missing)
    return x

In [12]:
# Wrap the random-sampling imputer so it can be used as a scikit-learn
# transformer step.
# NOTE(review): no pipeline in the visible cells references `random_imputer` -
# confirm whether it was meant to be part of the categorical pipeline.
random_imputer = FunctionTransformer(func=randomvalueimpuation_cat)

In [13]:
# Pipeline applied to the categorical feature columns: one-hot encode them,
# dropping the first category of each feature.
# NOTE(review): this pipeline has no imputation step, so missing values in the
# selected columns reach the encoder as they are. `random_imputer` is defined
# earlier but not used here - confirm whether that is intended.
selfed_pipe = Pipeline(
    steps=[
        ('encode', OneHotEncoder(drop='first', handle_unknown='ignore')),
    ]
)

# <center>Encoding training data</center>

In [14]:
# First stage, applied to the raw feature frame (columns addressed by position):
#   1, 2  -> education, self_employed : sent through `selfed_pipe`
#   7-10  -> residential / commercial / luxury / bank asset values :
#            missing values filled by KNN imputation with 5 neighbours
# All remaining columns are passed through unchanged.
tf1 = ColumnTransformer(
    transformers=[
        ('impute_selfed', selfed_pipe, [1, 2]),
        ('knn', KNNImputer(n_neighbors=5), [7, 8, 9, 10]),
    ],
    remainder='passthrough',
)

In [15]:
# Second stage, applied to the 13-column array produced by `tf1`:
#   0-3  : one-hot indicator columns from the categorical encoder
#   4-7  : KNN-imputed asset-value columns
#   8-12 : numeric columns passed through by `tf1`
# Only the numeric block (4-12) is scaled. The original list started at 3, so
# the scaler also ran over the last one-hot indicator column while the other
# three indicators were left untouched.
# NOTE: these positions depend on how many indicator columns the encoder emits;
# re-check them whenever the categorical handling changes.
tf2 = ColumnTransformer([
    ('scaler', RobustScaler(), list(range(4, 13)))
], remainder='passthrough')

In [16]:
# Chain the two column transformers: `tf1` runs on the raw features and its
# output array is fed to `tf2`.
pipe = Pipeline(
    steps=[
        ('tf1', tf1),
        ('tf2', tf2),
    ]
)

In [17]:
# Fit every preprocessing step on the training split only, then transform it.
X_train_trans = pipe.fit_transform(X_train, y=y_train)

In [18]:
# Wrap the transformed training matrix in a DataFrame for easier inspection.
# No column names or index are passed, so pandas assigns default labels.
X_train_trans_df = pd.DataFrame(data=X_train_trans)

In [19]:
# Shape of the transformed training matrix (rows, output columns).
X_train_trans.shape

(2988, 13)

In [20]:
# Shape of the untransformed training features, for comparison with the
# transformed matrix.
X_train.shape

(2988, 11)