## Model Building and saving the model using Pickle

1. Import required libraries and read the dataset

In [5]:
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

In [6]:
# Load the loan-approval dataset from the working directory into a DataFrame
# and preview the first rows to get a feel for the columns.
df = pd.read_csv("loan_approval_data.csv")
df.head()

Unnamed: 0,loan_id,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area,loan_status
0,lp001002,male,no,0.0,graduate,no,5849,0.0,,360.0,1.0,urban,y
1,lp001003,male,yes,1.0,graduate,no,4583,1508.0,128.0,360.0,1.0,rural,n
2,lp001005,male,yes,0.0,graduate,yes,3000,0.0,66.0,360.0,1.0,urban,y
3,lp001006,male,yes,0.0,not graduate,no,2583,2358.0,120.0,360.0,1.0,urban,y
4,lp001008,male,no,0.0,graduate,no,6000,0.0,141.0,360.0,1.0,urban,y


2. Check the first few samples, shape, info of the data and try to familiarize
   yourself with different features.

In [7]:
# (number of rows, number of columns) of the loaded frame
df.shape

(614, 13)

In [8]:
# Per-column dtype and non-null count; a non-null count below the number of
# entries means that column has missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   loan_id            614 non-null    object 
 1   gender             601 non-null    object 
 2   married            611 non-null    object 
 3   dependents         599 non-null    float64
 4   education          614 non-null    object 
 5   self_employed      582 non-null    object 
 6   applicantincome    614 non-null    int64  
 7   coapplicantincome  614 non-null    float64
 8   loanamount         592 non-null    float64
 9   loan_amount_term   600 non-null    float64
 10  credit_history     564 non-null    float64
 11  property_area      614 non-null    object 
 12  loan_status        614 non-null    object 
dtypes: float64(5), int64(1), object(7)
memory usage: 62.5+ KB


3. Check for missing values in the dataset; if any are present, handle them with
   appropriate methods, and drop redundant features.

In [9]:
## drop unnecessary columns
# loan_id is dropped as unnecessary for modelling.
# Rebinding df (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run is a no-op rather than a KeyError.
df = df.drop(columns=['loan_id'], errors='ignore')

In [10]:
# Preview the frame again to confirm loan_id is gone.
df.head()

Unnamed: 0,gender,married,dependents,education,self_employed,applicantincome,coapplicantincome,loanamount,loan_amount_term,credit_history,property_area,loan_status
0,male,no,0.0,graduate,no,5849,0.0,,360.0,1.0,urban,y
1,male,yes,1.0,graduate,no,4583,1508.0,128.0,360.0,1.0,rural,n
2,male,yes,0.0,graduate,yes,3000,0.0,66.0,360.0,1.0,urban,y
3,male,yes,0.0,not graduate,no,2583,2358.0,120.0,360.0,1.0,urban,y
4,male,no,0.0,graduate,no,6000,0.0,141.0,360.0,1.0,urban,y


In [11]:
# Number of missing values in each column.
df.isna().sum()

gender               13
married               3
dependents           15
education             0
self_employed        32
applicantincome       0
coapplicantincome     0
loanamount           22
loan_amount_term     14
credit_history       50
property_area         0
loan_status           0
dtype: int64

In [None]:
# Distinct gender values (including NaN) before imputation.
df['gender'].unique()

In [None]:
# Fill missing gender values with the most frequent category.
# The result is assigned back to the column: calling fillna(inplace=True) on
# a selected column is a chained in-place operation that is deprecated and
# does not update df under pandas Copy-on-Write.
df['gender'] = df['gender'].fillna(df['gender'].mode()[0])

In [None]:
# Distinct gender values after imputation.
df['gender'].unique()

In [None]:
# Distinct values of the married column (including NaN).
df['married'].unique()

In [None]:
# Fill missing marital status with the most frequent category.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which is deprecated and ineffective under Copy-on-Write.
df['married'] = df['married'].fillna(df['married'].mode()[0])

In [None]:
# Distinct values of the dependents column (including NaN).
df['dependents'].unique()

In [None]:
# Fill missing dependents counts with the most frequent value.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which is deprecated and ineffective under Copy-on-Write.
df['dependents'] = df['dependents'].fillna(df['dependents'].mode()[0])

In [None]:
# Distinct values of the education column.
df['education'].unique()

In [None]:
# education shows no missing values in df.info() (614 non-null), so this is
# a safeguard that fills any gap with the most frequent category.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which is deprecated and ineffective under Copy-on-Write.
df['education'] = df['education'].fillna(df['education'].mode()[0])

In [None]:
# Distinct values of the self_employed column (including NaN).
df['self_employed'].unique()

In [None]:
# Fill missing self_employed flags with the most frequent category.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which is deprecated and ineffective under Copy-on-Write.
df['self_employed'] = df['self_employed'].fillna(df['self_employed'].mode()[0])

In [None]:
# Distinct loan amounts (including NaN); this is a continuous column, so the
# mean is used for imputation below instead of the mode.
df['loanamount'].unique()

In [None]:
# Fill missing loan amounts with the column mean.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which is deprecated and ineffective under Copy-on-Write.
df['loanamount'] = df['loanamount'].fillna(df['loanamount'].mean())

In [None]:
# Distinct loan terms (including NaN).
df['loan_amount_term'].unique()

In [None]:
# Fill missing loan terms with the most frequent term.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which is deprecated and ineffective under Copy-on-Write.
df['loan_amount_term'] = df['loan_amount_term'].fillna(df['loan_amount_term'].mode()[0])

In [None]:
# Distinct credit_history values (including NaN).
df['credit_history'].unique()

In [None]:
# Fill missing credit_history flags with the most frequent value.
# Assign the result back rather than using fillna(inplace=True) on the
# selected column, which is deprecated and ineffective under Copy-on-Write.
df['credit_history'] = df['credit_history'].fillna(df['credit_history'].mode()[0])

In [None]:
# Re-count missing values per column to verify the imputation.
df.isna().sum()

In [None]:
# Preview the frame after the fillna steps above.
df.head()

4. Visualize the distribution of the target column 'loan_status' with respect to
   various categorical features and write your observations.

In [None]:
# Encode the target as integers: 'y' -> 1, 'n' -> 0.
# Only the target column is rewritten and the dtype is cast explicitly, so
# the result is an integer column regardless of whether pandas downcasts
# object columns after replace() (that implicit downcasting is deprecated).
# Re-running the cell is harmless: already-encoded values pass through.
df['loan_status'] = df['loan_status'].replace({'y': 1, 'n': 0}).astype(int)

In [None]:
# Preview the frame to confirm loan_status now holds 1/0.
df.head()

- Visualize

In [None]:
# gender and loan_status
# Count of applications per gender, split by loan_status.
# seaborn is already imported as sns in the first cell, so it is not re-imported.
ax = sns.countplot(x='gender', hue='loan_status', data=df)
ax.set_title('Loan status by gender');  # trailing ';' hides the text repr

- We observe from the graph that male applicants take more loans.

In [None]:
# married and loan_status
# Count of applications per marital status, split by loan_status.
# seaborn is already imported as sns in the first cell, so it is not re-imported.
ax = sns.countplot(x='married', hue='loan_status', data=df)
ax.set_title('Loan status by marital status');  # trailing ';' hides the text repr

- We observe from the graph that married applicants take more loans.

In [None]:
# education and loan_status
# Count of applications per education level, split by loan_status.
# seaborn is already imported as sns in the first cell, so it is not re-imported.
ax = sns.countplot(x='education', hue='loan_status', data=df)
ax.set_title('Loan status by education');  # trailing ';' hides the text repr

- We observe from the graph that graduates take more loans.

In [None]:
# self_employed and loan_status
# Count of applications per self-employment flag, split by loan_status.
# seaborn is already imported as sns in the first cell, so it is not re-imported.
ax = sns.countplot(x='self_employed', hue='loan_status', data=df)
ax.set_title('Loan status by self-employment');  # trailing ';' hides the text repr

- We observe from the graph that self-employed people take fewer loans.

In [None]:
# credit_history and loan_status
# Count of applications per credit_history value, split by loan_status.
# seaborn is already imported as sns in the first cell, so it is not re-imported.
ax = sns.countplot(x='credit_history', hue='loan_status', data=df)
ax.set_title('Loan status by credit history');  # trailing ';' hides the text repr

- From the graph we observe the credit history of the people who take loans.

In [None]:
# property_area and loan_status
# Count of applications per property area, split by loan_status.
# seaborn is already imported as sns in the first cell, so it is not re-imported.
ax = sns.countplot(x='property_area', hue='loan_status', data=df)
ax.set_title('Loan status by property area');  # trailing ';' hides the text repr

- We observe from the graph that people in semiurban areas take the most loans.

5. Encode the categorical data.

In [None]:
from sklearn.preprocessing import LabelEncoder

# Categorical features to convert to integer codes.
column=['gender','married','education','self_employed','credit_history','property_area']

# Fit a separate encoder for every column and keep it in `encoders`.
# Refitting one shared LabelEncoder would overwrite its classes_ on each
# iteration, so only the last column's mapping would survive; the stored
# encoders let new data be encoded (or predictions decoded) consistently.
encoders = {}
for col in column:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [None]:
# Preview the frame after label encoding the categorical columns.
df.head()

6. Separate the target and independent features and split the data into train
   and test.

In [None]:
## separate target and independent variables
X = df.drop('loan_status',axis=1)
y = df['loan_status']

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

# Report train/test shapes, features first and then target.
for train_part, test_part in ((X_train, X_test), (y_train, y_test)):
    print(train_part.shape, test_part.shape)

7. Build any classification model to predict the loan status of the customer
   and save your model using pickle.

In [None]:
from sklearn.ensemble import RandomForestClassifier

# loan_status is a binary label (encoded as 1/0 earlier in the notebook), so
# this is a classification task: a classifier predicts the class itself,
# whereas a regressor would return fractional scores between 0 and 1.
#
# The train/test split from step 6 (X_train, X_test, y_train, y_test) is used
# as-is so the notebook works with a single, consistent split.

# Make sure the labels are plain integers before fitting.
y_train_labels = y_train.astype(int)
y_test_labels = y_test.astype(int)

# Create a Random Forest classifier; the fixed seed makes results reproducible.
rf_model = RandomForestClassifier(random_state=42)

# Fit the model on the training data
rf_model.fit(X_train, y_train_labels)

# Make predictions on the training and testing data
train_predictions = rf_model.predict(X_train)
test_predictions = rf_model.predict(X_test)

# Mean accuracy on both sets; a large gap between them indicates overfitting.
print(f"Train accuracy: {rf_model.score(X_train, y_train_labels):.3f}")
print(f"Test accuracy:  {rf_model.score(X_test, y_test_labels):.3f}")

In [None]:
# Save Model using pickle.
import pickle

# Serialize the fitted model to disk so it can be reloaded for prediction
# without retraining. Only unpickle files from a trusted source.
MODEL_PATH = "loan_approval_model.pkl"
with open(MODEL_PATH, "wb") as model_file:
    pickle.dump(rf_model, model_file)

# Uploaded separately.