In [None]:
import numpy as np
import pandas as pd

In [None]:
# Location of the raw banking dataset (absolute path used by this notebook)
csv_path = '/content/Banking.csv'
df = pd.read_csv(csv_path)

# Preview the first rows of the loaded frame
df.head()

In [None]:
# (number of rows, number of columns) of the loaded frame
df.shape

In [None]:
# Column names, non-null counts and dtypes of the loaded frame
df.info()

From the above information, we notice that **Loan_Amount_Requested** has an incorrect type. I will replace the comma with a dot in that column and convert it to the float64 type.

In [None]:
 # Replace every ',' with '.' in the string values of Loan_Amount_Requested so they can be parsed as floats.
 # NOTE(review): if the commas are thousands separators (e.g. "7,000"), this yields 7.0 instead of 7000,
 # while values without a comma keep their magnitude — confirm against the raw data.
 df['Loan_Amount_Requested']= df['Loan_Amount_Requested'].str.replace(',','.')

In [None]:
# Convert the cleaned strings to 64-bit floats
loan_amount_text = df['Loan_Amount_Requested']
df['Loan_Amount_Requested'] = loan_amount_text.astype('float64')

In [None]:
# value_counts() shows how many rows fall into each distinct value of the
# class variable 'Interest_Rate'
df['Interest_Rate'].value_counts()

### **Preprocessing Data**

Preprocessing data includes handling missing values and outliers, applying feature coding techniques if needed, scale & standardize features.



**Checking for Missing values**

In [None]:
# Flag every missing cell, then count the missing cells per column
missing_mask = df.isna()
missing_mask.sum()

**Handling Outliers**

We check for outliers only in the features that contain numerical values.

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Suppress all warnings for the rest of the session; note this also hides
# deprecation warnings raised by pandas/seaborn in the cells below.
warnings.filterwarnings("ignore")

1. Checking the **'Annual_Income'** feature for outliers

In [None]:
# Box plot of the 'Annual_Income' feature, used to look for outliers
fig, axes = plt.subplots(figsize=(9, 6))

# Shape of the whole frame before any filtering on this feature
print("Annual_Income Shape:",df.shape)
# Upper (75%) and lower (25%) quartiles of the feature; kept for reference,
# they are not used by the plot below
max_val = df['Annual_Income'].quantile(0.75)
min_val = df['Annual_Income'].quantile(0.25)

# Pass the values explicitly as `x` and draw on the axes created above:
# a positional Series is interpreted differently across seaborn versions.
sns.boxplot(x=df['Annual_Income'], ax=axes)
axes.set_title('Annual_Income')
plt.show()

When checking the boxplot for the **Annual_Income** feature, we can see that the values start to break away at around $2 \times 10^6$. Therefore, I will remove the datapoints whose income is $2 \times 10^6$ or more as outliers.

In [None]:
# Keep only the rows whose annual income is strictly below 2 x 1e6.
# Rows where 'Annual_Income' is NaN also fail this comparison and are dropped.
df = df[(df['Annual_Income']< 2e6)]

print("After Annual_Income Shape:",df.shape)

# Explicit `x=` keeps the plot orientation the same across seaborn versions
sns.boxplot(x=df['Annual_Income'])
plt.show()

2. Checking the **'Debt_To_Income'** feature for outliers

In [None]:
# Box plot of the 'Debt_To_Income' feature, used to look for outliers
fig, axes = plt.subplots(figsize=(9, 6))

# Shape of the whole frame at this point of the clean-up
print("Debt_To_Income Shape:",df.shape)
# Upper (75%) and lower (25%) quartiles of the feature; kept for reference,
# they are not used by the plot below
max_val = df['Debt_To_Income'].quantile(0.75)
min_val = df['Debt_To_Income'].quantile(0.25)

# Pass the values explicitly as `x` and draw on the axes created above:
# a positional Series is interpreted differently across seaborn versions.
sns.boxplot(x=df['Debt_To_Income'], ax=axes)
axes.set_title('Debt_To_Income')
plt.show()

There are no outliers in the **Debt_to_income** feature.

3. Checking the **'Inquiries_Last_6Mo'** feature for outliers

In [None]:
# Box plot of the 'Inquiries_Last_6Mo' feature, used to look for outliers
fig, axes = plt.subplots(figsize=(9, 6))

# Shape of the whole frame at this point of the clean-up
print("Inquiries_Last_6Mo Shape:",df.shape)
# Upper (75%) and lower (25%) quartiles of the feature; kept for reference,
# they are not used by the plot below
max_val = df['Inquiries_Last_6Mo'].quantile(0.75)
min_val = df['Inquiries_Last_6Mo'].quantile(0.25)

# Pass the values explicitly as `x` and draw on the axes created above:
# a positional Series is interpreted differently across seaborn versions.
sns.boxplot(x=df['Inquiries_Last_6Mo'], ax=axes)
axes.set_title('Inquiries_Last_6Mo')
plt.show()

When checking the boxplot for the **'Inquiries_Last_6Mo'** feature, we can see that many datapoints lie outside the whiskers, but none of them is a significant outlier. Therefore, I will not remove the datapoints that the plot marks as outliers, since they can carry useful information.

4. Checking the **'Months_Since_Deliquency'** feature for outliers

In [None]:
# Box plot of the 'Months_Since_Deliquency' feature, used to look for outliers
fig, axes = plt.subplots(figsize=(9, 6))

# Shape of the whole frame at this point of the clean-up
print("Months_Since_Deliquency Shape:",df.shape)
# Upper (75%) and lower (25%) quartiles of the feature; kept for reference,
# they are not used by the plot below
max_val = df['Months_Since_Deliquency'].quantile(0.75)
min_val = df['Months_Since_Deliquency'].quantile(0.25)

# Pass the values explicitly as `x` and draw on the axes created above:
# a positional Series is interpreted differently across seaborn versions.
sns.boxplot(x=df['Months_Since_Deliquency'], ax=axes)
axes.set_title('Months_Since_Deliquency')
plt.show()

When checking the boxplot for the **'Months_Since_Deliquency'** feature, we can see that many datapoints lie outside the whiskers, but none of them is a significant outlier. Therefore, I will not remove the datapoints that the plot marks as outliers, since they can carry useful information.

5. Checking the **'Number_Open_Accounts'** feature for outliers  

In [None]:
# Box plot of the 'Number_Open_Accounts' feature, used to look for outliers
fig, axes = plt.subplots(figsize=(9, 6))

# Shape of the whole frame at this point of the clean-up
print("Number_Open_Accounts Shape:",df.shape)
# Upper (75%) and lower (25%) quartiles of the feature; kept for reference,
# they are not used by the plot below
max_val = df['Number_Open_Accounts'].quantile(0.75)
min_val = df['Number_Open_Accounts'].quantile(0.25)

# Pass the values explicitly as `x` and draw on the axes created above:
# a positional Series is interpreted differently across seaborn versions.
sns.boxplot(x=df['Number_Open_Accounts'], ax=axes)
axes.set_title('Number_Open_Accounts')
plt.show()

When checking the above box plot, we can see that the first significant gap occurs at around 58 open accounts. Therefore, I decided to handle the outliers in this feature by removing the datapoints with 58 or more open accounts.

In [None]:
# Keep only the rows with strictly fewer than 58 open accounts.
# Rows where 'Number_Open_Accounts' is NaN also fail this comparison and are dropped.
df = df[(df['Number_Open_Accounts']<58)]

print("After Number_Open_Accounts Shape:",df.shape)

# Explicit `x=` keeps the plot orientation the same across seaborn versions
sns.boxplot(x=df['Number_Open_Accounts'])
plt.show()

6. Checking the **'Total_Accounts'** feature for outliers   

In [None]:
# Box plot of the 'Total_Accounts' feature, used to look for outliers
fig, axes = plt.subplots(figsize=(9, 6))

# Shape of the whole frame at this point of the clean-up
print("Total_Accounts Shape:",df.shape)
# Upper (75%) and lower (25%) quartiles of the feature; kept for reference,
# they are not used by the plot below
max_val = df['Total_Accounts'].quantile(0.75)
min_val = df['Total_Accounts'].quantile(0.25)

# Pass the values explicitly as `x` and draw on the axes created above:
# a positional Series is interpreted differently across seaborn versions.
sns.boxplot(x=df['Total_Accounts'], ax=axes)
axes.set_title('Total_Accounts')
plt.show()

When checking the above box plot, we can see that the first significant gap occurs at around 120 total accounts. Therefore, I decided to handle the outliers in this feature by removing the datapoints with 120 or more total accounts.

In [None]:
# Keep only the rows with strictly fewer than 120 total accounts.
# Rows where 'Total_Accounts' is NaN also fail this comparison and are dropped.
df = df[(df['Total_Accounts']<120)]

print("After Total_Accounts shape:",df.shape)

# Explicit `x=` keeps the plot orientation the same across seaborn versions
sns.boxplot(x=df['Total_Accounts'])
plt.show()

7. Checking the **'Loan_Amount_Requested'** feature for outliers

In [None]:
# Box plot of the 'Loan_Amount_Requested' feature, used to look for outliers
fig, axes = plt.subplots(figsize=(9, 6))

# Shape of the whole frame at this point of the clean-up
print("Loan_Amount_Requested Shape:",df.shape)
# Upper (75%) and lower (25%) quartiles of the feature; kept for reference,
# they are not used by the plot below
max_val = df['Loan_Amount_Requested'].quantile(0.75)
min_val = df['Loan_Amount_Requested'].quantile(0.25)

# Pass the values explicitly as `x` and draw on the axes created above:
# a positional Series is interpreted differently across seaborn versions.
sns.boxplot(x=df['Loan_Amount_Requested'], ax=axes)
axes.set_title('Loan_Amount_Requested')
plt.show()

In [None]:
# Keep only the rows whose requested loan amount is strictly below 50
# (in the units produced by the comma-to-dot conversion earlier in the notebook).
# Rows where 'Loan_Amount_Requested' is NaN also fail this comparison and are dropped.
df = df[(df['Loan_Amount_Requested']< 50)]

print("After Loan_Amount_Requested Shape:",df.shape)

# Explicit `x=` keeps the plot orientation the same across seaborn versions
sns.boxplot(x=df['Loan_Amount_Requested'])
plt.show()

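Now, after careful inspection of all the numerical fields (features) in the dataset, I have removed outliers and the remaining number of datapoints is 139186. Therefore, I have removed $164309 - 139186 = 25123$ datapoints. Note that the comparison filters used above also discard any row in which the filtered feature is missing (NaN), so this count may include rows with missing values and not only outliers.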

**Feature Encoding**

In this process, the categorical data are encoded into numerical data. The LabelEncoder is used to encode the class values to integers accordingly as follows.



In [None]:
# Preview the first rows after the outlier filtering
df.head()

In [None]:
# Missing-value count per column after the outlier filtering
df.isna().sum()

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

# Number of rows per distinct 'Length_Employed' value (NaN is excluded by default)
df['Length_Employed'].value_counts()

In [None]:
# Number of rows per distinct 'Home_Owner' value (NaN is excluded by default)
df['Home_Owner'].value_counts()

In [None]:
# Replace NaN values in 'Months_Since_Deliquency' with the column median
# (bracket indexing is used so the assignment always targets the column)
df['Months_Since_Deliquency'] = df['Months_Since_Deliquency'].fillna(df['Months_Since_Deliquency'].median())

In [None]:
# Fill the gaps in the two categorical columns with fixed labels
df['Length_Employed'] = df['Length_Employed'].fillna('< 1 year')
df['Home_Owner'] = df['Home_Owner'].fillna('Other')

In [None]:
# Missing-value count per column after the imputation above
df.isna().sum()

In [None]:
# Text-valued columns that are turned into integer codes
categorical_features = [
    'Income_Verified',
    'Purpose_Of_Loan',
    'Length_Employed',
    'Home_Owner',
]

# One LabelEncoder instance is refitted on each column in turn, so after the
# loop it only remembers the classes of the last column.
encoder = LabelEncoder()
for feature in categorical_features:
    encoded_values = encoder.fit_transform(df[feature])
    df[feature] = encoded_values

In [None]:
# Two-valued columns and the integer code assigned to each of their values
binary_valued_features = ['Gender']
bin_dict = {'Female':0, 'Male':1}

# Map the values column by column; values missing from bin_dict are left unchanged
for item in binary_valued_features:
    df[item] = df[item].replace(bin_dict)

**Removing Unwanted Feature**

In [None]:
# Drop the 'Loan_ID' column. errors='ignore' keeps the cell re-runnable:
# without it a second run raises KeyError because the column is already gone.
df = df.drop(columns=['Loan_ID'], errors='ignore')

# Show a small preview instead of dumping the whole frame
df.head()

After this point, we have encoded all the values in the dataset into numerical values

In [None]:
# Column dtypes and non-null counts after encoding
df.info()

In [None]:
# Summary statistics, one row per column
summary_stats = df.describe()
summary_stats.T

**Splitting Data**

In [None]:
from sklearn.model_selection import train_test_split

# Target variable
y = df['Interest_Rate']
# Feature matrix: every column except the target, selected by name so the
# target cannot end up among the features if the column order ever changes
X = df.drop(columns=['Interest_Rate']).values

# Split into training and testing data (80% / 20%); random_state makes the
# split repeatable between runs
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=50)

Here when using train_test_split, we use a random_state initializing value to make sure that the data splitting is done in the same way even in a different run of the code.

**Feature Scaling**

After encoding categorical data, the dataset consists of features with different data ranges. These values are standardized and feature scaling is done as follows. Numerical features were scaled by removing the
mean and by scaling to unit variance (StandardScaler) as follows.

In [None]:
from sklearn.preprocessing import StandardScaler

# Feature scaling: learn the mean and variance from the training split only,
# so that no statistics of the test split leak into the preprocessing, then
# apply the same transformation to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Wrap both scaled splits in DataFrames (default integer column labels)
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

### **Feature Engineering**

Feature selection is one of the core concepts in machine learning and hugely impacts the performance of a model. The data features that we use to train our machine learning models have a huge influence on the performance we can achieve. Irrelevant or partially relevant features can negatively impact model performance.

**Drawing the Correlation Matrix**

Therefore I will be performing the Correlation Coefficient checking mechanism in order to check the relationship between the different features with the output.

Correlation coefficients lie on a spectrum from -1 to 1. Slightly or highly positively correlated features have values such as 0.5 or 0.7, and a strong or perfect positive correlation is represented by a correlation score of 0.9 or 1. Negative values indicate a negative correlation of the corresponding strength.

In [None]:
# Pairwise correlation between the columns of the training features
training_frame = pd.DataFrame(X_train)
correlation_matrix = training_frame.corr()

# Render the matrix as a heatmap on a 10 x 10 inch figure
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(data=correlation_matrix, ax=ax)

# Also show the numeric values as the cell output
correlation_matrix

After generating the correlation matrix, we can see that towards the right side of the matrix there are features that have a very high correlation. We usually remove such highly correlated features because they are somewhat linearly dependent on other features. These features contribute very little to predicting the output but increase the computational cost.

It is clear that correlated features means that they bring the same information, so it is logical to remove one of them.

In order to find the exact columns that have high correlation values, I run the code below. I check only the upper triangle of the correlation matrix because the upper and lower triangles are mirror images of each other, separated by the diagonal. Here I check for columns that have correlation values of more than 0.95, with the aim of removing them.

In [None]:
# Keep only the strict upper triangle of the correlation matrix (k=1 excludes
# the diagonal); everything else becomes NaN. The builtin `bool` is used
# because the `np.bool` alias no longer exists in recent NumPy releases.
upper_mask = np.triu(np.ones(correlation_matrix.shape), k=1).astype(bool)
upper_tri = correlation_matrix.where(upper_mask)
print(upper_tri)

# Columns whose correlation with any earlier column exceeds 0.95
# (only positive correlations are considered; NaN entries compare as False)
to_drop = [column for column in upper_tri.columns if any(upper_tri[column] > 0.95)]
print('\nTo drop')
print(to_drop)

# Remove the selected columns by label, not by position, so the right columns
# are dropped even when labels and positions no longer coincide
X_train = X_train.drop(columns=to_drop)
X_test = X_test.drop(columns=to_drop)
print(X_train.head())

However, after running the above code, we can see that no columns have a correlation of more than 0.95 and that, therefore, there are no columns to be removed.

**Applying PCA**

Principal Component Analysis, or PCA, is a dimensionality-reduction method that is often used to reduce the dimensionality of large data sets, by transforming a large set of variables into a smaller one that still contains most of the information in the large set.

In [None]:
from sklearn.decomposition import PCA

# Fit PCA on the training features, keeping as many components as are needed
# to explain 95% of the variance
pca = PCA(n_components=0.95).fit(X_train)

# Project both splits onto the fitted components
PCA_X_train = pca.transform(X_train)
PCA_X_test = pca.transform(X_test)

X_train

Here, I have not manually set the number of components of the PCA model. We want the explained variance to be between 95% and 99%. Therefore, I have set the PCA's n_components to 0.95, which keeps as many components as are needed to explain 95% of the variance.

## **Developing the MultiLayer Perceptron Model**

### **MLPRegressor**

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error

# MLP regressor with logistic activation. random_state is fixed so that the
# weight initialisation, and therefore the reported scores, are reproducible.
mlp = MLPRegressor(activation='logistic', random_state=1)
mlp.fit(PCA_X_train,y_train)

In [None]:
# Predictions of the regressor on the test and training splits
predictions = mlp.predict(PCA_X_test)
predictions1 = mlp.predict(PCA_X_train)

# Mean squared error on each split
mse_test = mean_squared_error(y_test, predictions)
mse_train = mean_squared_error(y_train, predictions1)
print("mse_test :", mse_test, "mse_train :", mse_train)

### **MLPClassifier**

In [None]:
from sklearn.neural_network import MLPClassifier

# Build an MLPClassifier with three hidden layers (50, 200 and 50 units) and
# train it on the PCA-reduced training data; `mlp` is rebound to this model
mlp = MLPClassifier(
    hidden_layer_sizes=(50, 200, 50),
    max_iter=300,
    activation='relu',
    solver='adam',
    random_state=1,
)
mlp.fit(PCA_X_train, y_train)

In [None]:
# Mean accuracy of the classifier on the test split, printed under a heading
test_accuracy = mlp.score(PCA_X_test, y_test)
print('Accuracy', test_accuracy, sep='\n')

In [None]:
# Predicted labels of the classifier for the PCA-reduced test split
predict = mlp.predict(PCA_X_test)

We can find the Mean Squared Error (MSE) and other scores as follows.

In [None]:
from sklearn.metrics import accuracy_score, mean_squared_error

# Last value of the training loss curve
final_loss = mlp.loss_curve_[-1]
print("Training error: %f" % final_loss)

# Mean accuracy on the training and test splits
train_score = mlp.score(PCA_X_train, y_train)
print("Training set score: %f" % train_score)
test_score = mlp.score(PCA_X_test, y_test)
print("Test set score: %f" % test_score)

# Accuracy computed from the stored test predictions
test_accuracy = accuracy_score(y_test, predict)
print(test_accuracy)

# Mean squared error between the true and predicted labels
test_mse = mean_squared_error(y_test, predict)
print("MSE: %f" % test_mse)