<a href="https://colab.research.google.com/github/MohammadErfanRashidi/Loan-Prediction/blob/main/LoanPrediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [174]:
# Importing the libraries
import pandas as pd
import numpy as np
import nltk
import re
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [175]:
# Uploading the dataset
dataset = pd.read_csv("/content/loan_dataset.csv") # The path to your file

In [176]:
# Checking the first five rows of the dataset
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,LP001003,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,LP001005,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,LP001006,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,LP001008,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y


In [177]:
# Checking the shape
dataset.shape

(614, 13)

In [178]:
# Describing the dataset
dataset.describe()

Unnamed: 0,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History
count,614.0,614.0,592.0,600.0,564.0
mean,5403.459283,1621.245798,146.412162,342.0,0.842199
std,6109.041673,2926.248369,85.587325,65.12041,0.364878
min,150.0,0.0,9.0,12.0,0.0
25%,2877.5,0.0,100.0,360.0,1.0
50%,3812.5,1188.5,128.0,360.0,1.0
75%,5795.0,2297.25,168.0,360.0,1.0
max,81000.0,41667.0,700.0,480.0,1.0


In [179]:
# Checking for missing values
dataset.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,13
Married,3
Dependents,15
Education,0
Self_Employed,32
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,22
Loan_Amount_Term,14


In [180]:
# Dropping missing values for Gender column
dataset = dataset.dropna(subset=["Gender"])

# Dropping missing values for Married column
dataset = dataset.dropna(subset=["Married"])

# Dropping missing values for Dependents column
dataset = dataset.dropna(subset=["Dependents"])

# Dropping missing values for Self_Employed column
dataset = dataset.dropna(subset=["Self_Employed"])

In [181]:
# Filling LoandAmount missing values with the mean
dataset["LoanAmount"] = dataset["LoanAmount"].fillna(dataset["LoanAmount"].mean())

# Filling Loan_Amount_Term missing values with the mean
dataset["Loan_Amount_Term"] = dataset["Loan_Amount_Term"].fillna(dataset["Loan_Amount_Term"].mean())

# Filling Credit_History missing values with the mean
dataset["Credit_History"] = dataset["Credit_History"].fillna(dataset["Credit_History"].mean())

In [182]:
# Checking the missing values again
dataset.isnull().sum()

Unnamed: 0,0
Loan_ID,0
Gender,0
Married,0
Dependents,0
Education,0
Self_Employed,0
ApplicantIncome,0
CoapplicantIncome,0
LoanAmount,0
Loan_Amount_Term,0


In [183]:
# Counting the Loan_Status column
dataset["Loan_Status"].value_counts()

Unnamed: 0_level_0,count
Loan_Status,Unnamed: 1_level_1
Y,382
N,172


Y = Successful

N = Unsuccessful

# Numerating the textual columns

In [184]:
label_encoder = LabelEncoder()

In [185]:
categorical_cols = ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Property_Area', 'Loan_Status']
for col in categorical_cols:
    dataset[col] = label_encoder.fit_transform(dataset[col])

In [186]:
dataset.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0,0,0,5849,0.0,144.917757,360.0,1.0,2,1
1,LP001003,1,1,1,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [187]:
# Separating data and the label
X = dataset.drop(columns=["Loan_ID", "Loan_Status"], axis=1)
Y = dataset["Loan_Status"]

In [188]:
print(X)

     Gender  Married  Dependents  Education  Self_Employed  ApplicantIncome  \
0         1        0           0          0              0             5849   
1         1        1           1          0              0             4583   
2         1        1           0          0              1             3000   
3         1        1           0          1              0             2583   
4         1        0           0          0              0             6000   
..      ...      ...         ...        ...            ...              ...   
609       0        0           0          0              0             2900   
610       1        1           3          0              0             4106   
611       1        1           1          0              0             8072   
612       1        1           2          0              0             7583   
613       0        0           0          0              1             4583   

     CoapplicantIncome  LoanAmount  Loan_Amount_Ter

In [189]:
print(Y)

0      1
1      0
2      1
3      1
4      1
      ..
609    1
610    1
611    1
612    1
613    0
Name: Loan_Status, Length: 554, dtype: int64


In [190]:
# Train and test
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=2)

In [191]:
# Printing the shape of train and test data
print(X.shape, X_train.shape, X_test.shape)

(554, 11) (443, 11) (111, 11)


In [192]:
# Training the model
model = LogisticRegression()

In [193]:
# Fitting the model
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [194]:
# Evaluating the model (train)
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [195]:
# Printing the accuracy score for training data
print("Train data accuracy:", training_data_accuracy)

Train data accuracy: 0.8036117381489842


In [196]:
# Evaluating the model (test)
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [197]:
# Printing the accuracy score for test data
print("Test data accuracy:", test_data_accuracy)

Test data accuracy: 0.7927927927927928
