In [31]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import seaborn as sns

In [32]:
df=pd.read_csv('data.csv')

In [33]:
df.sample(10)

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
341,LP002116,Female,No,0,Graduate,No,2378,0.0,46.0,360.0,1.0,Rural,N
157,LP001543,Male,Yes,1,Graduate,No,9538,0.0,187.0,360.0,1.0,Urban,Y
396,LP002277,Female,No,0,Graduate,No,3180,0.0,71.0,360.0,0.0,Urban,N
600,LP002949,Female,No,3+,Graduate,,416,41667.0,350.0,180.0,,Urban,N
578,LP002877,Male,Yes,1,Graduate,No,1782,2232.0,107.0,360.0,1.0,Rural,Y
164,LP001572,Male,Yes,0,Graduate,No,9323,0.0,75.0,180.0,1.0,Urban,Y
322,LP002054,Male,Yes,2,Not Graduate,No,3601,1590.0,,360.0,1.0,Rural,Y
77,LP001259,Male,Yes,1,Graduate,Yes,1000,3022.0,110.0,360.0,1.0,Urban,N
410,LP002318,Female,No,1,Not Graduate,Yes,3867,0.0,62.0,360.0,1.0,Semiurban,N
449,LP002444,Male,No,1,Not Graduate,Yes,2769,1542.0,190.0,360.0,,Semiurban,N


In [34]:
df.shape

(614, 13)

In [35]:
df.isnull().sum()

Loan_ID               0
Gender               13
Married               3
Dependents           15
Education             0
Self_Employed        32
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           22
Loan_Amount_Term     14
Credit_History       50
Property_Area         0
Loan_Status           0
dtype: int64

In [36]:
df.drop('Loan_ID', axis=1, inplace=True)

In [37]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,Graduate,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,Graduate,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,Graduate,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,Not Graduate,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,Graduate,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,Graduate,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,Graduate,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,Graduate,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,Graduate,No,7583,0.0,187.0,360.0,1.0,Urban,Y


#### Firstly handling the categorical data with 0 missing values

In [38]:
df['Education'].value_counts()

Education
Graduate        480
Not Graduate    134
Name: count, dtype: int64

In [39]:
df['Education']=df['Education'].map({'Graduate':1, 'Not Graduate':0})

In [40]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,No,0,1,No,5849,0.0,,360.0,1.0,Urban,Y
1,Male,Yes,1,1,No,4583,1508.0,128.0,360.0,1.0,Rural,N
2,Male,Yes,0,1,Yes,3000,0.0,66.0,360.0,1.0,Urban,Y
3,Male,Yes,0,0,No,2583,2358.0,120.0,360.0,1.0,Urban,Y
4,Male,No,0,1,No,6000,0.0,141.0,360.0,1.0,Urban,Y
...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,1,No,2900,0.0,71.0,360.0,1.0,Rural,Y
610,Male,Yes,3+,1,No,4106,0.0,40.0,180.0,1.0,Rural,Y
611,Male,Yes,1,1,No,8072,240.0,253.0,360.0,1.0,Urban,Y
612,Male,Yes,2,1,No,7583,0.0,187.0,360.0,1.0,Urban,Y


In [41]:
df['Property_Area'].value_counts()

Property_Area
Semiurban    233
Urban        202
Rural        179
Name: count, dtype: int64

In [42]:
from sklearn.preprocessing import OneHotEncoder

In [43]:
ohe=OneHotEncoder(sparse_output=False, drop='first')

In [44]:
encoded=ohe.fit_transform(df[['Property_Area']])

In [45]:
names=ohe.get_feature_names_out(['Property_Area'])

In [46]:
encoded_df=pd.DataFrame(encoded, columns=names)

In [47]:
new_df = pd.concat([df.drop('Property_Area', axis=1), encoded_df], axis=1)

In [48]:
new_df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Semiurban,Property_Area_Urban
0,Male,No,0,1,No,5849,0.0,,360.0,1.0,Y,0.0,1.0
1,Male,Yes,1,1,No,4583,1508.0,128.0,360.0,1.0,N,0.0,0.0
2,Male,Yes,0,1,Yes,3000,0.0,66.0,360.0,1.0,Y,0.0,1.0
3,Male,Yes,0,0,No,2583,2358.0,120.0,360.0,1.0,Y,0.0,1.0
4,Male,No,0,1,No,6000,0.0,141.0,360.0,1.0,Y,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,Female,No,0,1,No,2900,0.0,71.0,360.0,1.0,Y,0.0,0.0
610,Male,Yes,3+,1,No,4106,0.0,40.0,180.0,1.0,Y,0.0,0.0
611,Male,Yes,1,1,No,8072,240.0,253.0,360.0,1.0,Y,0.0,1.0
612,Male,Yes,2,1,No,7583,0.0,187.0,360.0,1.0,Y,0.0,1.0


In [49]:
new_df.isnull().sum()

Gender                     13
Married                     3
Dependents                 15
Education                   0
Self_Employed              32
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Loan_Status                 0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [50]:
new_df['Gender'].value_counts()

Gender
Male      489
Female    112
Name: count, dtype: int64

#### now handling categorical columns with missing values

In [51]:
df=new_df

In [52]:
df['Gender'].fillna('Male', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Gender'].fillna('Male', inplace=True)


In [53]:
df['Gender'].isnull().sum()

np.int64(0)

In [54]:
df['Gender']=df['Gender'].map({'Male':1, 'Female':0})

In [55]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Semiurban,Property_Area_Urban
0,1,No,0,1,No,5849,0.0,,360.0,1.0,Y,0.0,1.0
1,1,Yes,1,1,No,4583,1508.0,128.0,360.0,1.0,N,0.0,0.0
2,1,Yes,0,1,Yes,3000,0.0,66.0,360.0,1.0,Y,0.0,1.0
3,1,Yes,0,0,No,2583,2358.0,120.0,360.0,1.0,Y,0.0,1.0
4,1,No,0,1,No,6000,0.0,141.0,360.0,1.0,Y,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,No,0,1,No,2900,0.0,71.0,360.0,1.0,Y,0.0,0.0
610,1,Yes,3+,1,No,4106,0.0,40.0,180.0,1.0,Y,0.0,0.0
611,1,Yes,1,1,No,8072,240.0,253.0,360.0,1.0,Y,0.0,1.0
612,1,Yes,2,1,No,7583,0.0,187.0,360.0,1.0,Y,0.0,1.0


In [56]:
df.isnull().sum()

Gender                      0
Married                     3
Dependents                 15
Education                   0
Self_Employed              32
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Loan_Status                 0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [57]:
df['Married'].value_counts()

Married
Yes    398
No     213
Name: count, dtype: int64

In [58]:
df['Married'].isnull().sum()

np.int64(3)

In [59]:
df['Married'].fillna('Yes', inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Married'].fillna('Yes', inplace=True)


In [60]:
df['Married'].isnull().sum()

np.int64(0)

In [61]:
df['Married'].value_counts()

Married
Yes    401
No     213
Name: count, dtype: int64

In [62]:
df['Married']=df['Married'].map({'Yes':1, 'No':0})

In [63]:
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Semiurban,Property_Area_Urban
0,1,0,0,1,No,5849,0.0,,360.0,1.0,Y,0.0,1.0
1,1,1,1,1,No,4583,1508.0,128.0,360.0,1.0,N,0.0,0.0
2,1,1,0,1,Yes,3000,0.0,66.0,360.0,1.0,Y,0.0,1.0
3,1,1,0,0,No,2583,2358.0,120.0,360.0,1.0,Y,0.0,1.0
4,1,0,0,1,No,6000,0.0,141.0,360.0,1.0,Y,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,No,2900,0.0,71.0,360.0,1.0,Y,0.0,0.0
610,1,1,3+,1,No,4106,0.0,40.0,180.0,1.0,Y,0.0,0.0
611,1,1,1,1,No,8072,240.0,253.0,360.0,1.0,Y,0.0,1.0
612,1,1,2,1,No,7583,0.0,187.0,360.0,1.0,Y,0.0,1.0


In [64]:
df.isnull().sum()

Gender                      0
Married                     0
Dependents                 15
Education                   0
Self_Employed              32
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Loan_Status                 0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [65]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender                   614 non-null    int64  
 1   Married                  614 non-null    int64  
 2   Dependents               599 non-null    object 
 3   Education                614 non-null    int64  
 4   Self_Employed            582 non-null    object 
 5   ApplicantIncome          614 non-null    int64  
 6   CoapplicantIncome        614 non-null    float64
 7   LoanAmount               592 non-null    float64
 8   Loan_Amount_Term         600 non-null    float64
 9   Credit_History           564 non-null    float64
 10  Loan_Status              614 non-null    object 
 11  Property_Area_Semiurban  614 non-null    float64
 12  Property_Area_Urban      614 non-null    float64
dtypes: float64(6), int64(4), object(3)
memory usage: 62.5+ KB


In [66]:
df['Dependents'].value_counts()

Dependents
0     345
1     102
2     101
3+     51
Name: count, dtype: int64

In [67]:
df['Dependents'].isnull().sum()

np.int64(15)

In [68]:
df['Dependents'].fillna('0', inplace=True)
df['Dependents']=df['Dependents'].map({'0':0, '1':1, '2':2, '3+':3})
df.isnull().sum()

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Dependents'].fillna('0', inplace=True)


Gender                      0
Married                     0
Dependents                  0
Education                   0
Self_Employed              32
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Loan_Status                 0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [71]:
df['Self_Employed'].value_counts()

Self_Employed
No     500
Yes     82
Name: count, dtype: int64

In [72]:
df['Self_Employed'].isnull().sum()

np.int64(32)

In [73]:
df['Self_Employed'].fillna('No', inplace=True)
df['Self_Employed']=df['Self_Employed'].map({'Yes':1, 'No':0})
df.isnull().sum()

Gender                      0
Married                     0
Dependents                  0
Education                   0
Self_Employed               0
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                 22
Loan_Amount_Term           14
Credit_History             50
Loan_Status                 0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [78]:
from sklearn.impute import SimpleImputer
imputer=SimpleImputer(strategy='mean')
df['LoanAmount']=imputer.fit_transform(df[['LoanAmount']])
df.isnull().sum()

Gender                      0
Married                     0
Dependents                  0
Education                   0
Self_Employed               0
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                  0
Loan_Amount_Term           14
Credit_History             50
Loan_Status                 0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [82]:
df['Loan_Amount_Term'].value_counts()

Loan_Amount_Term
360.0    512
180.0     44
480.0     15
300.0     13
84.0       4
240.0      4
120.0      3
60.0       2
36.0       2
12.0       1
Name: count, dtype: int64

In [83]:
imputer2=SimpleImputer(strategy='most_frequent')
df['Loan_Amount_Term']=imputer2.fit_transform(df[['Loan_Amount_Term']])
df.isnull().sum()

Gender                      0
Married                     0
Dependents                  0
Education                   0
Self_Employed               0
ApplicantIncome             0
CoapplicantIncome           0
LoanAmount                  0
Loan_Amount_Term            0
Credit_History             50
Loan_Status                 0
Property_Area_Semiurban     0
Property_Area_Urban         0
dtype: int64

In [85]:
df['Credit_History'].value_counts()

Credit_History
1.0    475
0.0     89
Name: count, dtype: int64

In [86]:
df['Credit_History']=imputer2.fit_transform(df[['Credit_History']])
df.isnull().sum()

Gender                     0
Married                    0
Dependents                 0
Education                  0
Self_Employed              0
ApplicantIncome            0
CoapplicantIncome          0
LoanAmount                 0
Loan_Amount_Term           0
Credit_History             0
Loan_Status                0
Property_Area_Semiurban    0
Property_Area_Urban        0
dtype: int64

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender                   614 non-null    int64  
 1   Married                  614 non-null    int64  
 2   Dependents               614 non-null    int64  
 3   Education                614 non-null    int64  
 4   Self_Employed            614 non-null    int64  
 5   ApplicantIncome          614 non-null    int64  
 6   CoapplicantIncome        614 non-null    float64
 7   LoanAmount               614 non-null    float64
 8   Loan_Amount_Term         614 non-null    float64
 9   Credit_History           614 non-null    float64
 10  Loan_Status              614 non-null    object 
 11  Property_Area_Semiurban  614 non-null    float64
 12  Property_Area_Urban      614 non-null    float64
dtypes: float64(6), int64(6), object(1)
memory usage: 62.5+ KB


In [88]:
from sklearn.preprocessing import LabelEncoder
le=LabelEncoder()
df['Loan_Status']=le.fit_transform(df['Loan_Status'])
df

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Loan_Status,Property_Area_Semiurban,Property_Area_Urban
0,1,0,0,1,0,5849,0.0,146.412162,360.0,1.0,1,0.0,1.0
1,1,1,1,1,0,4583,1508.0,128.000000,360.0,1.0,0,0.0,0.0
2,1,1,0,1,1,3000,0.0,66.000000,360.0,1.0,1,0.0,1.0
3,1,1,0,0,0,2583,2358.0,120.000000,360.0,1.0,1,0.0,1.0
4,1,0,0,1,0,6000,0.0,141.000000,360.0,1.0,1,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
609,0,0,0,1,0,2900,0.0,71.000000,360.0,1.0,1,0.0,0.0
610,1,1,3,1,0,4106,0.0,40.000000,180.0,1.0,1,0.0,0.0
611,1,1,1,1,0,8072,240.0,253.000000,360.0,1.0,1,0.0,1.0
612,1,1,2,1,0,7583,0.0,187.000000,360.0,1.0,1,0.0,1.0


In [89]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 614 entries, 0 to 613
Data columns (total 13 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Gender                   614 non-null    int64  
 1   Married                  614 non-null    int64  
 2   Dependents               614 non-null    int64  
 3   Education                614 non-null    int64  
 4   Self_Employed            614 non-null    int64  
 5   ApplicantIncome          614 non-null    int64  
 6   CoapplicantIncome        614 non-null    float64
 7   LoanAmount               614 non-null    float64
 8   Loan_Amount_Term         614 non-null    float64
 9   Credit_History           614 non-null    float64
 10  Loan_Status              614 non-null    int64  
 11  Property_Area_Semiurban  614 non-null    float64
 12  Property_Area_Urban      614 non-null    float64
dtypes: float64(6), int64(7)
memory usage: 62.5 KB


In [90]:
x=df.drop('Loan_Status', axis=1)
y=df['Loan_Status']

In [92]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
from sklearn.linear_model import LogisticRegression
model=LogisticRegression()
model.fit(x_train, y_train)
y_pred=model.predict(x_test)
accuracy=accuracy_score(y_test, y_pred)
print(f'Accuracy of Logistic Regression model: {accuracy*100:.2f}')

Accuracy of Logistic Regression model: 78.86


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [93]:
from sklearn import svm
model_svm = svm.SVC(kernel='linear')
model_svm.fit(x_train, y_train)
y_pred_svm = model_svm.predict(x_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print(f'Accuracy of SVM model: {accuracy_svm*100:.2f}')

Accuracy of SVM model: 73.98
