In [1]:
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

In [2]:
dataset=pd.read_csv('credit_risk_dataset.csv')
dataset

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length
0,22,59000,RENT,123.0,PERSONAL,D,35000,16.02,1,0.59,Y,3
1,21,9600,OWN,5.0,EDUCATION,B,1000,11.14,0,0.10,N,2
2,25,9600,MORTGAGE,1.0,MEDICAL,C,5500,12.87,1,0.57,N,3
3,23,65500,RENT,4.0,MEDICAL,C,35000,15.23,1,0.53,N,2
4,24,54400,RENT,8.0,MEDICAL,C,35000,14.27,1,0.55,Y,4
...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,MORTGAGE,1.0,PERSONAL,C,5800,13.16,0,0.11,N,30
32577,54,120000,MORTGAGE,4.0,PERSONAL,A,17625,7.49,0,0.15,N,19
32578,65,76000,RENT,3.0,HOMEIMPROVEMENT,B,35000,10.99,1,0.46,N,28
32579,56,150000,MORTGAGE,5.0,PERSONAL,B,15000,11.48,0,0.10,N,26


# Data Cleaning
1. Checking for special characters

In [3]:
import re

# Wrap the loaded data in a DataFrame for the scan
df = pd.DataFrame(dataset)

# Any character that is not a letter, a digit, a dot or whitespace counts as "special"
special_char_pattern = r'[^a-zA-Z0-9.\s]'

# Scan the columns one at a time and report what was found
for column in df.columns:
    print(f"Checking column: {column}")

    # Stringify first so that non-string columns can be regex-matched as well
    contains_special_chars = (
        df[column]
        .astype(str)
        .str.contains(special_char_pattern, regex=True)
    )

    if not contains_special_chars.any():
        print(f"No special characters found in {column}.")
    else:
        # Show every row whose value in this column matched the pattern
        print(f"Rows with special characters in {column}:")
        print(df[contains_special_chars])
    print()  # blank line between column reports

Checking column: person_age
No special characters found in person_age.

Checking column: person_income
No special characters found in person_income.

Checking column: person_home_ownership
No special characters found in person_home_ownership.

Checking column: person_emp_length
No special characters found in person_emp_length.

Checking column: loan_intent
No special characters found in loan_intent.

Checking column: loan_grade
No special characters found in loan_grade.

Checking column: loan_amnt
No special characters found in loan_amnt.

Checking column: loan_int_rate
No special characters found in loan_int_rate.

Checking column: loan_status
No special characters found in loan_status.

Checking column: loan_percent_income
No special characters found in loan_percent_income.

Checking column: cb_person_default_on_file
No special characters found in cb_person_default_on_file.

Checking column: cb_person_cred_hist_length
No special characters found in cb_person_cred_hist_length.



# 2. Checking and Removing Null values

In [4]:
dataset.isna().sum()

person_age                       0
person_income                    0
person_home_ownership            0
person_emp_length              895
loan_intent                      0
loan_grade                       0
loan_amnt                        0
loan_int_rate                 3116
loan_status                      0
loan_percent_income              0
cb_person_default_on_file        0
cb_person_cred_hist_length       0
dtype: int64

In [5]:
print(dataset['person_emp_length'].isnull().sum())

895


In [6]:
# Calculate the mean of 'person_emp_length', ignoring NaN values
mean_emp_length = dataset['person_emp_length'].mean()
mean_emp_length

np.float64(4.789686296787225)

In [7]:
# Replace NaN values with the mean.
# The filled column is assigned back to `dataset` instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment and leaves `dataset` untouched when pandas Copy-on-Write is active.
dataset['person_emp_length'] = dataset['person_emp_length'].fillna(mean_emp_length)
# Verify that there are no more missing values in the column
print(dataset['person_emp_length'].isnull().sum())

0


In [8]:
print(dataset['loan_int_rate'].isnull().sum())

3116


In [9]:
# Calculate the mean of 'loan_int_rate', ignoring NaN values
mean_int_rate = dataset['loan_int_rate'].mean()
# Replace NaN values with the mean.
# The filled column is assigned back to `dataset` instead of calling
# fillna(..., inplace=True) on the column selection: that form is chained
# assignment and leaves `dataset` untouched when pandas Copy-on-Write is active.
dataset['loan_int_rate'] = dataset['loan_int_rate'].fillna(mean_int_rate)
# Verify that there are no more missing values in the column
print(dataset['loan_int_rate'].isnull().sum())

0


In [10]:
dataset.isna().sum()

person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_status                   0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
dtype: int64

# 3. Checking for zero values in columns

In [11]:

# Re-wrap the current state of the data before scanning it
df = pd.DataFrame(dataset)

# Report, column by column, whether any entry equals 0
for column in df.columns:
    print(f"Checking column: {column}")

    # Element-wise comparison: True wherever the entry equals 0
    contains_zero = df[column].eq(0)

    if not contains_zero.any():
        print(f"No zero values found in {column}.")
    else:
        # Only the heading is printed; the matching rows themselves are not listed
        print(f"Rows with zero in {column}:")
    print()  # blank line between column reports

Checking column: person_age
No zero values found in person_age.

Checking column: person_income
No zero values found in person_income.

Checking column: person_home_ownership
No zero values found in person_home_ownership.

Checking column: person_emp_length
Rows with zero in person_emp_length:

Checking column: loan_intent
No zero values found in loan_intent.

Checking column: loan_grade
No zero values found in loan_grade.

Checking column: loan_amnt
No zero values found in loan_amnt.

Checking column: loan_int_rate
No zero values found in loan_int_rate.

Checking column: loan_status
Rows with zero in loan_status:

Checking column: loan_percent_income
Rows with zero in loan_percent_income:

Checking column: cb_person_default_on_file
No zero values found in cb_person_default_on_file.

Checking column: cb_person_cred_hist_length
No zero values found in cb_person_cred_hist_length.



In [12]:
# Overwrite employment lengths of exactly 0.0 with mean_emp_length (the column mean computed earlier).
# NOTE(review): 0 could be a genuine value rather than a placeholder — confirm this imputation is intended.
dataset['person_emp_length'] = dataset['person_emp_length'].replace(0.0, mean_emp_length)

In [13]:
# Step 1: recompute the loan-to-income ratio for every row
calculated_Value = dataset['loan_amnt'].div(dataset['person_income'])

# Step 2: overwrite only the rows whose stored ratio is exactly zero
zero_ratio_rows = dataset['loan_percent_income'].eq(0.0)
dataset.loc[zero_ratio_rows, 'loan_percent_income'] = calculated_Value[zero_ratio_rows]

In [14]:
# Convert person_emp_length from years to months
# NOTE(review): the column is overwritten in place, so running this cell a second time multiplies by 12 again.
dataset['person_emp_length'] = dataset['person_emp_length'] * 12

In [15]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   person_age                  32581 non-null  int64  
 1   person_income               32581 non-null  int64  
 2   person_home_ownership       32581 non-null  object 
 3   person_emp_length           32581 non-null  float64
 4   loan_intent                 32581 non-null  object 
 5   loan_grade                  32581 non-null  object 
 6   loan_amnt                   32581 non-null  int64  
 7   loan_int_rate               32581 non-null  float64
 8   loan_status                 32581 non-null  int64  
 9   loan_percent_income         32581 non-null  float64
 10  cb_person_default_on_file   32581 non-null  object 
 11  cb_person_cred_hist_length  32581 non-null  int64  
dtypes: float64(3), int64(5), object(4)
memory usage: 3.0+ MB


# 4. Dropping unwanted columns

In [16]:
# Remove loan_grade, loan_intent and cb_person_cred_hist_length; they are not used from here on.
dataset=dataset.drop(['loan_grade','loan_intent','cb_person_cred_hist_length'],axis=1)

# 5. Changing categorical values to numerical

In [17]:
# Preview only: the one-hot encoded frame is displayed but never assigned, so `dataset` itself is unchanged.
pd.get_dummies(dataset)

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,22,59000,1476.0,35000,16.02,1,0.59,False,False,False,True,False,True
1,21,9600,60.0,1000,11.14,0,0.10,False,False,True,False,True,False
2,25,9600,12.0,5500,12.87,1,0.57,True,False,False,False,True,False
3,23,65500,48.0,35000,15.23,1,0.53,False,False,False,True,True,False
4,24,54400,96.0,35000,14.27,1,0.55,False,False,False,True,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32576,57,53000,12.0,5800,13.16,0,0.11,True,False,False,False,True,False
32577,54,120000,48.0,17625,7.49,0,0.15,True,False,False,False,True,False
32578,65,76000,36.0,35000,10.99,1,0.46,False,False,False,True,True,False
32579,56,150000,60.0,15000,11.48,0,0.10,True,False,False,False,True,False


In [18]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   person_age                 32581 non-null  int64  
 1   person_income              32581 non-null  int64  
 2   person_home_ownership      32581 non-null  object 
 3   person_emp_length          32581 non-null  float64
 4   loan_amnt                  32581 non-null  int64  
 5   loan_int_rate              32581 non-null  float64
 6   loan_status                32581 non-null  int64  
 7   loan_percent_income        32581 non-null  float64
 8   cb_person_default_on_file  32581 non-null  object 
dtypes: float64(3), int64(4), object(2)
memory usage: 2.2+ MB


In [19]:
# Integer code for each home-ownership category
home_ownership_map = {'RENT': 0, 'OWN': 1, 'MORTGAGE': 2, 'OTHER': 3}
home_ownership_codes = dataset['person_home_ownership'].map(home_ownership_map)
dataset['person_home_ownership'] = home_ownership_codes

# Binary flag for a previous default on file: 'Y' -> 1, 'N' -> 0
default_flag_map = {'Y': 1, 'N': 0}
default_flags = dataset['cb_person_default_on_file'].map(default_flag_map)
dataset['cb_person_default_on_file'] = default_flags

# Show the first rows after encoding
print(dataset.head())


   person_age  person_income  person_home_ownership  person_emp_length  \
0          22          59000                      0             1476.0   
1          21           9600                      1               60.0   
2          25           9600                      2               12.0   
3          23          65500                      0               48.0   
4          24          54400                      0               96.0   

   loan_amnt  loan_int_rate  loan_status  loan_percent_income  \
0      35000          16.02            1                 0.59   
1       1000          11.14            0                 0.10   
2       5500          12.87            1                 0.57   
3      35000          15.23            1                 0.53   
4      35000          14.27            1                 0.55   

   cb_person_default_on_file  
0                          1  
1                          0  
2                          0  
3                          0  
4        

In [20]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32581 entries, 0 to 32580
Data columns (total 9 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   person_age                 32581 non-null  int64  
 1   person_income              32581 non-null  int64  
 2   person_home_ownership      32581 non-null  int64  
 3   person_emp_length          32581 non-null  float64
 4   loan_amnt                  32581 non-null  int64  
 5   loan_int_rate              32581 non-null  float64
 6   loan_status                32581 non-null  int64  
 7   loan_percent_income        32581 non-null  float64
 8   cb_person_default_on_file  32581 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 2.2 MB


In [21]:
dataset

Unnamed: 0,person_age,person_income,person_home_ownership,person_emp_length,loan_amnt,loan_int_rate,loan_status,loan_percent_income,cb_person_default_on_file
0,22,59000,0,1476.0,35000,16.02,1,0.59,1
1,21,9600,1,60.0,1000,11.14,0,0.10,0
2,25,9600,2,12.0,5500,12.87,1,0.57,0
3,23,65500,0,48.0,35000,15.23,1,0.53,0
4,24,54400,0,96.0,35000,14.27,1,0.55,1
...,...,...,...,...,...,...,...,...,...
32576,57,53000,2,12.0,5800,13.16,0,0.11,0
32577,54,120000,2,48.0,17625,7.49,0,0.15,0
32578,65,76000,0,36.0,35000,10.99,1,0.46,0
32579,56,150000,2,60.0,15000,11.48,0,0.10,0


# 6. Renaming columns

In [22]:
# Old column name -> shorter name used in the rest of the notebook
column_renames = {
    'person_age': 'age',
    'person_income': 'income',
    'person_emp_length': 'emplength_month',
    'person_home_ownership': 'home_type',
    'cb_person_default_on_file': 'default_status',
    'loan_amnt': 'loan_amount',
    'loan_int_rate': 'interest_rate',
}
# Columns that are not listed keep their current names
dataset.rename(columns=column_renames, inplace=True)

In [23]:
dataset

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_status,loan_percent_income,default_status
0,22,59000,0,1476.0,35000,16.02,1,0.59,1
1,21,9600,1,60.0,1000,11.14,0,0.10,0
2,25,9600,2,12.0,5500,12.87,1,0.57,0
3,23,65500,0,48.0,35000,15.23,1,0.53,0
4,24,54400,0,96.0,35000,14.27,1,0.55,1
...,...,...,...,...,...,...,...,...,...
32576,57,53000,2,12.0,5800,13.16,0,0.11,0
32577,54,120000,2,48.0,17625,7.49,0,0.15,0
32578,65,76000,0,36.0,35000,10.99,1,0.46,0
32579,56,150000,2,60.0,15000,11.48,0,0.10,0


# 7. Removing outliers

In [24]:
dataset.describe()

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_status,loan_percent_income,default_status
count,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0,32581.0
mean,27.7346,66074.85,0.914429,64.717878,9589.371106,11.011695,0.218164,0.170204,0.17633
std,6.348078,61983.12,0.960858,43.984932,6322.086646,3.081605,0.413006,0.10678,0.381106
min,20.0,4000.0,0.0,12.0,500.0,5.42,0.0,0.000789,0.0
25%,23.0,38500.0,0.0,36.0,5000.0,8.49,0.0,0.09,0.0
50%,26.0,55000.0,0.0,57.476236,8000.0,11.011695,0.0,0.15,0.0
75%,30.0,79200.0,2.0,84.0,12200.0,13.11,0.0,0.23,0.0
max,144.0,6000000.0,3.0,1476.0,35000.0,23.22,1.0,0.83,1.0


In [25]:
# First and third quartile of 'income' and the spread between them
income = dataset['income']
Q1, Q3 = income.quantile(0.25), income.quantile(0.75)
IQR = Q3 - Q1

# Outlier fences: 1.5 * IQR below Q1 and above Q3
whisker = 1.5 * IQR
lower_bound = Q1 - whisker
upper_bound = Q3 + whisker

print(lower_bound)
print(upper_bound)


-22550.0
140250.0


In [26]:
# Keep only the rows whose income lies within the fences (both ends inclusive)
income_within_bounds = dataset['income'].between(lower_bound, upper_bound)
df_no_outliers = dataset[income_within_bounds]
df_no_outliers

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_status,loan_percent_income,default_status
0,22,59000,0,1476.000000,35000,16.02,1,0.59,1
1,21,9600,1,60.000000,1000,11.14,0,0.10,0
2,25,9600,2,12.000000,5500,12.87,1,0.57,0
3,23,65500,0,48.000000,35000,15.23,1,0.53,0
4,24,54400,0,96.000000,35000,14.27,1,0.55,1
...,...,...,...,...,...,...,...,...,...
32575,52,64500,0,57.476236,5000,11.26,0,0.08,0
32576,57,53000,2,12.000000,5800,13.16,0,0.11,0
32577,54,120000,2,48.000000,17625,7.49,0,0.15,0
32578,65,76000,0,36.000000,35000,10.99,1,0.46,0


In [27]:
df_no_outliers.describe()

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_status,loan_percent_income,default_status
count,31097.0,31097.0,31097.0,31097.0,31097.0,31097.0,31097.0,31097.0,31097.0
mean,27.5989,58704.999582,0.888221,63.851959,9289.716854,10.997534,0.223912,0.174327,0.175933
std,6.196715,27519.057352,0.958011,42.380381,6038.671214,3.077604,0.416871,0.107048,0.38077
min,20.0,4000.0,0.0,12.0,500.0,5.42,0.0,0.01,0.0
25%,23.0,38000.0,0.0,36.0,5000.0,8.49,0.0,0.09,0.0
50%,26.0,54000.0,0.0,57.476236,8000.0,11.011695,0.0,0.15,0.0
75%,30.0,75000.0,2.0,84.0,12000.0,13.11,0.0,0.23,0.0
max,123.0,140004.0,3.0,1476.0,35000.0,23.22,1.0,0.83,1.0


In [28]:
# Keep only rows whose employment length is below 1200 months (100 years); larger values are treated as outliers.
df_no_outliers= df_no_outliers[df_no_outliers['emplength_month'] <1200]

In [29]:
df_no_outliers

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_status,loan_percent_income,default_status
1,21,9600,1,60.000000,1000,11.14,0,0.10,0
2,25,9600,2,12.000000,5500,12.87,1,0.57,0
3,23,65500,0,48.000000,35000,15.23,1,0.53,0
4,24,54400,0,96.000000,35000,14.27,1,0.55,1
5,21,9900,1,24.000000,2500,7.14,1,0.25,0
...,...,...,...,...,...,...,...,...,...
32575,52,64500,0,57.476236,5000,11.26,0,0.08,0
32576,57,53000,2,12.000000,5800,13.16,0,0.11,0
32577,54,120000,2,48.000000,17625,7.49,0,0.15,0
32578,65,76000,0,36.000000,35000,10.99,1,0.46,0


In [30]:
df_no_outliers.describe()

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_status,loan_percent_income,default_status
count,31096.0,31096.0,31096.0,31096.0,31096.0,31096.0,31096.0,31096.0,31096.0
mean,27.59908,58704.990095,0.888249,63.806546,9288.89005,10.997372,0.223887,0.174314,0.175907
std,6.196734,27519.499797,0.958013,41.617558,6037.00787,3.077522,0.416854,0.107023,0.380747
min,20.0,4000.0,0.0,12.0,500.0,5.42,0.0,0.01,0.0
25%,23.0,38000.0,0.0,36.0,5000.0,8.49,0.0,0.09,0.0
50%,26.0,54000.0,0.0,57.476236,8000.0,11.011695,0.0,0.15,0.0
75%,30.0,75000.0,2.0,84.0,12000.0,13.11,0.0,0.23,0.0
max,123.0,140004.0,3.0,492.0,35000.0,23.22,1.0,0.83,1.0


In [31]:
df_no_outliers['age']>100

1        False
2        False
3        False
4        False
5        False
         ...  
32575    False
32576    False
32577    False
32578    False
32580    False
Name: age, Length: 31096, dtype: bool

In [32]:
# Keep only rows with age strictly below 100.
# NOTE(review): an age of exactly 100 is dropped too, whereas the check in the previous cell tests for > 100.
df_no_outliers= df_no_outliers[df_no_outliers['age'] <100]

In [33]:
df_no_outliers.describe()

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_status,loan_percent_income,default_status
count,31094.0,31094.0,31094.0,31094.0,31094.0,31094.0,31094.0,31094.0,31094.0
mean,27.592944,58703.684569,0.888306,63.807177,9288.188236,10.997396,0.223902,0.174309,0.175918
std,6.149513,27519.9022,0.958018,41.618127,6036.56752,3.077618,0.416864,0.107025,0.380757
min,20.0,4000.0,0.0,12.0,500.0,5.42,0.0,0.01,0.0
25%,23.0,38000.0,0.0,36.0,5000.0,8.49,0.0,0.09,0.0
50%,26.0,54000.0,0.0,57.476236,8000.0,11.011695,0.0,0.15,0.0
75%,30.0,75000.0,2.0,84.0,12000.0,13.11,0.0,0.23,0.0
max,94.0,140004.0,3.0,492.0,35000.0,23.22,1.0,0.83,1.0


# Training and Test data creation

In [35]:
# Predictor columns: every remaining column except the target.
independent=df_no_outliers[['age','income','emplength_month','loan_amount','interest_rate','loan_percent_income','default_status','home_type']]
# Target column, selected with double brackets so it stays a one-column DataFrame.
dependent=df_no_outliers[['loan_status']]

In [36]:
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows as the test split; random_state=0 makes the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(independent,dependent,test_size=0.3,random_state=0)

In [37]:
X_train

Unnamed: 0,age,income,emplength_month,loan_amount,interest_rate,loan_percent_income,default_status,home_type
7727,22,54500,72.0,7500,10.59,0.14,0,2
30937,39,45500,48.0,10000,16.29,0.22,0,0
26689,30,110000,48.0,12000,10.74,0.11,0,2
12982,24,74004,96.0,8000,15.21,0.11,0,2
20779,27,42000,12.0,5000,10.99,0.12,0,0
...,...,...,...,...,...,...,...,...
13322,22,90000,72.0,7000,7.14,0.08,0,1
20415,27,40000,84.0,13000,7.88,0.33,0,2
10039,23,40000,84.0,9250,16.07,0.23,0,0
10993,23,70572,96.0,14500,5.99,0.21,0,2


# Standardization

In [38]:
from sklearn.preprocessing import StandardScaler
# Initialize StandardScaler
sc = StandardScaler()
# Fit the scaler on the training split only, then apply the same transform to the test split.
# NOTE(review): X_train / X_test are overwritten, so re-running this cell rescales already-scaled data.
X_train=sc.fit_transform(X_train)
X_test=sc.transform(X_test)

In [39]:
df_no_outliers.to_csv('Processed_credit_risk.csv',index=False)

In [40]:
new_ds=pd.read_csv('Processed_credit_risk.csv')

In [41]:
new_ds

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_status,loan_percent_income,default_status
0,21,9600,1,60.000000,1000,11.14,0,0.10,0
1,25,9600,2,12.000000,5500,12.87,1,0.57,0
2,23,65500,0,48.000000,35000,15.23,1,0.53,0
3,24,54400,0,96.000000,35000,14.27,1,0.55,1
4,21,9900,1,24.000000,2500,7.14,1,0.25,0
...,...,...,...,...,...,...,...,...,...
31089,52,64500,0,57.476236,5000,11.26,0,0.08,0
31090,57,53000,2,12.000000,5800,13.16,0,0.11,0
31091,54,120000,2,48.000000,17625,7.49,0,0.15,0
31092,65,76000,0,36.000000,35000,10.99,1,0.46,0


In [42]:
new_ds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31094 entries, 0 to 31093
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   age                  31094 non-null  int64  
 1   income               31094 non-null  int64  
 2   home_type            31094 non-null  int64  
 3   emplength_month      31094 non-null  float64
 4   loan_amount          31094 non-null  int64  
 5   interest_rate        31094 non-null  float64
 6   loan_status          31094 non-null  int64  
 7   loan_percent_income  31094 non-null  float64
 8   default_status       31094 non-null  int64  
dtypes: float64(3), int64(6)
memory usage: 2.1 MB


# Feature Engineering and Model creation.

def quanQual(dataset):
    """Split the column names of ``dataset`` into quantitative and qualitative.

    A column counts as qualitative when its dtype compares equal to ``'O'``
    (object); every other column counts as quantitative.

    Returns:
        tuple: ``(quan, qual)`` - two lists of column names, each in the
        original column order.
    """
    quan, qual = [], []
    for name in dataset.columns:
        # Pick the destination list from the column dtype, then record the name
        bucket = qual if dataset[name].dtypes == 'O' else quan
        bucket.append(name)
    return quan, qual

In [44]:
quanQual(new_ds)

(['age',
  'income',
  'home_type',
  'emplength_month',
  'loan_amount',
  'interest_rate',
  'loan_status',
  'loan_percent_income',
  'default_status'],
 [])

In [45]:
quan,qual=quanQual(new_ds)

In [46]:
quan

['age',
 'income',
 'home_type',
 'emplength_month',
 'loan_amount',
 'interest_rate',
 'loan_status',
 'loan_percent_income',
 'default_status']

In [47]:
qual

[]

In [48]:
import numpy as np
import pandas as pd

def univariate(dataset, quan):
    """Build a table of descriptive statistics for the given columns.

    Args:
        dataset: DataFrame holding the data.
        quan: list of column names to summarise (expected to be numeric).

    Returns:
        DataFrame with one column per name in ``quan`` and one row per
        statistic: Mean, Median, Mode, the quartiles, the 99th percentile,
        the maximum ("Q4:100%"), IQR, 1.5 * IQR ("1.5rule"), the lower and
        upper outlier fences ("Lesser" / "Greater"), MIN, MAX, Kurtosis,
        Skewness, Variance and STDDev. A name that is not a column of
        ``dataset`` is left as NaN and a warning is printed.
    """
    stat_names = ["Mean", "Median", "Mode", "Q1:25%", "Q2:50%", "Q3:75%", "99%", "Q4:100%",
                  "IQR", "1.5rule", "Lesser", "Greater", "MIN", "MAX", "Kurtosis", "Skewness",
                  'Variance', 'STDDev']
    descriptiveNew = pd.DataFrame(index=stat_names, columns=quan)

    for columnName in quan:
        if columnName not in dataset.columns:
            print(f"Warning: {columnName} not found in dataset.")
            continue

        values = dataset[columnName]
        # Quartiles are taken straight from the column instead of re-running
        # dataset.describe() on the whole frame for every single statistic.
        q1 = values.quantile(0.25)
        q2 = values.quantile(0.50)
        q3 = values.quantile(0.75)
        iqr = q3 - q1
        rule = 1.5 * iqr

        stats = {
            "Mean": values.mean(),          # sensitive to outliers
            "Median": values.median(),      # robust to outliers
            "Mode": values.mode()[0],       # first of the most frequent values
            "Q1:25%": q1,
            "Q2:50%": q2,
            "Q3:75%": q3,
            "99%": np.percentile(values, 99),
            "Q4:100%": values.max(),
            "IQR": iqr,
            "1.5rule": rule,
            "Lesser": q1 - rule,
            "Greater": q3 + rule,
            "MIN": values.min(),
            "MAX": values.max(),
            "Kurtosis": values.kurtosis(),
            "Skewness": values.skew(),
            "Variance": values.var(),
            "STDDev": values.std(),
        }
        # Write with a single .loc[row, column] access. The chained form
        # frame[column][row] = value assigns into a temporary object and is
        # silently lost when pandas Copy-on-Write is active.
        for stat_name, stat_value in stats.items():
            descriptiveNew.loc[stat_name, columnName] = stat_value
    return descriptiveNew

In [49]:
univariate(new_ds,quan)

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_status,loan_percent_income,default_status
Mean,27.592944,58703.684569,0.888306,63.807177,9288.188236,10.997396,0.223902,0.174309,0.175918
Median,26.0,54000.0,0.0,57.476236,8000.0,11.011695,0.0,0.15,0.0
Mode,23.0,60000.0,0.0,57.476236,10000.0,11.011695,0.0,0.1,0.0
Q1:25%,23.0,38000.0,0.0,36.0,5000.0,8.49,0.0,0.09,0.0
Q2:50%,26.0,54000.0,0.0,57.476236,8000.0,11.011695,0.0,0.15,0.0
Q3:75%,30.0,75000.0,2.0,84.0,12000.0,13.11,0.0,0.23,0.0
99%,49.0,133131.67,2.0,204.0,26000.0,18.39,1.0,0.5,1.0
Q4:100%,94.0,140004.0,3.0,492.0,35000.0,23.22,1.0,0.83,1.0
IQR,7.0,37000.0,2.0,48.0,7000.0,4.62,0.0,0.14,0.0
1.5rule,10.5,55500.0,3.0,72.0,10500.0,6.93,0.0,0.21,0.0


In [66]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier   
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import warnings
warnings.filterwarnings("ignore")

In [52]:
def rfeFeature(indep_X, dep_Y, n):
    """Select ``n`` features with recursive feature elimination, once per estimator.

    RFE is run with four ranking estimators, in this order: logistic
    regression, linear-kernel SVC, random forest and decision tree.

    Args:
        indep_X: feature matrix (DataFrame or 2-D array).
        dep_Y: target values.
        n: number of features each RFE run keeps.

    Returns:
        list: one 2-D array per estimator (same order as above) holding the
        selected columns of ``indep_X`` in their original, unscaled units.
    """
    # Define the models used to rank the features
    log_model = LogisticRegression(solver='lbfgs')
    RF = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    DT = DecisionTreeClassifier(criterion='gini', max_features='sqrt', splitter='best', random_state=0)
    svc_model = SVC(kernel='linear', random_state=0)

    # List of models to use in RFE
    rfemodellist = [log_model, svc_model, RF, DT]

    # RFE ranks features by the magnitude of the fitted coefficients /
    # importances. For the linear models that is only meaningful when all
    # features share a scale, and the linear SVC is very slow on unscaled
    # inputs, so the ranking is fitted on standardised features.
    # NOTE(review): the selection is still fitted on every row passed in,
    # including rows the caller may later hold out for testing.
    raw_X = np.asarray(indep_X)
    scaled_X = StandardScaler().fit_transform(raw_X)

    rfelist = []
    for model in rfemodellist:
        print(model)  # To show which model is currently being processed
        selector = RFE(estimator=model, n_features_to_select=n)
        selector.fit(scaled_X, dep_Y)
        # Return the chosen columns from the unscaled data: scaling was only
        # needed for the ranking step.
        rfelist.append(raw_X[:, selector.support_])

    return rfelist

In [53]:
def split_scalar(indep_X, dep_Y):
    """Split the data into train and test parts and standardise the features.

    A quarter of the rows goes to the test part (``random_state=0``). The
    scaler is fitted on the training features only and then applied to both
    feature sets; the targets are returned as they come out of the split.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )

    # Feature scaling: learn mean / std from the training part only
    scaler = StandardScaler().fit(features_train)

    return (
        scaler.transform(features_train),
        scaler.transform(features_test),
        target_train,
        target_test,
    )

In [54]:
def cm_prediction(classifier, X_test, y_test=None):
    """Predict on ``X_test`` and score the predictions against ``y_test``.

    Args:
        classifier: fitted estimator exposing ``predict``.
        X_test: features to predict on.
        y_test: true labels for ``X_test``. When omitted, the module-level
            variable ``y_test`` is used, which is what this function always
            did before the parameter existed.

    Returns:
        tuple: ``(classifier, Accuracy, report, X_test, y_test, cm)`` where
        ``Accuracy`` is the accuracy score, ``report`` the classification
        report and ``cm`` the confusion matrix.

    Raises:
        NameError: if ``y_test`` is neither passed nor defined at module level.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    if y_test is None:
        # Backward compatibility: existing callers pass only (classifier, X_test)
        # and rely on a global `y_test` being set beforehand.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError(
                "cm_prediction needs y_test: pass it explicitly or define a global y_test"
            ) from None

    y_pred = classifier.predict(X_test)

    # Making the Confusion Matrix
    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, Accuracy, report, X_test, y_test, cm

In [55]:
def logistic(X_train, y_train, X_test):
    """Fit a logistic-regression classifier and evaluate it on ``X_test``.

    The fitted model is handed to ``cm_prediction``, whose result tuple
    ``(classifier, Accuracy, report, X_test, y_test, cm)`` is returned as is.
    The true test labels are not an argument here; ``cm_prediction`` obtains
    them itself.
    """
    from sklearn.linear_model import LogisticRegression

    # Fitting logistic regression to the training set
    model = LogisticRegression(random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [56]:
def svm_linear(X_train, y_train, X_test):
    """Fit a linear-kernel SVC and evaluate it on ``X_test``.

    The fitted model is handed to ``cm_prediction``, whose result tuple
    ``(classifier, Accuracy, report, X_test, y_test, cm)`` is returned as is.
    The true test labels are not an argument here; ``cm_prediction`` obtains
    them itself.
    """
    from sklearn.svm import SVC

    # Fitting the linear SVM to the training set
    model = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [57]:
def svm_NL(X_train, y_train, X_test):
    """Fit an RBF-kernel (non-linear) SVC and evaluate it on ``X_test``.

    The fitted model is handed to ``cm_prediction``, whose result tuple
    ``(classifier, Accuracy, report, X_test, y_test, cm)`` is returned as is.
    The true test labels are not an argument here; ``cm_prediction`` obtains
    them itself.
    """
    from sklearn.svm import SVC

    # Fitting the RBF SVM to the training set
    model = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [58]:
def Navie(X_train, y_train, X_test):
    """Fit a Gaussian naive-Bayes classifier and evaluate it on ``X_test``.

    The fitted model is handed to ``cm_prediction``, whose result tuple
    ``(classifier, Accuracy, report, X_test, y_test, cm)`` is returned as is.
    The true test labels are not an argument here; ``cm_prediction`` obtains
    them itself.
    """
    from sklearn.naive_bayes import GaussianNB

    # Fitting Gaussian naive Bayes to the training set
    model = GaussianNB().fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [59]:
def knn(X_train, y_train, X_test):
    """Fit a 5-nearest-neighbours classifier and evaluate it on ``X_test``.

    The classifier uses the Minkowski metric with ``p = 2``. The fitted model
    is handed to ``cm_prediction``, whose result tuple
    ``(classifier, Accuracy, report, X_test, y_test, cm)`` is returned as is.
    The true test labels are not an argument here; ``cm_prediction`` obtains
    them itself.
    """
    from sklearn.neighbors import KNeighborsClassifier

    # Fitting K-NN to the training set
    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [60]:
def Decision(X_train, y_train, X_test):
    """Fit an entropy-criterion decision tree and evaluate it on ``X_test``.

    The fitted model is handed to ``cm_prediction``, whose result tuple
    ``(classifier, Accuracy, report, X_test, y_test, cm)`` is returned as is.
    The true test labels are not an argument here; ``cm_prediction`` obtains
    them itself.
    """
    from sklearn.tree import DecisionTreeClassifier

    # Fitting the decision tree to the training set
    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [61]:
def random(X_train, y_train, X_test):
    """Fit a 10-tree, entropy-criterion random forest and evaluate it on ``X_test``.

    The fitted model is handed to ``cm_prediction``, whose result tuple
    ``(classifier, Accuracy, report, X_test, y_test, cm)`` is returned as is.
    The true test labels are not an argument here; ``cm_prediction`` obtains
    them itself.

    Note: this function shares its name with the standard-library ``random``
    module, so the two cannot both be bound to ``random`` in one namespace.
    """
    from sklearn.ensemble import RandomForestClassifier

    # Fitting the random forest to the training set
    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

In [62]:
def rfe_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    """Arrange per-classifier accuracy lists into a comparison table.

    Each argument is a sequence with one accuracy per RFE ranking estimator,
    in the row order Logistic, SVC, Random, DecisionTree.

    Args:
        acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf: accuracies of
            the logistic, linear-SVM, non-linear-SVM, KNN, naive-Bayes,
            decision-tree and random-forest classifiers respectively.

    Returns:
        DataFrame whose rows are the RFE ranking estimators and whose columns
        are the classifiers.
    """
    scores_by_classifier = {
        'Logistic': acclog,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'KNN': accknn,
        'Navie': accnav,
        'Decision': accdes,
        'Random': accrf,
    }
    rfedataframe = pd.DataFrame(index=['Logistic', 'SVC', 'Random', 'DecisionTree'],
                                columns=list(scores_by_classifier))

    for number, idex in enumerate(rfedataframe.index):
        for classifier_name, scores in scores_by_classifier.items():
            # Single .loc[row, column] write. The chained form
            # frame[column][row] = value assigns into a temporary object and is
            # silently lost when pandas Copy-on-Write is active.
            rfedataframe.loc[idex, classifier_name] = scores[number]
    return rfedataframe

In [63]:
indep_X=new_ds.drop('loan_status', axis=1)
indep_X

Unnamed: 0,age,income,home_type,emplength_month,loan_amount,interest_rate,loan_percent_income,default_status
0,21,9600,1,60.000000,1000,11.14,0.10,0
1,25,9600,2,12.000000,5500,12.87,0.57,0
2,23,65500,0,48.000000,35000,15.23,0.53,0
3,24,54400,0,96.000000,35000,14.27,0.55,1
4,21,9900,1,24.000000,2500,7.14,0.25,0
...,...,...,...,...,...,...,...,...
31089,52,64500,0,57.476236,5000,11.26,0.08,0
31090,57,53000,2,12.000000,5800,13.16,0.11,0
31091,54,120000,2,48.000000,17625,7.49,0.15,0
31092,65,76000,0,36.000000,35000,10.99,0.46,0


In [64]:
dep_Y=new_ds['loan_status']
dep_Y

0        0
1        1
2        1
3        1
4        1
        ..
31089    0
31090    0
31091    0
31092    1
31093    0
Name: loan_status, Length: 31094, dtype: int64

In [None]:
# One selected-feature matrix per RFE ranking estimator (5 features each)
rfelist = rfeFeature(indep_X, dep_Y, 5)

# One accuracy list per classifier; each gains one entry per feature matrix
acclog, accsvml, accsvmnl = [], [], []
accknn, accnav, accdes, accrf = [], [], [], []

# Classifier wrapper paired with the list that collects its accuracies,
# in the order in which the classifiers are evaluated
evaluators = (
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
)

for i in rfelist:
    # y_test has to stay a module-level name: cm_prediction reads it from there
    X_train, X_test, y_train, y_test = split_scalar(i, dep_Y)

    for evaluate, accuracies in evaluators:
        classifier, Accuracy, report, X_test, y_test, cm = evaluate(X_train, y_train, X_test)
        accuracies.append(Accuracy)

result = rfe_classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

print ('5 RFE classification result is:\n', result)

LogisticRegression()
SVC(kernel='linear', random_state=0)
