# Data preprocessing

In [42]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import os

In [43]:
file = r"C:\Users\glabm\Desktop\Credit approval project\Credit-approval-project\data\01_raw\loan_approval_dataset.json"
df = pd.read_json(file)
print(df.head())

   Id   Income  Age  Experience Married/Single House_Ownership Car_Ownership  \
0   1  1303834   23           3         single          rented            no   
1   2  7574516   40          10         single          rented            no   
2   3  3991815   66           4        married          rented            no   
3   4  6256451   41           2         single          rented           yes   
4   5  5768871   47          11         single          rented            no   

            Profession                 CITY           STATE  CURRENT_JOB_YRS  \
0  Mechanical_engineer                 Rewa  Madhya_Pradesh                3   
1   Software_Developer             Parbhani     Maharashtra                9   
2     Technical_writer            Alappuzha          Kerala                4   
3   Software_Developer          Bhubaneswar          Odisha                2   
4        Civil_servant  Tiruchirappalli[10]      Tamil_Nadu                3   

   CURRENT_HOUSE_YRS  Risk_Flag  
0   

In [44]:
print(df.info())
print(df.isna().sum())
print(df.duplicated().sum())

<class 'pandas.core.frame.DataFrame'>
Index: 252000 entries, 0 to 251999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Id                 252000 non-null  int64 
 1   Income             252000 non-null  int64 
 2   Age                252000 non-null  int64 
 3   Experience         252000 non-null  int64 
 4   Married/Single     252000 non-null  object
 5   House_Ownership    252000 non-null  object
 6   Car_Ownership      252000 non-null  object
 7   Profession         252000 non-null  object
 8   CITY               252000 non-null  object
 9   STATE              252000 non-null  object
 10  CURRENT_JOB_YRS    252000 non-null  int64 
 11  CURRENT_HOUSE_YRS  252000 non-null  int64 
 12  Risk_Flag          252000 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 26.9+ MB
None
Id                   0
Income               0
Age                  0
Experience           0
Married/Single       0
Hou

### Encoding categorical values

In [49]:
category_columns = ['Married/Single', 'House_Ownership', 'Car_Ownership', 'Profession', 'CITY', 'STATE']

for col in category_columns:
    df[col] = df[col].astype('category')

print(df.info())  

<class 'pandas.core.frame.DataFrame'>
Index: 252000 entries, 0 to 251999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype   
---  ------             --------------   -----   
 0   Id                 252000 non-null  int64   
 1   Income             252000 non-null  int64   
 2   Age                252000 non-null  int64   
 3   Experience         252000 non-null  int64   
 4   Married/Single     252000 non-null  category
 5   House_Ownership    231898 non-null  category
 6   Car_Ownership      252000 non-null  category
 7   Profession         252000 non-null  category
 8   CITY               252000 non-null  category
 9   STATE              252000 non-null  category
 10  CURRENT_JOB_YRS    252000 non-null  int64   
 11  CURRENT_HOUSE_YRS  252000 non-null  int64   
 12  Risk_Flag          252000 non-null  int64   
dtypes: category(6), int64(7)
memory usage: 17.1 MB
None


### Encoding binary variables

In [46]:
df['Car_Ownership'] = df['Car_Ownership'].map({'yes': 1, 'no': 0})
df['House_Ownership'] = df['House_Ownership'].map({'own': 2, 'noown_norent': 0, 'rented': 1})
df['Married/Single'] = df['Married/Single'].map({'single': 0, 'married': 1})
df.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,0,1.0,0,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,0,1.0,0,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,1,1.0,0,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,0,1.0,1,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,0,1.0,0,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


### Normalizing data and saving it into file

In [47]:
scaler = MinMaxScaler()
normalized_data = scaler.fit_transform(df[['Income', 'Age', 'Experience', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']])

folder_path = r'C:\Users\glabm\Desktop\Credit approval project\Credit-approval-project\data\03_processed'
full_path = os.path.join(folder_path, 'normalized_data.npy')

np.save(full_path, normalized_data)

### Saving numerical and categorical data to separate files

In [48]:
# Define categorical and numerical columns
categorical_columns = ['Married/Single', 'House_Ownership','Car_Ownership','Profession', 'CITY', 'STATE', 'Risk_Flag']

# Save categorical data to a file
categorical_data = df[categorical_columns]
categorical_data.to_csv("categorical_data.csv", index=False)

# Save numerical data to a file
numerical_data = df[numerical_columns]
numerical_data.to_csv("numerical_data.csv", index=False)