In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the raw training CSV into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/Dataset/Training Data.csv')

# Preview the first rows to sanity-check the columns and value formats.
data.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [3]:
# (number of rows, number of columns) of the loaded frame.
data.shape

(252000, 13)

In [4]:
# Per-column dtype and non-null count; object-dtype columns hold strings, not numbers.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 252000 entries, 0 to 251999
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   Id                 252000 non-null  int64 
 1   Income             252000 non-null  int64 
 2   Age                252000 non-null  int64 
 3   Experience         252000 non-null  int64 
 4   Married/Single     252000 non-null  object
 5   House_Ownership    252000 non-null  object
 6   Car_Ownership      252000 non-null  object
 7   Profession         252000 non-null  object
 8   CITY               252000 non-null  object
 9   STATE              252000 non-null  object
 10  CURRENT_JOB_YRS    252000 non-null  int64 
 11  CURRENT_HOUSE_YRS  252000 non-null  int64 
 12  Risk_Flag          252000 non-null  int64 
dtypes: int64(7), object(6)
memory usage: 25.0+ MB


### Data preprocessing

We want to convert the CSV file to `.npz` files. This will allow our TensorFlow model to consume the data as tensors.

In [5]:
# Replace every string-valued (object dtype) column with integer codes so the
# whole frame becomes numeric.
categorical_columns = [
    'Married/Single',
    'House_Ownership',
    'Car_Ownership',
    'Profession',
    'CITY',
    'STATE',
]

le = LabelEncoder()
for column_name in categorical_columns:
    # The same encoder object is refit for each column, so once the loop is
    # done `le` only remembers the mapping of the last column ('STATE').
    data[column_name] = le.fit_transform(data[column_name])

# Preview the encoded frame.
data.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,1,2,0,33,251,13,3,13,0
1,2,7574516,40,10,1,2,0,43,227,14,9,13,0
2,3,3991815,66,4,0,2,0,47,8,12,4,10,0
3,4,6256451,41,2,1,2,1,43,54,17,2,12,1
4,5,5768871,47,11,1,2,0,11,296,22,3,14,1


In [6]:
# Write the encoded frame back out as a values-only CSV: index=False drops the
# row index and header=False drops the column names, so every line is data.
data.to_csv('/content/drive/MyDrive/Dataset/initial_clean.csv', index=False, header=False)

### Load the data

In [7]:
# Parse the values-only CSV into a 2-D NumPy array (np.loadtxt yields floats by default).
raw_data = np.loadtxt('/content/drive/MyDrive/Dataset/initial_clean.csv', delimiter=',')
# Peek at the first ten rows.
raw_data[:10]

array([[1.000000e+00, 1.303834e+06, 2.300000e+01, 3.000000e+00,
        1.000000e+00, 2.000000e+00, 0.000000e+00, 3.300000e+01,
        2.510000e+02, 1.300000e+01, 3.000000e+00, 1.300000e+01,
        0.000000e+00],
       [2.000000e+00, 7.574516e+06, 4.000000e+01, 1.000000e+01,
        1.000000e+00, 2.000000e+00, 0.000000e+00, 4.300000e+01,
        2.270000e+02, 1.400000e+01, 9.000000e+00, 1.300000e+01,
        0.000000e+00],
       [3.000000e+00, 3.991815e+06, 6.600000e+01, 4.000000e+00,
        0.000000e+00, 2.000000e+00, 0.000000e+00, 4.700000e+01,
        8.000000e+00, 1.200000e+01, 4.000000e+00, 1.000000e+01,
        0.000000e+00],
       [4.000000e+00, 6.256451e+06, 4.100000e+01, 2.000000e+00,
        1.000000e+00, 2.000000e+00, 1.000000e+00, 4.300000e+01,
        5.400000e+01, 1.700000e+01, 2.000000e+00, 1.200000e+01,
        1.000000e+00],
       [5.000000e+00, 5.768871e+06, 4.700000e+01, 1.100000e+01,
        1.000000e+00, 2.000000e+00, 0.000000e+00, 1.100000e+01,
        2.96

In [8]:
# Per the data.info() listing, column 0 is the row Id and the last column is
# Risk_Flag; everything in between is used as model input.
feature_columns = slice(1, -1)
input_all = raw_data[:, feature_columns]
target_all = raw_data[:, -1]

In [9]:
# Number of labelled rows before balancing.
len(target_all)

252000

### Balance the dataset

In [10]:
# Undersample the majority class so both targets are equally represented:
# keep every row whose target is 1 and only the first `num_of_one_target` rows
# whose target is 0 (in file order); all later target-0 rows are dropped.
# If there are fewer 0-rows than 1-rows, nothing is removed.
num_of_one_target = int(np.sum(target_all))

# Positions of all target-0 rows, in ascending order. Vectorized replacement
# for a Python-level loop over every row.
zero_indices = np.flatnonzero(target_all == 0)
zero_count_target = int(zero_indices.size)

# Everything past the first `num_of_one_target` zeros is surplus.
# NOTE: this is now an integer ndarray rather than a list; np.delete accepts both.
indices_to_remove = zero_indices[num_of_one_target:]

input_all_equal_prior = np.delete(input_all, indices_to_remove, axis=0)
target_all_equal_prior = np.delete(target_all, indices_to_remove, axis=0)

### Scale the data

In [11]:
# Standardize the balanced feature matrix with sklearn's preprocessing.scale
# (with its defaults: each column centred to zero mean and scaled to unit variance).
# NOTE(review): this runs on the full balanced array *before* the
# train/validation/test split further down, so the scaling statistics also
# reflect rows that end up in the validation and test sets. Consider fitting
# the scaler on the training rows only and reusing it for the other splits.
scaled_input = preprocessing.scale(input_all_equal_prior)

### Shuffle the data

In [12]:
# Shuffle rows with a fixed seed so that the split membership (and therefore
# every downstream number) is reproducible under Restart & Run All.
SHUFFLE_SEED = 42
shuffle_rng = np.random.default_rng(SHUFFLE_SEED)

# One random permutation of the row positions, applied to inputs and targets
# alike so each feature row stays paired with its label.
shuffled_indices = shuffle_rng.permutation(scaled_input.shape[0])

shuffled_input = scaled_input[shuffled_indices]
shuffled_target = target_all_equal_prior[shuffled_indices]

### Split the dataset into train, validation and test sets

In [13]:
samples_count = shuffled_input.shape[0]

# 80 % train, 10 % validation; the test set takes whatever remains, so the
# three counts always add up to samples_count despite the int() truncation.
train_samples_count = int(0.8 * samples_count)
validation_samples_count = int(0.1 * samples_count)
test_samples_count = samples_count - train_samples_count - validation_samples_count

# Cut the (already shuffled) arrays at the two boundaries:
# [0, train_end) -> train, [train_end, validation_end) -> validation, rest -> test.
train_end = train_samples_count
validation_end = train_end + validation_samples_count
split_points = [train_end, validation_end]

train_input, validation_input, test_input = np.split(shuffled_input, split_points)
train_target, validation_target, test_target = np.split(shuffled_target, split_points)

# Balance check per split: number of positives, split size, positive ratio
# (each ratio should be close to 0.5 after balancing).
for split_target, split_count in (
    (train_target, train_samples_count),
    (validation_target, validation_samples_count),
    (test_target, test_samples_count),
):
    positives = np.sum(split_target)
    print(positives, split_count, positives / split_count)

24828.0 49593 0.5006351702861291
3094.0 6199 0.4991127601226004
3074.0 6200 0.4958064516129032


### Save the npz files

In [14]:
# Persist each split as its own .npz archive holding two arrays: `inputs`
# (features) and `target` (labels). np.savez appends the .npz extension.
npz_exports = [
    ('/content/drive/MyDrive/Dataset/train_loan_data', train_input, train_target),
    ('/content/drive/MyDrive/Dataset/val_loan_data', validation_input, validation_target),
    ('/content/drive/MyDrive/Dataset/test_loan_data', test_input, test_target),
]

for npz_path, split_inputs, split_targets in npz_exports:
    np.savez(npz_path, inputs=split_inputs, target=split_targets)