# Import Libraries

In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import warnings

# Import Dataset

In [2]:
# Read the raw credit-card application table from a CSV in the working directory.
csv_path = "credit-card-data.csv"
dataset = pd.read_csv(csv_path)
print('Load the datasets...')

Load the datasets...


In [3]:
# Display the freshly loaded DataFrame as the cell output.
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,b,30.83,0.000,u,g,w,v,1.25,t,t,1,0,+
1,a,58.67,4.460,u,g,q,h,3.04,t,t,6,560,+
2,a,24.5,0.500,u,g,q,h,1.50,t,f,0,824,+
3,b,27.83,1.540,u,g,w,v,3.75,t,t,5,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,b,21.08,10.085,y,p,e,h,1.25,f,f,0,0,-
686,a,22.67,0.750,u,g,c,v,2.00,f,t,2,394,-
687,a,25.25,13.500,y,p,ff,ff,2.00,f,t,1,1,-
688,b,17.92,0.205,u,g,aa,v,0.04,f,f,0,750,-


## Gender: Map values —> 0 = Female, 1 = Male
## b = Male
## a = Female

In [4]:
# Encode 'Gender' as a nullable integer: 'a' -> 0 (Female), 'b' -> 1 (Male).
# Any code that is not a key of gender_map becomes <NA> instead of a class.
# The explicit map already produces the final 0/1 codes, so no LabelEncoder
# pass is applied afterwards (same approach as the BankCustomer column).
gender_map = {'a': 0, 'b': 1}
dataset['Gender'] = dataset['Gender'].map(gender_map).astype('Int64')
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,30.83,0.000,u,g,w,v,1.25,t,t,1,0,+
1,0,58.67,4.460,u,g,q,h,3.04,t,t,6,560,+
2,0,24.5,0.500,u,g,q,h,1.50,t,f,0,824,+
3,1,27.83,1.540,u,g,w,v,3.75,t,t,5,3,+
4,1,20.17,5.625,u,g,w,v,1.71,t,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21.08,10.085,y,p,e,h,1.25,f,f,0,0,-
686,0,22.67,0.750,u,g,c,v,2.00,f,t,2,394,-
687,0,25.25,13.500,y,p,ff,ff,2.00,f,t,1,1,-
688,1,17.92,0.205,u,g,aa,v,0.04,f,f,0,750,-


## Age: Convert to a numeric format. Handle missing or invalid entries appropriately.

In [5]:
# Parse 'Age' as numbers; entries that cannot be parsed become missing.
age_numeric = pd.to_numeric(dataset['Age'], errors='coerce')
# Round to whole numbers and store as nullable Int64 so gaps stay <NA>.
dataset['Age'] = age_numeric.round(0).astype('Int64')
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,0.000,u,g,w,v,1.25,t,t,1,0,+
1,0,59,4.460,u,g,q,h,3.04,t,t,6,560,+
2,0,24,0.500,u,g,q,h,1.50,t,f,0,824,+
3,1,28,1.540,u,g,w,v,3.75,t,t,5,3,+
4,1,20,5.625,u,g,w,v,1.71,t,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,10.085,y,p,e,h,1.25,f,f,0,0,-
686,0,23,0.750,u,g,c,v,2.00,f,t,2,394,-
687,0,25,13.500,y,p,ff,ff,2.00,f,t,1,1,-
688,1,18,0.205,u,g,aa,v,0.04,f,f,0,750,-


## Debt: Apply feature scaling.

In [6]:
# Standardise 'Debt' with a StandardScaler fitted on this same column.
scaler_std = StandardScaler()
debt_frame = dataset[['Debt']]  # double brackets keep a 2-D, single-column frame
scaler_std.fit(debt_frame)
dataset['Debt'] = scaler_std.transform(debt_frame)
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,u,g,w,v,1.25,t,t,1,0,+
1,0,59,-0.060051,u,g,q,h,3.04,t,t,6,560,+
2,0,24,-0.856102,u,g,q,h,1.50,t,f,0,824,+
3,1,28,-0.647038,u,g,w,v,3.75,t,t,5,3,+
4,1,20,0.174141,u,g,w,v,1.71,t,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,y,p,e,h,1.25,f,f,0,0,-
686,0,23,-0.805846,u,g,c,v,2.00,f,t,2,394,-
687,0,25,1.757198,y,p,ff,ff,2.00,f,t,1,1,-
688,1,18,-0.915403,u,g,aa,v,0.04,f,f,0,750,-


## Married: Apply variable encoding.
## u = Married
## y = Single
## l = Divorced

In [7]:
# Married: expand the single-letter codes to readable labels, then turn the
# labels into integer codes with a LabelEncoder.
married_map = dict(u='Married', y='Single', l='Divorced')
dataset['Married'] = dataset['Married'].map(married_map)
le = LabelEncoder()
le.fit(dataset['Married'])
dataset['Married'] = le.transform(dataset['Married'])
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,g,w,v,1.25,t,t,1,0,+
1,0,59,-0.060051,1,g,q,h,3.04,t,t,6,560,+
2,0,24,-0.856102,1,g,q,h,1.50,t,f,0,824,+
3,1,28,-0.647038,1,g,w,v,3.75,t,t,5,3,+
4,1,20,0.174141,1,g,w,v,1.71,t,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,p,e,h,1.25,f,f,0,0,-
686,0,23,-0.805846,1,g,c,v,2.00,f,t,2,394,-
687,0,25,1.757198,2,p,ff,ff,2.00,f,t,1,1,-
688,1,18,-0.915403,1,g,aa,v,0.04,f,f,0,750,-


## Bank Customer: Map values —> 0 = No bank account, 1 = Has bank account

In [8]:
# BankCustomer: collapse the letter codes to a 0/1 flag held as nullable Int64.
# Codes that are not keys of the map come out as <NA>.
# NOTE(review): which letter means "has a bank account" is not shown here —
# confirm the g/p/s assignments against the data dictionary.
bank_customer_map = dict(g=0, p=1, s=1)
bank_flags = dataset['BankCustomer'].map(bank_customer_map)
dataset['BankCustomer'] = bank_flags.astype('Int64')
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,0,w,v,1.25,t,t,1,0,+
1,0,59,-0.060051,1,0,q,h,3.04,t,t,6,560,+
2,0,24,-0.856102,1,0,q,h,1.50,t,f,0,824,+
3,1,28,-0.647038,1,0,w,v,3.75,t,t,5,3,+
4,1,20,0.174141,1,0,w,v,1.71,t,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,1,e,h,1.25,f,f,0,0,-
686,0,23,-0.805846,1,0,c,v,2.00,f,t,2,394,-
687,0,25,1.757198,2,1,ff,ff,2.00,f,t,1,1,-
688,1,18,-0.915403,1,0,aa,v,0.04,f,f,0,750,-


## Industry: Apply variable encoding.
## w = Industrials
## q = Materials
## m = Communication Services
## r = Transport
## cc = IT
## k = Financials
## d = Real Estate
## c = Energy
## x = Utilities
## aa = Consumer Discretionary
## i = Education
## e = Consumer Staples
## ff = Healthcare
## j = Research

In [9]:
# Industry: translate the short codes to sector names; any code without an
# entry is grouped under 'Other'. The names are then integer-encoded.
industry_map = dict(
    w='Industrials',
    q='Materials',
    m='Communication Services',
    r='Transport',
    cc='IT',
    k='Financials',
    d='Real Estate',
    c='Energy',
    x='Utilities',
    aa='Consumer Discretionary',
    i='Education',
    e='Consumer Staples',
    ff='Healthcare',
    j='Research',
)

industry_names = dataset['Industry'].map(industry_map)
dataset['Industry'] = industry_names.fillna('Other')
le = LabelEncoder()
le.fit(dataset['Industry'])
dataset['Industry'] = le.transform(dataset['Industry'])
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,0,8,v,1.25,t,t,1,0,+
1,0,59,-0.060051,1,0,9,h,3.04,t,t,6,560,+
2,0,24,-0.856102,1,0,9,h,1.50,t,f,0,824,+
3,1,28,-0.647038,1,0,8,v,3.75,t,t,5,3,+
4,1,20,0.174141,1,0,8,v,1.71,t,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,1,2,h,1.25,f,f,0,0,-
686,0,23,-0.805846,1,0,4,v,2.00,f,t,2,394,-
687,0,25,1.757198,2,1,6,ff,2.00,f,t,1,1,-
688,1,18,-0.915403,1,0,1,v,0.04,f,f,0,750,-


## Ethnicity: Apply variable encoding.
## v = white
## h = black
## bb = Asian
## ff = Latino
## other = any remaining code (e.g. j, z, ?, o, dd, n); rows with these codes are dropped below

In [10]:
# List the distinct raw ethnicity codes before mapping them.
dataset['Ethnicity'].unique()

array(['v', 'h', 'bb', 'ff', 'j', 'z', '?', 'o', 'dd', 'n'], dtype=object)

In [11]:
# Ethnicity: translate the four known codes to readable labels.
# Codes without an entry in ethnicity_map (the unique() listing above shows
# 'j', 'z', '?', 'o', 'dd', 'n') map to NaN, and those rows are then removed.
# NOTE(review): the section heading lists an "other" category — confirm that
# dropping these rows, rather than labelling them 'Other', is intended.
ethnicity_map = {'v': 'white', 'h': 'black', 'bb': 'Asian', 'ff': 'Latino'}
dataset['Ethnicity'] = dataset['Ethnicity'].map(ethnicity_map)
# .copy() makes the filtered frame independent, so later column assignments
# do not trigger pandas' chained-assignment (SettingWithCopy) warnings.
dataset = dataset.dropna(subset=['Ethnicity']).copy()
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,0,8,white,1.25,t,t,1,0,+
1,0,59,-0.060051,1,0,9,black,3.04,t,t,6,560,+
2,0,24,-0.856102,1,0,9,black,1.50,t,f,0,824,+
3,1,28,-0.647038,1,0,8,white,3.75,t,t,5,3,+
4,1,20,0.174141,1,0,8,white,1.71,t,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,1,2,black,1.25,f,f,0,0,-
686,0,23,-0.805846,1,0,4,white,2.00,f,t,2,394,-
687,0,25,1.757198,2,1,6,Latino,2.00,f,t,1,1,-
688,1,18,-0.915403,1,0,1,white,0.04,f,f,0,750,-


## Years Employed: Convert to a numeric format. Handle non-standard values (e.g., "n/a", text).

In [12]:

# YearsEmployed: parse as numbers (non-numeric text such as "n/a" becomes
# missing), round to whole numbers, and store as nullable Int64.
years_employed = pd.to_numeric(dataset['YearsEmployed'], errors='coerce')
# assign() returns a new DataFrame, so the column is replaced in one step
# without chained-assignment warnings and without silencing warnings globally.
dataset = dataset.assign(YearsEmployed=years_employed.round(0).astype('Int64'))
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,0,8,white,1,t,t,1,0,+
1,0,59,-0.060051,1,0,9,black,3,t,t,6,560,+
2,0,24,-0.856102,1,0,9,black,2,t,f,0,824,+
3,1,28,-0.647038,1,0,8,white,4,t,t,5,3,+
4,1,20,0.174141,1,0,8,white,2,t,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,1,2,black,1,f,f,0,0,-
686,0,23,-0.805846,1,0,4,white,2,f,t,2,394,-
687,0,25,1.757198,2,1,6,Latino,2,f,t,1,1,-
688,1,18,-0.915403,1,0,1,white,0,f,f,0,750,-


## Prior Default: Map values —> 0 = No prior default, 1 = Has prior default
## t = True/Has Prior Default
## f = False/No Prior Default

In [13]:
# PriorDefault: 't' -> 1 (has prior default), 'f' -> 0 (no prior default),
# stored as nullable Int64; any other value becomes <NA>.
# The explicit map already yields the final 0/1 codes, so no LabelEncoder
# pass follows and no global warning filter is installed here.
prior_default_map = {'t': 1, 'f': 0}
dataset['PriorDefault'] = dataset['PriorDefault'].map(prior_default_map).astype('Int64')
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,0,8,white,1,1,t,1,0,+
1,0,59,-0.060051,1,0,9,black,3,1,t,6,560,+
2,0,24,-0.856102,1,0,9,black,2,1,f,0,824,+
3,1,28,-0.647038,1,0,8,white,4,1,t,5,3,+
4,1,20,0.174141,1,0,8,white,2,1,f,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,1,2,black,1,0,f,0,0,-
686,0,23,-0.805846,1,0,4,white,2,0,t,2,394,-
687,0,25,1.757198,2,1,6,Latino,2,0,t,1,1,-
688,1,18,-0.915403,1,0,1,white,0,0,f,0,750,-


## Employed: Map values —> 0 = Not employed, 1 = Employed
## t = True/Employed
## f = False/Not Employed

In [14]:
# Employed: 't' -> 1 (employed), 'f' -> 0 (not employed), stored as nullable
# Int64; any other value becomes <NA>.
# The explicit map already yields the final 0/1 codes, so no LabelEncoder
# pass is applied afterwards (same approach as the BankCustomer column).
employed_map = {'t': 1, 'f': 0}
dataset['Employed'] = dataset['Employed'].map(employed_map).astype('Int64')
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,0,8,white,1,1,1,1,0,+
1,0,59,-0.060051,1,0,9,black,3,1,1,6,560,+
2,0,24,-0.856102,1,0,9,black,2,1,0,0,824,+
3,1,28,-0.647038,1,0,8,white,4,1,1,5,3,+
4,1,20,0.174141,1,0,8,white,2,1,0,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,1,2,black,1,0,0,0,0,-
686,0,23,-0.805846,1,0,4,white,2,0,1,2,394,-
687,0,25,1.757198,2,1,6,Latino,2,0,1,1,1,-
688,1,18,-0.915403,1,0,1,white,0,0,0,0,750,-


## Credit Score: Convert to a numeric format. Handle missing or corrupted entries.

In [15]:
# CreditScore: parse as numbers; corrupted entries become missing.
credit_numeric = pd.to_numeric(dataset['CreditScore'], errors='coerce')
# Fill the gaps with the column median, then store as plain integers.
median_score = credit_numeric.median()
dataset['CreditScore'] = credit_numeric.fillna(median_score).astype(int)
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,0,8,white,1,1,1,1,0,+
1,0,59,-0.060051,1,0,9,black,3,1,1,6,560,+
2,0,24,-0.856102,1,0,9,black,2,1,0,0,824,+
3,1,28,-0.647038,1,0,8,white,4,1,1,5,3,+
4,1,20,0.174141,1,0,8,white,2,1,0,0,0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,1,2,black,1,0,0,0,0,-
686,0,23,-0.805846,1,0,4,white,2,0,1,2,394,-
687,0,25,1.757198,2,1,6,Latino,2,0,1,1,1,-
688,1,18,-0.915403,1,0,1,white,0,0,0,0,750,-


## Income: Convert to a numeric format with two decimal places. Remove any non-numeric characters.

In [16]:
# Income: strip every character except digits and '.', then parse as float
# rounded to two decimals.
# Missing values are stringified to 'nan' first and stripped to '', so they
# end up as NaN again.
# NOTE(review): the pattern also removes '-', so a negative amount would lose
# its sign — confirm Income is never negative.
income_text = (
    dataset['Income']
    .astype(str)
    .str.replace(r'[^0-9.]', '', regex=True)
)
# errors='coerce' turns empty or malformed residue ('', '.', '1.2.3') into NaN
# instead of raising; astype(float) keeps the column float even when every
# value parses as an integer.
dataset['Income'] = (
    pd.to_numeric(income_text, errors='coerce')
    .astype(float)
    .round(2)
)
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,0,8,white,1,1,1,1,0.0,+
1,0,59,-0.060051,1,0,9,black,3,1,1,6,560.0,+
2,0,24,-0.856102,1,0,9,black,2,1,0,0,824.0,+
3,1,28,-0.647038,1,0,8,white,4,1,1,5,3.0,+
4,1,20,0.174141,1,0,8,white,2,1,0,0,0.0,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,1,2,black,1,0,0,0,0.0,-
686,0,23,-0.805846,1,0,4,white,2,0,1,2,394.0,-
687,0,25,1.757198,2,1,6,Latino,2,0,1,1,1.0,-
688,1,18,-0.915403,1,0,1,white,0,0,0,0,750.0,-


## Approved: Map values —> 0 = Not approved, 1 = Approved
## + = Approved
## - = Not Approved

In [17]:
# Approved (target): '+' -> 1 (approved), '-' -> 0 (not approved), then the
# 0/1 values are passed through a LabelEncoder, which is left bound to `le`.
approved_map = dict([('+', 1), ('-', 0)])
approved_codes = dataset['Approved'].map(approved_map)
dataset['Approved'] = approved_codes.astype('Int64')
le = LabelEncoder()
le.fit(dataset['Approved'])
dataset['Approved'] = le.transform(dataset['Approved'])
dataset

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,Income,Approved
0,1,31,-0.956613,1,0,8,white,1,1,1,1,0.0,1
1,0,59,-0.060051,1,0,9,black,3,1,1,6,560.0,1
2,0,24,-0.856102,1,0,9,black,2,1,0,0,824.0,1
3,1,28,-0.647038,1,0,8,white,4,1,1,5,3.0,1
4,1,20,0.174141,1,0,8,white,2,1,0,0,0.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
685,1,21,1.070704,2,1,2,black,1,0,0,0,0.0,0
686,0,23,-0.805846,1,0,4,white,2,0,1,2,394.0,0
687,0,25,1.757198,2,1,6,Latino,2,0,1,1,1.0,0
688,1,18,-0.915403,1,0,1,white,0,0,0,0,750.0,0
