In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [None]:
# Load the cleaned dataset; the relative path is resolved against the
# current working directory.
data = pd.read_csv(filepath_or_buffer='clean_dataset.csv')

# **Basic EDA**

In [None]:
# Preview the first five rows to check column names and value formats.
data.head(n=5)

Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,1,1,Industrials,White,1.25,1,1,1,0,ByBirth,202,0,1
1,0,58.67,4.46,1,1,Materials,Black,3.04,1,1,6,0,ByBirth,43,560,1
2,0,24.5,0.5,1,1,Materials,Black,1.5,1,0,0,0,ByBirth,280,824,1
3,1,27.83,1.54,1,1,Industrials,White,3.75,1,1,5,1,ByBirth,100,3,1
4,1,20.17,5.625,1,1,Industrials,White,1.71,1,0,0,0,ByOtherMeans,120,0,1


In [None]:
# Report the (rows, columns) dimensions of the loaded frame.
print('shape of dataset: {}'.format(data.shape))

shape of dataset: (690, 16)


Split the data into features (`X`) and target (`y`).

In [None]:
# 'Approved' is the label to predict; every remaining column is a candidate feature.
y = data['Approved']
X = data.drop(columns=['Approved'])

In [None]:
# Columns treated as continuous; every other feature column is handled as
# categorical (this includes the 0/1 flag columns and ZipCode).
num = ['Age', 'Debt', 'YearsEmployed', 'CreditScore', 'Income']
cat = list(filter(lambda name: name not in num, X.columns))

# Print how often each level occurs, one categorical column at a time.
print('Frequency of categories per column:')
for column_name in cat:
    print(X[column_name].value_counts())

Frequency of categories per column:
Gender
1    480
0    210
Name: count, dtype: int64
Married
1    525
0    165
Name: count, dtype: int64
BankCustomer
1    527
0    163
Name: count, dtype: int64
Industry
Energy                   146
Materials                 78
Industrials               64
ConsumerDiscretionary     59
ConsumerStaples           54
Healthcare                53
Financials                51
InformationTechnology     41
Utilities                 38
CommunicationServices     38
Real Estate               30
Education                 25
Research                  10
Transport                  3
Name: count, dtype: int64
Ethnicity
White     408
Black     138
Asian      59
Latino     57
Other      28
Name: count, dtype: int64
PriorDefault
1    361
0    329
Name: count, dtype: int64
Employed
0    395
1    295
Name: count, dtype: int64
DriversLicense
0    374
1    316
Name: count, dtype: int64
Citizen
ByBirth         625
ByOtherMeans     57
Temporary         8
Name: count, dtype: 

# **Data preprocessing**

Group low-frequency zip codes (those appearing in fewer than 30 rows) into a single `Others` category.

In [None]:
# Number of rows per zip code, most frequent first.
zipcount = X.loc[:, 'ZipCode'].value_counts()

In [None]:
# Show the ten zip codes with the highest row counts.
print('10 most frequent zip counts: \n{}'.format(zipcount.nlargest(n=10)))

10 most frequent zip counts: 
ZipCode
0      145
120     35
200     35
160     34
80      30
100     30
280     22
180     18
140     16
320     14
Name: count, dtype: int64


In [None]:
# Zip codes seen in at least 30 rows keep their own category; all rarer codes
# are pooled into a single 'Others' level so the one-hot encoding stays small.
# NOTE: despite its name, `count_gt_30` is inclusive of exactly 30 (>=).
count_gt_30 = list(zipcount[zipcount >= 30].index)

# Compare and store zip codes as strings. Writing the 'Others' string into an
# integer column produces a mixed int/str object column that pandas cannot
# sort, so the category order (and therefore the level removed by
# `drop_first=True` when encoding) would depend on row order. Working on the
# string form also keeps this cell safe to re-run after the column has
# already been converted.
frequent_labels = {str(code) for code in count_gt_30}
zip_labels = X['ZipCode'].astype(str)
X['ZipCode'] = zip_labels.where(zip_labels.isin(frequent_labels), 'Others')

Encode categorical values

In [None]:
# One-hot encode the categorical columns as 0/1 integers. The first level of
# each encoded column is dropped so the remaining indicators are not
# perfectly collinear.
dummy = pd.get_dummies(data=X.loc[:, cat], dtype=int, drop_first=True)

In [None]:
# Keep only the non-categorical columns; the categorical ones are carried
# forward through `dummy`.
X = X.drop(columns=cat)

In [None]:
# Place the remaining original columns and the encoded columns side by side,
# aligned on the row index.
X_encoded = pd.concat(objs=[X, dummy], axis='columns')

Split the data into train and test sets.

In [None]:
# Hold out 20% of the rows for testing. Stratifying on `y` keeps the class
# proportions the same in both splits; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=512,
    stratify=y,
)

Scale the numerical features.

In [None]:
# Scaler that centres each fitted feature on its mean and divides by its
# standard deviation (the defaults, spelled out).
sc = StandardScaler(with_mean=True, with_std=True)

In [None]:
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits so no test-set information reaches the
# scaler.
sc.fit(X_train[num])
X_train[num] = sc.transform(X_train[num])
X_test[num] = sc.transform(X_test[num])

# **Model Building**

In [None]:
# Fit a logistic-regression classifier using the estimator's default settings.
lr = LogisticRegression().fit(X_train, y_train)
lr  # show the fitted estimator as the cell output

# **Model Evaluation**

In [None]:
# Predict labels for the held-out rows and measure the share predicted correctly.
y_pred = lr.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Test-set accuracy as returned by accuracy_score(y_test, y_pred).
accuracy

0.8913043478260869