<a href="https://colab.research.google.com/github/Iteba/Data-Science-Practice/blob/main/Credit_Card_Approval_Prediction_Logistic_Regression_Practice/Credit_Card_Approval_Prediction_Logistic_Regression_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data

In [115]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [116]:
# Raw-GitHub locations of the feature table and of the label table.
x_url = 'https://raw.githubusercontent.com/Iteba/Data-Science-Practice/refs/heads/main/Credit_Card_Approval_Prediction_Logistic_Regression_Practice/credit_card.csv'
y_url = 'https://raw.githubusercontent.com/Iteba/Data-Science-Practice/refs/heads/main/Credit_Card_Approval_Prediction_Logistic_Regression_Practice/credit_card_label.csv'

# Read both CSV files straight from their URLs: X gets the features, y the labels.
X, y = pd.read_csv(x_url), pd.read_csv(y_url)

# Exploration

In [117]:
# Summarise the feature table: column names, non-null counts and dtypes.
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548 entries, 0 to 1547
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Ind_ID           1548 non-null   int64  
 1   GENDER           1541 non-null   object 
 2   Car_Owner        1548 non-null   object 
 3   Propert_Owner    1548 non-null   object 
 4   CHILDREN         1548 non-null   int64  
 5   Annual_income    1525 non-null   float64
 6   Type_Income      1548 non-null   object 
 7   EDUCATION        1548 non-null   object 
 8   Marital_status   1548 non-null   object 
 9   Housing_type     1548 non-null   object 
 10  Birthday_count   1526 non-null   float64
 11  Employed_days    1548 non-null   int64  
 12  Mobile_phone     1548 non-null   int64  
 13  Work_Phone       1548 non-null   int64  
 14  Phone            1548 non-null   int64  
 15  EMAIL_ID         1548 non-null   int64  
 16  Type_Occupation  1060 non-null   object 
 17  Family_Members

In [118]:
# Count the missing values in each feature column.
X.isnull().sum()

Unnamed: 0,0
Ind_ID,0
GENDER,7
Car_Owner,0
Propert_Owner,0
CHILDREN,0
Annual_income,23
Type_Income,0
EDUCATION,0
Marital_status,0
Housing_type,0


In [119]:
# Peek at the first three feature rows.
X.head(3)

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2


In [120]:
# Count feature rows that are exact duplicates of an earlier row.
X.duplicated().sum()

np.int64(0)

In [121]:
# Peek at the first three rows of the label table.
y.head(3)

Unnamed: 0,Ind_ID,label
0,5008827,1
1,5009744,1
2,5009746,1


# Preprocessing

## Missing Values

In [122]:
from sklearn.impute import SimpleImputer

In [123]:
# Fill the gaps in the two numeric columns with each column's mean.
# NOTE(review): the imputer is fitted on every row, before the train/test split
# further down, so test rows contribute to the fill values.
mean_fill_cols = ['Annual_income', 'Birthday_count']
mean_imputer = SimpleImputer(strategy = 'mean')
X[mean_fill_cols] = mean_imputer.fit_transform(X[mean_fill_cols])

In [124]:
# Fill missing GENDER entries with the most common value in the column.
gender_imputer = SimpleImputer(strategy = 'most_frequent')
X[['GENDER']] = gender_imputer.fit_transform(X[['GENDER']])

## Feature Selection

In [125]:
# Remove the Ind_ID column (presumably an applicant identifier) so that only the
# label column is left in y.
# Reassigning with errors='ignore' instead of dropping in place keeps this cell
# re-runnable: a second execution no longer raises KeyError for a column that is
# already gone.
# NOTE(review): X and y are never joined on Ind_ID; the rows of both tables are
# assumed to be in the same order — confirm.
y = y.drop(columns='Ind_ID', errors='ignore')

In [126]:
# Drop the Ind_ID column and the sparsely populated Type_Occupation column from
# the features.
# Reassigning with errors='ignore' instead of dropping in place keeps this cell
# re-runnable: a second execution no longer raises KeyError for columns that are
# already gone.
X = X.drop(columns=['Ind_ID', 'Type_Occupation'], errors='ignore')

Close to a third of the `Type_Occupation` values are missing (488 of 1,548 rows), so the column was dropped rather than imputed: it is unclear whether filling in that many values would skew the data.

## Encoding

In [127]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [128]:
# One-hot encoder that returns a dense result and is configured to emit pandas
# DataFrames from its transform calls.
hot_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
hot_encoder = hot_encoder.set_output(transform = 'pandas')

In [129]:
# Single LabelEncoder instance; each later use calls fit_transform, so it is
# re-fitted for every column it is applied to.
label_encoder = LabelEncoder()

In [130]:
# Swap GENDER for its integer one-hot indicator columns (appended on the right).
gender_onehot = hot_encoder.fit_transform(X[['GENDER']]).astype(int)
X = pd.concat([X.drop(columns = 'GENDER'), gender_onehot], axis = 1)

In [131]:
# Swap Type_Income for its integer one-hot indicator columns (appended on the right).
income_type_onehot = hot_encoder.fit_transform(X[['Type_Income']]).astype(int)
X = pd.concat([X.drop(columns = 'Type_Income'), income_type_onehot], axis = 1)

In [132]:
# Swap Marital_status for its integer one-hot indicator columns (appended on the right).
marital_onehot = hot_encoder.fit_transform(X[['Marital_status']]).astype(int)
X = pd.concat([X.drop(columns = 'Marital_status'), marital_onehot], axis = 1)

In [133]:
# Swap EDUCATION for its integer one-hot indicator columns (appended on the right).
education_onehot = hot_encoder.fit_transform(X[['EDUCATION']]).astype(int)
X = pd.concat([X.drop(columns = 'EDUCATION'), education_onehot], axis = 1)

In [134]:
# Swap Housing_type for its integer one-hot indicator columns (appended on the right).
housing_onehot = hot_encoder.fit_transform(X[['Housing_type']]).astype(int)
X = pd.concat([X.drop(columns = 'Housing_type'), housing_onehot], axis = 1)

In [135]:
# Turn the two ownership flag columns into integer codes, re-fitting the shared
# encoder on each column in turn (Propert_Owner first, then Car_Owner).
for owner_col in ('Propert_Owner', 'Car_Owner'):
    X[owner_col] = label_encoder.fit_transform(X[owner_col])

## Feature Scaling

In [62]:
from sklearn.preprocessing import MinMaxScaler

In [136]:
# Min-max scaler configured to emit pandas DataFrames from its transform calls.
scaler = MinMaxScaler()
scaler = scaler.set_output(transform = 'pandas')

In [137]:
# Columns to pass through the min-max scaler; every other column is left untouched.
scaled_cols = [
    'CHILDREN',
    'Annual_income',
    'Birthday_count',
    'Employed_days',
    'Mobile_phone',
    'Work_Phone',
    'Phone',
    'EMAIL_ID',
    'Family_Members',
]

# NOTE(review): the scaler is fitted on every row, before the train/test split
# further down, so the test rows influence the learned min/max.
# NOTE(review): Employed_days holds 365243 on a Pensioner row next to small
# negative values on other rows; if that is a placeholder it will dominate the
# min-max range — confirm.
X[scaled_cols] = scaler.fit_transform(X[scaled_cols])

## Data Split

In [139]:
from sklearn.model_selection import train_test_split

In [148]:
# Flatten the label table into a 1-D NumPy array.
# np.asarray accepts both a DataFrame and an ndarray, so re-running this cell
# after y has already been converted no longer fails (an ndarray has no
# .to_numpy method); the result on first run is the same as before.
y = np.asarray(y).ravel()

In [149]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
# NOTE(review): the split is not stratified although the recorded test support is
# 280 vs 30 — consider stratify=y so both splits keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Training

In [145]:
from sklearn.linear_model import LogisticRegression

In [150]:
# Logistic-regression classifier created with the constructor's default settings.
model = LogisticRegression()

In [151]:
# Fit the classifier on the training split.
model.fit(X_train, y_train)

# Evaluation

## Predictions

In [152]:
# Predict a class label for every row of the held-out test split.
predictions = model.predict(X_test)

In [153]:
# Display the predicted labels; in the recorded output almost all of them are class 0.
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Accuracy

In [154]:
from sklearn.metrics import accuracy_score, classification_report

In [155]:
# Accuracy as a whole-number percentage. The value is scaled to percent first and
# rounded afterwards: rounding to two decimals and then multiplying by 100 can
# print floating-point artifacts (e.g. 0.57 * 100 -> 56.99999999999999).
accuracy_pct = round(accuracy_score(y_test, predictions) * 100, 0)
print('Model Accuracy: ', accuracy_pct, '%\n')

# Per-class precision / recall / F1. Read this alongside the accuracy figure: in
# the recorded run class 1 has a recall of only 0.03 (support 30 vs 280), so the
# headline accuracy mostly reflects the majority class.
print(classification_report(y_test, predictions))

Model Accuracy:  90.0 %

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       280
           1       0.50      0.03      0.06        30

    accuracy                           0.90       310
   macro avg       0.70      0.51      0.51       310
weighted avg       0.87      0.90      0.86       310

