<a href="https://colab.research.google.com/github/Iteba/Data-Science-Practice/blob/main/Credit_Card_Approval_Prediction_Logistic_Regression_Practice/Credit_Card_Approval_Prediction_Logistic_Regression_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Read Data

In [367]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [368]:
features = pd.read_csv('credit_card.csv')
labels = pd.read_csv('credit_card_label.csv')

# Exploration

In [369]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1548 entries, 0 to 1547
Data columns (total 18 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Ind_ID           1548 non-null   int64  
 1   GENDER           1541 non-null   object 
 2   Car_Owner        1548 non-null   object 
 3   Propert_Owner    1548 non-null   object 
 4   CHILDREN         1548 non-null   int64  
 5   Annual_income    1525 non-null   float64
 6   Type_Income      1548 non-null   object 
 7   EDUCATION        1548 non-null   object 
 8   Marital_status   1548 non-null   object 
 9   Housing_type     1548 non-null   object 
 10  Birthday_count   1526 non-null   float64
 11  Employed_days    1548 non-null   int64  
 12  Mobile_phone     1548 non-null   int64  
 13  Work_Phone       1548 non-null   int64  
 14  Phone            1548 non-null   int64  
 15  EMAIL_ID         1548 non-null   int64  
 16  Type_Occupation  1060 non-null   object 
 17  Family_Members

In [370]:
features.describe()

Unnamed: 0,Ind_ID,CHILDREN,Annual_income,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Family_Members
count,1548.0,1548.0,1525.0,1526.0,1548.0,1548.0,1548.0,1548.0,1548.0,1548.0
mean,5078920.0,0.412791,191399.3,-16040.342071,59364.689922,1.0,0.20801,0.309432,0.092377,2.161499
std,41717.59,0.776691,113253.0,4229.503202,137808.062701,0.0,0.406015,0.462409,0.289651,0.947772
min,5008827.0,0.0,33750.0,-24946.0,-14887.0,1.0,0.0,0.0,0.0,1.0
25%,5045070.0,0.0,121500.0,-19553.0,-3174.5,1.0,0.0,0.0,0.0,2.0
50%,5078842.0,0.0,166500.0,-15661.5,-1565.0,1.0,0.0,0.0,0.0,2.0
75%,5115673.0,1.0,225000.0,-12417.0,-431.75,1.0,0.0,1.0,0.0,3.0
max,5150412.0,14.0,1575000.0,-7705.0,365243.0,1.0,1.0,1.0,1.0,15.0


In [371]:
features.head(3)

Unnamed: 0,Ind_ID,GENDER,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Type_Income,EDUCATION,Marital_status,Housing_type,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,Type_Occupation,Family_Members
0,5008827,M,Y,Y,0,180000.0,Pensioner,Higher education,Married,House / apartment,-18772.0,365243,1,0,0,0,,2
1,5009744,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,-13557.0,-586,1,1,1,0,,2
2,5009746,F,Y,N,0,315000.0,Commercial associate,Higher education,Married,House / apartment,,-586,1,1,1,0,,2


In [372]:
features.isnull().sum()

Unnamed: 0,0
Ind_ID,0
GENDER,7
Car_Owner,0
Propert_Owner,0
CHILDREN,0
Annual_income,23
Type_Income,0
EDUCATION,0
Marital_status,0
Housing_type,0


In [373]:
features.duplicated().sum()

0

In [374]:
labels.head(3)

Unnamed: 0,Ind_ID,label
0,5008827,1
1,5009744,1
2,5009746,1


# Preprocessing
Clean and transforming the data

## Missing Values

In [375]:
from sklearn.impute import KNNImputer
import numpy as np

In [376]:
imputer = KNNImputer(n_neighbors=5)

In [377]:
features[['Annual_income']] = imputer.fit_transform(features[['Annual_income']])

In [378]:
features[['Birthday_count']] = imputer.fit_transform(features[['Birthday_count']])

In [379]:
features['GENDER'] = features['GENDER'].fillna(np.random.choice(['M', 'F']))

## Feature **Selection**

In [380]:
labels.drop('Ind_ID', axis=1, inplace=True)

In [381]:
features.drop(['Ind_ID', 'Type_Occupation'], axis=1, inplace=True)

Close to a third of the 'Type_Occupation' column had missing values. Dropped it; not sure if filling those values would skew the data in any way.

## Encoding

In [382]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [383]:
hot_encoder = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore').set_output(transform = 'pandas')

In [384]:
label_encoder = LabelEncoder()

In [385]:
features = pd.concat([features.drop('GENDER', axis = 1), hot_encoder.fit_transform(features[['GENDER']]).astype(int)], axis = 1)

In [386]:
features = pd.concat([features.drop('Type_Income', axis = 1), hot_encoder.fit_transform(features[['Type_Income']]).astype(int)], axis = 1)

In [387]:
features = pd.concat([features.drop('Marital_status', axis = 1), hot_encoder.fit_transform(features[['Marital_status']]).astype(int)], axis = 1)

In [388]:
features = pd.concat([features.drop('EDUCATION', axis = 1), hot_encoder.fit_transform(features[['EDUCATION']]).astype(int)], axis = 1)

In [389]:
features = pd.concat([features.drop('Housing_type', axis = 1), hot_encoder.fit_transform(features[['Housing_type']]).astype(int)], axis = 1)

In [390]:
features['Propert_Owner'] = label_encoder.fit_transform(features['Propert_Owner'])
features['Car_Owner'] = label_encoder.fit_transform(features['Car_Owner'])

## Feature Scaling

In [391]:
from sklearn.preprocessing import MinMaxScaler

In [392]:
scaler = MinMaxScaler().set_output(transform = 'pandas')

In [393]:
features[[
    'CHILDREN',
    'Annual_income',
    'Birthday_count',
    'Employed_days',
    'Mobile_phone',
    'Work_Phone',
    'Phone',
    'EMAIL_ID',
    'Family_Members']] = scaler.fit_transform(features[[
    'CHILDREN',
    'Annual_income',
    'Birthday_count',
    'Employed_days',
    'Mobile_phone',
    'Work_Phone',
    'Phone',
    'EMAIL_ID',
    'Family_Members'
]])

In [394]:
features.head(3)

Unnamed: 0,Car_Owner,Propert_Owner,CHILDREN,Annual_income,Birthday_count,Employed_days,Mobile_phone,Work_Phone,Phone,EMAIL_ID,...,EDUCATION_Higher education,EDUCATION_Incomplete higher,EDUCATION_Lower secondary,EDUCATION_Secondary / secondary special,Housing_type_Co-op apartment,Housing_type_House / apartment,Housing_type_Municipal apartment,Housing_type_Office apartment,Housing_type_Rented apartment,Housing_type_With parents
0,1,1,0.0,0.094891,0.3581,1.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,1,0,0,0,0
1,1,0,0.0,0.182482,0.660577,0.037621,0.0,1.0,1.0,0.0,...,1,0,0,0,0,1,0,0,0,0
2,1,0,0.0,0.182482,0.51654,0.037621,0.0,1.0,1.0,0.0,...,1,0,0,0,0,1,0,0,0,0


## Data Split

In [395]:
from sklearn.model_selection import train_test_split

In [396]:
labels = labels.to_numpy().ravel()

In [397]:
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size=0.2, random_state=42)

# Training

In [398]:
from sklearn.linear_model import LogisticRegression

In [399]:
model = LogisticRegression()

In [400]:
model.fit(X_train, y_train)

# Evaluation

## Coefficients
How much each feature affects the outcome

In [401]:
coefficients = pd.DataFrame(model.coef_.reshape(-1,1), columns=['Coefficients'], index=features.columns)

In [402]:
coefficients

Unnamed: 0,Coefficients
Car_Owner,-0.080025
Propert_Owner,-0.010087
CHILDREN,0.004762
Annual_income,-0.005556
Birthday_count,-0.395224
Employed_days,-1.646999
Mobile_phone,0.0
Work_Phone,0.059227
Phone,0.12456
EMAIL_ID,0.163078


## Predictions

In [403]:
predictions = model.predict(X_test)

In [404]:
predictions

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

## Accuracy

In [409]:
from sklearn.metrics import accuracy_score, classification_report

In [411]:
print('Model Accuracy: ', round(accuracy_score(y_test, predictions), 2) * 100, '%\n')
print(classification_report(y_test, predictions))

Model Accuracy:  90.0 %

              precision    recall  f1-score   support

           0       0.91      1.00      0.95       280
           1       0.50      0.03      0.06        30

    accuracy                           0.90       310
   macro avg       0.70      0.51      0.51       310
weighted avg       0.87      0.90      0.86       310



Not sure about this 🤨