In [1]:
import os

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
#NEW
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.cluster import KMeans


In [2]:
# Location of the customer CSV. The default is the original author's local
# download path; set the BANK_DATA_PATH environment variable to load a copy
# stored elsewhere without editing the notebook.
DATA_PATH = os.environ.get(
    "BANK_DATA_PATH",
    "/Users/jeronimoperezrocha/Library/Containers/com.microsoft.Excel/Data/Downloads/banking_customer_data.csv",
)

# Load the raw customer table into a DataFrame.
bank = pd.read_csv(DATA_PATH)

In [3]:
# Show the loaded frame; pandas elides the middle rows of a long frame in its repr.
bank

Unnamed: 0,Customer_ID,Age,Annual_Income,Employment_Type,Credit_Score,Loan_Amount,Default
0,1,55,122742.05,Self-Employed,371,34566.81,0
1,2,61,114681.86,Unemployed,815,42949.54,1
2,3,30,58585.31,Self-Employed,819,13268.27,0
3,4,26,35330.15,Unemployed,789,43527.17,0
4,5,27,63746.03,Unemployed,608,21396.45,1
...,...,...,...,...,...,...,...
495,496,64,30720.90,Unemployed,485,34914.96,0
496,497,25,103646.36,Self-Employed,746,17544.13,0
497,498,39,33743.59,Unemployed,389,19159.14,0
498,499,34,37525.22,Unemployed,803,23897.67,0


In [4]:
# Count the missing values in each column.
bank.isna().sum()

Customer_ID        0
Age                0
Annual_Income      0
Employment_Type    0
Credit_Score       0
Loan_Amount        0
Default            0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
bank.describe()

Unnamed: 0,Customer_ID,Age,Annual_Income,Credit_Score,Loan_Amount,Default
count,500.0,500.0,500.0,500.0,500.0,500.0
mean,250.5,43.492,90185.0434,566.994,28372.19072,0.158
std,144.481833,14.68923,34996.64596,155.406371,12912.561969,0.365106
min,1.0,18.0,30121.59,300.0,5159.96,0.0
25%,125.75,31.0,59769.31,434.75,16424.23,0.0
50%,250.5,43.0,90965.235,560.5,28840.66,0.0
75%,375.25,56.0,119362.08,704.0,39697.405,0.0
max,500.0,69.0,149972.29,849.0,49928.61,1.0


In [6]:
# Column groups consumed by the preprocessing pipelines.
numeric_features = ['Age', 'Annual_Income', 'Credit_Score']
categorical_features = ['Employment_Type']

# Numeric columns: fill gaps with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' keeps categories that were not seen during
# fit from raising an error at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

In [7]:
# Bundle the preprocessing: each column group is routed through its own
# transformer inside a single ColumnTransformer step.
column_transformers = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_transformers)

In [8]:
# Define the feature matrix X and the target y.
# X is assembled from the same column lists the preprocessor is configured
# with (numeric columns first, then categorical).
X = bank[numeric_features + categorical_features]
y = bank['Default']

In [9]:
# Hold out 25% of the rows for evaluation; the fixed seed keeps the split reproducible.
# NOTE(review): the split is not stratified. describe() reports a mean Default
# of about 0.16, so consider stratify=y to keep the class balance in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [10]:
# Full modeling pipeline: shared preprocessing followed by a
# logistic-regression classifier with a fixed seed.
classifier = LogisticRegression(random_state=42)
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', classifier),
])

In [11]:
# Train the pipeline on the training split only.
model.fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Fraction of held-out rows classified correctly.
# NOTE(review): describe() reports a mean Default of about 0.16, so always
# predicting "no default" would already score roughly 0.84 on the full data;
# confirm the model beats that baseline before relying on accuracy alone.
test_accuracy = accuracy_score(y_test, y_pred)
test_accuracy

0.864

In [12]:
#Unsupervised Learning  - kmeans clustering
#This does not require splitting into training and test sets

# K-means groups rows by Euclidean distance, so a feature measured on a much
# larger scale dominates the result. describe() shows Annual_Income in the tens
# of thousands against Age (18-69) and Credit_Score (300-849), so the features
# are standardized first to let all three contribute.
cluster_features = ['Age', 'Annual_Income', 'Credit_Score']
cluster_scaler = StandardScaler()
cluster_inputs = cluster_scaler.fit_transform(bank[cluster_features])

# n_init is set explicitly so the result does not depend on the installed
# scikit-learn version's default (the recorded run emitted the n_init
# default-change warning).
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)

# Store each customer's cluster label on the frame.
# kmeans.cluster_centers_ are in standardized units; use
# cluster_scaler.inverse_transform(kmeans.cluster_centers_) to read them in the
# original units.
bank['cluster'] = kmeans.fit_predict(cluster_inputs)

  super()._check_params_vs_input(X, default_n_init=10)
