In [2]:
import pandas as pd 
import numpy as np 

In [3]:
df=pd.read_csv("./data/UCI_Credit_Card.csv")

In [4]:
df.rename(columns={'default.payment.next.month':'def_pay'}, inplace=True)
df.rename(columns={'PAY_0':'PAY_1'}, inplace=True)

In [5]:
bins = [20,30,40,50,60,70,80]
names = ['21-30','31-40','41-50','51-60','61-70','71-80']
df['AGE_BIN'] = pd.cut(x=df.AGE, bins=bins, labels=names, right=True)

In [108]:
df.SEX = df.SEX.astype("object")
df.EDUCATION = df.EDUCATION.astype("object")
df.MARRIAGE = df.MARRIAGE.astype("object")
df.PAY_1 = df.PAY_1.astype("object")
df.PAY_2 = df.PAY_2.astype("object")
df.PAY_3 = df.PAY_3.astype("object")
df.PAY_4 = df.PAY_4.astype("object")
df.PAY_5 = df.PAY_5.astype("object")
df.PAY_6 = df.PAY_6.astype("object")
df.def_type = df.def_pay.astype("object")
df.def_type = df.AGE_BIN.astype("object")

In [115]:
df_X = df.drop(['def_pay','AGE_BIN'], axis=1)
df_y = df.def_pay

In [116]:
df_X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 24 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   ID         30000 non-null  int64  
 1   LIMIT_BAL  30000 non-null  float64
 2   SEX        30000 non-null  object 
 3   EDUCATION  30000 non-null  object 
 4   MARRIAGE   30000 non-null  object 
 5   AGE        30000 non-null  int64  
 6   PAY_1      30000 non-null  object 
 7   PAY_2      30000 non-null  object 
 8   PAY_3      30000 non-null  object 
 9   PAY_4      30000 non-null  object 
 10  PAY_5      30000 non-null  object 
 11  PAY_6      30000 non-null  object 
 12  BILL_AMT1  30000 non-null  float64
 13  BILL_AMT2  30000 non-null  float64
 14  BILL_AMT3  30000 non-null  float64
 15  BILL_AMT4  30000 non-null  float64
 16  BILL_AMT5  30000 non-null  float64
 17  BILL_AMT6  30000 non-null  float64
 18  PAY_AMT1   30000 non-null  float64
 19  PAY_AMT2   30000 non-null  float64
 20  PAY_AM

In [117]:
categorical_cols=df_X.select_dtypes(include='object').columns

In [118]:
numerical_cols=df_X.select_dtypes(exclude='object').columns

In [119]:
categorical_cols

Index(['SEX', 'EDUCATION', 'MARRIAGE', 'PAY_1', 'PAY_2', 'PAY_3', 'PAY_4',
       'PAY_5', 'PAY_6'],
      dtype='object')

In [120]:
numerical_cols

Index(['ID', 'LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')

In [121]:
df_X.select_dtypes(include='object')

Unnamed: 0,SEX,EDUCATION,MARRIAGE,PAY_1,PAY_2,PAY_3,PAY_4,PAY_5,PAY_6
0,2,2,1,2,2,-1,-1,-2,-2
1,2,2,2,-1,2,0,0,0,2
2,2,2,2,0,0,0,0,0,0
3,2,2,1,0,0,0,0,0,0
4,1,2,1,-1,0,-1,0,0,0
...,...,...,...,...,...,...,...,...,...
29995,1,3,1,0,0,0,0,0,0
29996,1,3,2,-1,-1,-1,-1,0,0
29997,1,2,2,4,3,2,-1,0,0
29998,1,3,1,1,-1,0,0,0,-1


In [122]:
SEX_categories = [1,2]
EDUCATION_categories = [0,1,2,3,4,5,6]
MARRIAGE_categories = [0,1,2,3]
PAY_categories = [-1,-2,0,1,2,3,4,5,6,7,8]

In [123]:
numerical_cols

Index(['ID', 'LIMIT_BAL', 'AGE', 'BILL_AMT1', 'BILL_AMT2', 'BILL_AMT3',
       'BILL_AMT4', 'BILL_AMT5', 'BILL_AMT6', 'PAY_AMT1', 'PAY_AMT2',
       'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6'],
      dtype='object')

In [124]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [125]:
num_pipeline=Pipeline(
    
    
    steps=[
        
        ('imputer',SimpleImputer()),
        ('scaler', StandardScaler())
    ]
    
    
)

In [126]:
cat_pipeline=Pipeline(
    
    steps=[
        ('imputer',SimpleImputer(strategy='most_frequent')),
        ('ordinalencoder',OrdinalEncoder(categories=[SEX_categories,EDUCATION_categories,MARRIAGE_categories,PAY_categories,PAY_categories,PAY_categories,PAY_categories,PAY_categories,PAY_categories]))
    ]
    
)

In [127]:
preprocessor=ColumnTransformer(
    [
        
        ('num_pipeline',num_pipeline,numerical_cols),
        ('cat_pipeline',cat_pipeline,categorical_cols)
    ]
)

In [128]:
from sklearn.model_selection import train_test_split

X_train,X_test,y_train,y_test=train_test_split(df_X,df_y,test_size=0.30,random_state=30)

In [129]:
preprocessor.fit_transform(X_train)

array([[-1.64367185, -1.13900338, -1.03192059, ...,  0.        ,
         1.        ,  0.        ],
       [-1.10533554, -0.13853957,  0.59977506, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.49199462,  0.63104798, -0.70558146, ...,  4.        ,
         4.        ,  4.        ],
       ...,
       [ 0.03176849,  0.70800674, -0.37924233, ...,  2.        ,
         2.        ,  2.        ],
       [-1.21152379, -0.90812712,  0.27343593, ...,  0.        ,
         0.        ,  0.        ],
       [-1.04883275,  0.938883  ,  1.90513158, ...,  0.        ,
         0.        ,  0.        ]])

In [130]:
preprocessor.transform(X_test)

array([[-0.84292691, -1.13900338,  0.16465622, ...,  2.        ,
         4.        ,  2.        ],
       [ 0.28088914,  0.32321296,  1.0348939 , ...,  1.        ,
         1.        ,  1.        ],
       [ 0.0769476 ,  1.86238806, -0.92314088, ...,  2.        ,
         0.        ,  0.        ],
       ...,
       [ 1.0800164 , -1.13900338, -0.37924233, ...,  2.        ,
         2.        ,  2.        ],
       [ 1.49067261, -0.6002921 , -1.35825972, ...,  0.        ,
         2.        ,  0.        ],
       [ 1.12057361,  0.86192425,  1.25245332, ...,  2.        ,
         2.        ,  2.        ]])

In [131]:
preprocessor.get_feature_names_out()

array(['num_pipeline__ID', 'num_pipeline__LIMIT_BAL', 'num_pipeline__AGE',
       'num_pipeline__BILL_AMT1', 'num_pipeline__BILL_AMT2',
       'num_pipeline__BILL_AMT3', 'num_pipeline__BILL_AMT4',
       'num_pipeline__BILL_AMT5', 'num_pipeline__BILL_AMT6',
       'num_pipeline__PAY_AMT1', 'num_pipeline__PAY_AMT2',
       'num_pipeline__PAY_AMT3', 'num_pipeline__PAY_AMT4',
       'num_pipeline__PAY_AMT5', 'num_pipeline__PAY_AMT6',
       'cat_pipeline__SEX', 'cat_pipeline__EDUCATION',
       'cat_pipeline__MARRIAGE', 'cat_pipeline__PAY_1',
       'cat_pipeline__PAY_2', 'cat_pipeline__PAY_3',
       'cat_pipeline__PAY_4', 'cat_pipeline__PAY_5',
       'cat_pipeline__PAY_6'], dtype=object)

In [132]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [133]:
## Model Training

from sklearn.linear_model import LogisticRegression , RidgeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error 

In [134]:
model1 = LogisticRegression()
model1.fit(X_train,y_train)

    #Make Predictions
y_pred=model1.predict(X_test)



In [135]:
print(classification_report(y_pred, y_test))
print(confusion_matrix(y_pred, y_test))
print('\nAccuracy Score for model1: ', accuracy_score(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.97      0.82      0.89      8386
           1       0.22      0.68      0.33       614

    accuracy                           0.81      9000
   macro avg       0.59      0.75      0.61      9000
weighted avg       0.92      0.81      0.85      9000

[[6868 1518]
 [ 196  418]]

Accuracy Score for model1:  0.8095555555555556
