In [33]:
import pandas as pd       
import matplotlib as mat
import matplotlib.pyplot as plt    
import numpy as np
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import pickle
from importlib import reload  
import logging
logger = logging.getLogger()
fhandler = logging.FileHandler(filename='mylog.log', mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)
logging.info('We are starting the data analysis in Jupyter Notebook')

In [34]:
data=pd.read_csv(r"C:\credit_card_defaulter\notebooks\data\CreditCard.csv")

In [35]:
data.drop(labels=["ID"],axis=1,inplace=True)


In [36]:
data.head()

Unnamed: 0,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,PAY_5,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,20000.0,2,2,1,24,2,2,-1,-1,-2,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,120000.0,2,2,2,26,-1,2,0,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,90000.0,2,2,2,34,0,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,50000.0,2,2,1,37,0,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,50000.0,1,2,1,57,-1,0,-1,0,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0


In [37]:
X=data.drop(labels=["default.payment.next.month"],axis=1)

In [38]:
y=data[["default.payment.next.month"]]

In [39]:
from sklearn.impute import SimpleImputer ## HAndling Missing Values
from sklearn.preprocessing import StandardScaler # HAndling Feature Scaling
from sklearn.preprocessing import OrdinalEncoder # Ordinal Encoding
## pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [40]:
numerical_cols=X.select_dtypes(exclude='object').columns

In [41]:
num_pipeline=Pipeline(
    
    
    steps=[
        
        ('imputer',SimpleImputer()),
        ('scaler', StandardScaler())
    ]
    
    
)

In [42]:
preprocessor=ColumnTransformer(
    [
        
        ('num_pipeline',num_pipeline,numerical_cols)
    ]
)

In [43]:

from sklearn.model_selection import train_test_split

In [44]:
## Train test split

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=30)

In [45]:
y

Unnamed: 0,default.payment.next.month
0,1
1,1
2,0
3,0
4,0
...,...
29995,0
29996,0
29997,1
29998,1


In [46]:
preprocessor.fit_transform(X_train)


array([[-1.13900338, -1.23108594,  0.18989225, ..., -0.26222501,
        -0.16991997, -0.28995798],
       [-0.13853957, -1.23108594, -1.07605611, ..., -0.31394799,
        -0.3050631 , -0.28995798],
       [ 0.63104798,  0.81229098, -1.07605611, ...,  0.08087074,
        -0.05761794, -0.02196538],
       ...,
       [ 0.70800674,  0.81229098,  0.18989225, ..., -0.11501346,
        -0.05127319, -0.06783399],
       [-0.90812712, -1.23108594, -1.07605611, ..., -0.2880865 ,
         0.01280876, -0.26830089],
       [ 0.938883  ,  0.81229098,  0.18989225, ..., -0.06209688,
         0.00570264, -0.01041494]])

In [47]:
preprocessor.transform(X_test)

array([[-1.13900338, -1.23108594,  0.18989225, ..., -0.31394799,
        -0.27778068, -0.23442698],
       [ 0.32321296,  0.81229098,  0.18989225, ..., -0.31394799,
        -0.3050631 , -0.28995798],
       [ 1.86238806, -1.23108594, -1.07605611, ...,  0.87793508,
         1.88501685, -0.26946704],
       ...,
       [-1.13900338, -1.23108594,  0.18989225, ..., -0.28788757,
        -0.27873239, -0.25575088],
       [-0.6002921 ,  0.81229098,  1.45584062, ...,  1.00140712,
        -0.24250389,  0.91034453],
       [ 0.86192425, -1.23108594,  0.18989225, ..., -0.16116627,
        -0.14619062, -0.16201456]])

In [48]:
preprocessor.get_feature_names_out()

array(['num_pipeline__LIMIT_BAL', 'num_pipeline__SEX',
       'num_pipeline__EDUCATION', 'num_pipeline__MARRIAGE',
       'num_pipeline__AGE', 'num_pipeline__PAY_0', 'num_pipeline__PAY_2',
       'num_pipeline__PAY_3', 'num_pipeline__PAY_4',
       'num_pipeline__PAY_5', 'num_pipeline__PAY_6',
       'num_pipeline__BILL_AMT1', 'num_pipeline__BILL_AMT2',
       'num_pipeline__BILL_AMT3', 'num_pipeline__BILL_AMT4',
       'num_pipeline__BILL_AMT5', 'num_pipeline__BILL_AMT6',
       'num_pipeline__PAY_AMT1', 'num_pipeline__PAY_AMT2',
       'num_pipeline__PAY_AMT3', 'num_pipeline__PAY_AMT4',
       'num_pipeline__PAY_AMT5', 'num_pipeline__PAY_AMT6'], dtype=object)

In [49]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [50]:
X_train

Unnamed: 0,num_pipeline__LIMIT_BAL,num_pipeline__SEX,num_pipeline__EDUCATION,num_pipeline__MARRIAGE,num_pipeline__AGE,num_pipeline__PAY_0,num_pipeline__PAY_2,num_pipeline__PAY_3,num_pipeline__PAY_4,num_pipeline__PAY_5,...,num_pipeline__BILL_AMT3,num_pipeline__BILL_AMT4,num_pipeline__BILL_AMT5,num_pipeline__BILL_AMT6,num_pipeline__PAY_AMT1,num_pipeline__PAY_AMT2,num_pipeline__PAY_AMT3,num_pipeline__PAY_AMT4,num_pipeline__PAY_AMT5,num_pipeline__PAY_AMT6
0,-1.139003,-1.231086,0.189892,0.855242,-1.031921,-0.872883,-0.720412,-0.694606,-0.665419,-1.532938,...,-0.667972,-0.683707,-0.669670,-0.629882,-0.259967,-0.219680,-0.284034,-0.262225,-0.169920,-0.289958
1,-0.138540,-1.231086,-1.076056,-1.060909,0.599775,0.901613,-1.558100,-0.694606,-0.665419,-0.647261,...,-0.651531,-0.586762,-0.662746,-0.652089,-0.333370,-0.166585,0.008592,-0.313948,-0.305063,-0.289958
2,0.631048,0.812291,-1.076056,0.855242,-0.705581,1.788862,1.792653,1.821359,1.910872,2.009769,...,1.124524,1.227215,1.416612,1.498716,0.010071,-0.060316,-0.284034,0.080871,-0.057618,-0.021965
3,1.477594,-1.231086,-1.076056,-1.060909,0.382216,0.901613,-1.558100,-1.533261,-1.524183,-0.647261,...,-0.674425,-0.671125,-0.651354,-0.533990,-0.333370,-0.237431,-0.284034,-0.268127,0.140338,-0.289958
4,0.861924,-1.231086,0.189892,-1.060909,0.926114,0.014365,0.117276,0.144049,0.193345,0.238416,...,1.080023,1.177725,1.212598,1.283075,0.121951,-0.059174,-0.055687,-0.028477,-0.038076,-0.067667
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20995,-1.062045,0.812291,0.189892,-1.060909,0.273436,0.014365,0.117276,0.144049,0.193345,2.009769,...,-0.351942,-0.295819,-0.248487,-0.238640,-0.216949,-0.178393,-0.176273,-0.214481,-0.305063,-0.223321
20996,0.477130,0.812291,0.189892,-1.060909,-0.379242,-0.872883,0.117276,0.144049,0.193345,0.238416,...,-0.635504,-0.613340,-0.662746,-0.652089,-0.272482,-0.198072,-0.230154,-0.313948,-0.305063,2.195165
20997,0.708007,0.812291,0.189892,-1.060909,-0.379242,0.014365,0.117276,0.144049,0.193345,0.238416,...,0.790379,0.579140,0.691576,0.774827,-0.024855,-0.099675,-0.122392,-0.115013,-0.051273,-0.067834
20998,-0.908127,-1.231086,-1.076056,0.855242,0.273436,-0.872883,-0.720412,-0.694606,-0.665419,-0.647261,...,-0.668845,-0.665067,-0.656317,-0.567805,-0.310668,-0.222081,-0.263021,-0.288087,0.012809,-0.268301


In [51]:
X_test

Unnamed: 0,num_pipeline__LIMIT_BAL,num_pipeline__SEX,num_pipeline__EDUCATION,num_pipeline__MARRIAGE,num_pipeline__AGE,num_pipeline__PAY_0,num_pipeline__PAY_2,num_pipeline__PAY_3,num_pipeline__PAY_4,num_pipeline__PAY_5,...,num_pipeline__BILL_AMT3,num_pipeline__BILL_AMT4,num_pipeline__BILL_AMT5,num_pipeline__BILL_AMT6,num_pipeline__PAY_AMT1,num_pipeline__PAY_AMT2,num_pipeline__PAY_AMT3,num_pipeline__PAY_AMT4,num_pipeline__PAY_AMT5,num_pipeline__PAY_AMT6
0,-1.139003,-1.231086,0.189892,-1.060909,0.164656,0.014365,0.117276,0.144049,0.193345,2.009769,...,-0.517827,-0.495906,-0.481219,-0.521625,-0.202513,-0.178393,-0.243247,-0.313948,-0.277781,-0.234427
1,0.323213,0.812291,0.189892,0.855242,1.034894,0.901613,-1.558100,-1.533261,-1.524183,-1.532938,...,-0.674425,-0.671125,-0.662746,-0.652089,-0.333370,-0.237431,-0.284034,-0.313948,-0.305063,-0.289958
2,1.862388,-1.231086,-1.076056,0.855242,-0.923141,0.014365,-0.720412,-0.694606,0.193345,-0.647261,...,-0.450088,-0.041830,-0.367911,-0.075875,0.603235,0.395304,1.882136,0.877935,1.885017,-0.269467
3,0.323213,0.812291,-1.076056,0.855242,-0.705581,-1.760132,-1.558100,-1.533261,-1.524183,-1.532938,...,-0.674468,-0.671172,-0.652030,-0.620966,-0.333370,-0.237431,-0.284034,-0.270647,-0.187114,-0.211604
4,0.015378,0.812291,-1.076056,0.855242,0.382216,0.014365,0.117276,0.144049,0.193345,0.238416,...,-0.364491,-0.309084,-0.413231,-0.592753,-0.041678,-0.040636,0.068238,-0.037628,-0.081284,0.127691
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8995,0.400172,-1.231086,-1.076056,-1.060909,0.382216,0.014365,0.117276,0.144049,0.193345,0.238416,...,-0.497393,-0.599515,-0.602276,-0.665699,-0.215960,-0.119354,-0.230154,-0.313948,-0.305063,-0.123365
8996,1.246718,0.812291,-1.076056,-1.060909,0.382216,0.014365,0.117276,0.144049,0.193345,0.238416,...,0.014170,0.108582,0.193568,0.289392,-0.100529,-0.119354,-0.122392,-0.115013,0.012174,-0.178896
8997,-1.139003,-1.231086,0.189892,0.855242,-0.379242,0.014365,0.117276,0.144049,0.193345,0.238416,...,-0.453093,-0.498982,-0.476405,-0.470936,-0.234412,-0.178865,-0.220886,-0.287888,-0.278732,-0.255751
8998,-0.600292,0.812291,1.455841,0.855242,-1.358260,0.014365,0.117276,-0.694606,-0.665419,0.238416,...,-0.361972,-0.499697,-0.170871,-0.635501,1.412941,0.943338,0.310594,1.001407,-0.242504,0.910345


In [52]:
y_train

Unnamed: 0,default.payment.next.month
777,0
5436,0
19260,1
2643,0
26905,0
...,...
500,0
12077,0
15277,0
4517,1


In [54]:
###"""linear regression
#ridge regression
#lasso regression
#elastic net"""

In [55]:
## Model Training

#from sklearn.linear_model import LinearRegression,Lasso,Ridge,ElasticNet
#from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [56]:
import numpy as np
def evaluate_model(true, predicted):
    mae = mean_absolute_error(true, predicted)
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mean_squared_error(true, predicted))
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [58]:
## Train multiple models

###models={
#    'LinearRegression':LinearRegression(),
#    'Lasso':Lasso(),
#    'Ridge':Ridge(),
#    'Elasticnet':ElasticNet()
#}

In [59]:
###trained_model_list=[]
##model_list=[]
#r2_list=[]

In [60]:
#for i in range(len(list(models))):
 #   model=list(models.values())[i]
   # print(model)

In [61]:
#models.keys()

In [62]:
#models.values()

In [63]:
###for i in range(len(list(models))):
 #  model.fit(X_train,y_train)

    #Make Predictions
  #  y_pred=model.predict(X_test)

    #this is a validation(test) score
   # mae, rmse, r2_square=evaluate_model(y_test,y_pred)

    #print(list(models.keys())[i])
    #model_list.append(list(models.keys())[i])

    #print('Model Training Performance')
    #print("RMSE:",rmse)
    #print("MAE:",mae)
    #print("R2 score",r2_square*100)

    #r2_list.append(r2_square)
    
    #print('='*35)
    #print('\n')##

In [64]:
 from sklearn.metrics import accuracy_score

##random forest model


In [None]:
rfm = RandomForestClassifier(n_estimators=20,oob_score=True,n_jobs=1,random_state=42,max_features=None,min_samples_leaf=10)
rfm.fit(X_train,y_train)
rfm_y_pred = rfm.predict(X_test)
print(accuracy_score(rfm_y_pred,y_test))

0.8202222222222222


In [65]:
rfm = RandomForestClassifier(n_estimators=20,oob_score=True,n_jobs=1,random_state=42,max_features=None,min_samples_leaf=10)

In [66]:
rfm.fit(X_train,y_train)

In [67]:
rfm_y_pred = rfm.predict(X_test)
print(accuracy_score(rfm_y_pred,y_test))

0.8202222222222222
