In [116]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegressionCV
# Load the credit-card application records from a CSV file in the working directory.
credit = pd.read_csv('creditcards.csv')

# Print the schema summary (column names, non-null counts, dtypes, memory usage).
credit.info()
# Last expression of the cell, so Jupyter renders the first five rows as its output.
credit.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 690 entries, 0 to 689
Data columns (total 16 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Gender          690 non-null    int64  
 1   Age             690 non-null    float64
 2   Debt            690 non-null    float64
 3   Married         690 non-null    int64  
 4   BankCustomer    690 non-null    int64  
 5   Industry        690 non-null    object 
 6   Ethnicity       690 non-null    object 
 7   YearsEmployed   690 non-null    float64
 8   PriorDefault    690 non-null    int64  
 9   Employed        690 non-null    int64  
 10  CreditScore     690 non-null    int64  
 11  DriversLicense  690 non-null    int64  
 12  Citizen         690 non-null    object 
 13  ZipCode         690 non-null    int64  
 14  Income          690 non-null    int64  
 15  Approved        690 non-null    int64  
dtypes: float64(3), int64(10), object(3)
memory usage: 86.4+ KB


Unnamed: 0,Gender,Age,Debt,Married,BankCustomer,Industry,Ethnicity,YearsEmployed,PriorDefault,Employed,CreditScore,DriversLicense,Citizen,ZipCode,Income,Approved
0,1,30.83,0.0,1,1,Industrials,White,1.25,1,1,1,0,ByBirth,202,0,1
1,0,58.67,4.46,1,1,Materials,Black,3.04,1,1,6,0,ByBirth,43,560,1
2,0,24.5,0.5,1,1,Materials,Black,1.5,1,0,0,0,ByBirth,280,824,1
3,1,27.83,1.54,1,1,Industrials,White,3.75,1,1,5,1,ByBirth,100,3,1
4,1,20.17,5.625,1,1,Industrials,White,1.71,1,0,0,0,ByOtherMeans,120,0,1


Selecting the categorical columns and creating dummy variables

In [117]:
# Remove the continuous / count-valued columns so that only binary flags and
# text categories remain as predictors.
# errors='ignore' keeps the cell re-runnable: `credit` is overwritten here, so
# on a second execution these columns are already gone and a plain drop()
# would raise KeyError.
numeric_columns = ['Age', 'Debt', 'YearsEmployed', 'CreditScore', 'ZipCode', 'Income']
credit = credit.drop(columns=numeric_columns, errors='ignore')

# One-hot encode the remaining text columns, dropping the first level of each
# so every category set is represented by k-1 indicators.
# dtype=int pins the indicators to 0/1 integers; without it the result dtype
# depends on the pandas version (uint8 before 2.0, bool from 2.0 on).
credit = pd.get_dummies(credit, drop_first=True, dtype=int)


Getting training and testing sets

In [118]:
# Target vector: the 'Approved' column, copied so it is independent of `credit`.
y = credit['Approved'].copy()
# Feature matrix: every other column. drop() returns a new frame, so `credit`
# itself is left unchanged.
X = credit.drop(columns=['Approved'])

# Hold out 25% of the rows for evaluation; the fixed seed makes the split
# identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0
)

X.head()

Unnamed: 0,Gender,Married,BankCustomer,PriorDefault,Employed,DriversLicense,Industry_ConsumerDiscretionary,Industry_ConsumerStaples,Industry_Education,Industry_Energy,...,Industry_Real Estate,Industry_Research,Industry_Transport,Industry_Utilities,Ethnicity_Black,Ethnicity_Latino,Ethnicity_Other,Ethnicity_White,Citizen_ByOtherMeans,Citizen_Temporary
0,1,1,1,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,1,1,1,1,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
2,0,1,1,1,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,1,1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
4,1,1,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,1,0


Categorical Naive Bayes with default settings (no prior adjustment)

In [119]:
# Categorical Naive Bayes with default settings.
cat = CategoricalNB()
# Fit once; the fitted model is reused for every metric below (fitting is
# deterministic, so a second fit would only repeat the same work).
cat.fit(X_train, y_train)

# make predictions (single pass over the test set)
expected = y_test
predicted = cat.predict(X_test)
y_pred = predicted  # same predictions, kept under the name used elsewhere

print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (expected != predicted).sum()))
print(cat)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# mean accuracy on the held-out test set
print(cat.score(X_test, y_test))

Number of mislabeled points out of a total 173 points : 34
CategoricalNB()
              precision    recall  f1-score   support

           0       0.80      0.85      0.82        94
           1       0.81      0.75      0.78        79

    accuracy                           0.80       173
   macro avg       0.80      0.80      0.80       173
weighted avg       0.80      0.80      0.80       173

[[80 14]
 [20 59]]
0.8034682080924855


Adjusting the class priors of the Categorical Naive Bayes model

In [120]:
# Categorical Naive Bayes with fixed, uniform class priors.
# class_prior follows the order of the class labels: (P(Approved=0), P(Approved=1)),
# i.e. (No, Yes).
cat = CategoricalNB(class_prior = (.5,.5))
# Fit once; the fitted model is reused for every metric below (fitting is
# deterministic, so a second fit would only repeat the same work).
cat.fit(X_train, y_train)

# make predictions (single pass over the test set)
expected = y_test
predicted = cat.predict(X_test)
y_pred = predicted  # same predictions, kept under the name used elsewhere

print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (expected != predicted).sum()))
print(cat)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# mean accuracy on the held-out test set
print(cat.score(X_test, y_test))

Number of mislabeled points out of a total 173 points : 34
CategoricalNB(class_prior=(0.5, 0.5))
              precision    recall  f1-score   support

           0       0.82      0.82      0.82        94
           1       0.78      0.78      0.78        79

    accuracy                           0.80       173
   macro avg       0.80      0.80      0.80       173
weighted avg       0.80      0.80      0.80       173

[[77 17]
 [17 62]]
0.8034682080924855


Adjusting the class priors of the Categorical Naive Bayes model further

In [121]:
# Categorical Naive Bayes with class priors shifted towards approval.
# class_prior follows the order of the class labels: (P(Approved=0), P(Approved=1)),
# i.e. (No, Yes).
cat = CategoricalNB(class_prior = (.4,.6))
# Fit once; the fitted model is reused for every metric below (fitting is
# deterministic, so a second fit would only repeat the same work).
cat.fit(X_train, y_train)

# make predictions (single pass over the test set)
expected = y_test
predicted = cat.predict(X_test)
y_pred = predicted  # same predictions, kept under the name used elsewhere

print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (expected != predicted).sum()))
print(cat)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# mean accuracy on the held-out test set
print(cat.score(X_test, y_test))

Number of mislabeled points out of a total 173 points : 30
CategoricalNB(class_prior=(0.4, 0.6))
              precision    recall  f1-score   support

           0       0.86      0.82      0.84        94
           1       0.80      0.84      0.81        79

    accuracy                           0.83       173
   macro avg       0.83      0.83      0.83       173
weighted avg       0.83      0.83      0.83       173

[[77 17]
 [13 66]]
0.8265895953757225


Random Forest Classification

In [122]:
# Random forest of 100 trees. random_state is fixed (same seed as the
# train/test split) because tree bootstrapping and feature sampling are
# random: without a seed the metrics below change on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
# Fit once; every metric below is computed from this single fitted forest.
clf.fit(X_train, y_train)
print(clf)

# make predictions (single pass over the test set)
expected2 = y_test
predicted2 = clf.predict(X_test)
y_pred = predicted2  # same predictions, kept under the name used elsewhere

print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (expected2 != predicted2).sum()))
# summarize the fit of the model
print(metrics.classification_report(expected2, predicted2))
print(metrics.confusion_matrix(expected2, predicted2))
# mean accuracy on the held-out test set
print(clf.score(X_test, y_test))

RandomForestClassifier()
Number of mislabeled points out of a total 173 points : 26
              precision    recall  f1-score   support

           0       0.83      0.91      0.87        94
           1       0.88      0.77      0.82        79

    accuracy                           0.85       173
   macro avg       0.86      0.84      0.85       173
weighted avg       0.85      0.85      0.85       173

[[86  8]
 [18 61]]
0.8497109826589595


Logistic Regression

In [123]:
# Logistic regression with built-in 10-fold cross-validation; max_iter is
# raised to 500 to give the solver more iterations.
logregcv = LogisticRegressionCV(cv=10, random_state=0, max_iter=500)
# Fit once: each fit runs the whole cross-validation, so repeating it is
# costly and yields the same model.
logregcv.fit(X_train, y_train)

# make predictions (single pass over the test set)
expected3 = y_test
predicted3 = logregcv.predict(X_test)
y_pred = predicted3  # same predictions, kept under the name used elsewhere

print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (expected3 != predicted3).sum()))
# summarize the fit of the model
print(metrics.classification_report(expected3, predicted3))
print(metrics.confusion_matrix(expected3, predicted3))
# mean accuracy on the held-out test set
print(logregcv.score(X_test, y_test))

Number of mislabeled points out of a total 173 points : 27
              precision    recall  f1-score   support

           0       0.85      0.86      0.86        94
           1       0.83      0.82      0.83        79

    accuracy                           0.84       173
   macro avg       0.84      0.84      0.84       173
weighted avg       0.84      0.84      0.84       173

[[81 13]
 [14 65]]
0.8439306358381503
