# Fitting a Logistic Regression Model on Credit Card Data

<b> Prepate the dataset until you have split the dataset into training and test sets  </b>

In [1]:
# import packages
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# url_path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'

In [3]:
# load the data
credData = pd.read_csv(url_path, header=None, na_values='?')
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [4]:
credData.loc[credData[15] == '+', 15] = 1
credData.loc[credData[15] == '-', 15] = 0
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [5]:
credData.shape

(690, 16)

In [6]:
credData.isnull().sum()

0     12
1     12
2      0
3      6
4      6
5      9
6      9
7      0
8      0
9      0
10     0
11     0
12     0
13    13
14     0
15     0
dtype: int64

In [7]:
credData.dropna(axis=0, inplace=True)
credData.shape

(653, 16)

In [8]:
credCat = pd.get_dummies(credData[[0, 3, 4, 5, 6, 8, 9, 11, 12]])

In [9]:
credNum = credData[[1, 2, 7, 10, 13, 14]]

In [10]:
X = pd.concat([credCat, credNum], axis=1)
y = pd.Series(credData[15], dtype='int')

In [11]:
scaler = MinMaxScaler()
X_trans = pd.DataFrame(scaler.fit_transform(X))

In [12]:
X_train, X_test, y_train, y_test = train_test_split(X_trans, y, test_size=0.3, random_state=123)

<b> Fit a logistic regression model on the training set </b>

In [13]:
benchmarkModel = LogisticRegression()
benchmarkModel.fit(X_train, y_train)

LogisticRegression()

<b> Get the predictions on the test set </b>

In [14]:
pred = benchmarkModel.predict(X_test)

In [15]:
print(f'Accuracy of the Logistic Regression Model: {benchmarkModel.score(X_test, y_test)}')

Accuracy of the Logistic Regression Model: 0.8877551020408163


<b> Print the confusion matrix and classification report for the benchmark model</b>

In [16]:
print(confusion_matrix(y_test, pred))

[[93 14]
 [ 8 81]]


In [17]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.89       107
           1       0.85      0.91      0.88        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



# Comparison of Advanced Ensemble Techniques

<b> Implement the bagging technique </b>

In [18]:
# import package
from sklearn.ensemble import BaggingClassifier

In [19]:
# create base learner
bl1 = LogisticRegression(random_state=123)

In [20]:
# create the model
baggingLearner = BaggingClassifier(base_estimator=bl1,
                                  n_estimators=15,
                                  max_samples=0.7,
                                  max_features=0.8)

In [21]:
# fit the model
model = baggingLearner.fit(X_train, y_train)

In [22]:
# make predictions
pred = model.predict(X_test)

In [23]:
# confusion matrix
print(confusion_matrix(y_test, pred))

[[95 12]
 [12 77]]


In [24]:
# classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.89      0.89      0.89       107
           1       0.87      0.87      0.87        89

    accuracy                           0.88       196
   macro avg       0.88      0.88      0.88       196
weighted avg       0.88      0.88      0.88       196



<b> Implement boosting </b>

In [25]:
# import package
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

In [26]:
# create base learner
bl1 = RandomForestClassifier(random_state=123)

In [27]:
# create model
boosting = AdaBoostClassifier(base_estimator=bl1, n_estimators=300)

In [28]:
# fit the model
model = boosting.fit(X_train, y_train)

In [29]:
# make predictions
pred = model.predict(X_test)

In [30]:
# confusion matrix
print(confusion_matrix(y_test, pred))

[[94 13]
 [ 9 80]]


In [31]:
# classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.91      0.88      0.90       107
           1       0.86      0.90      0.88        89

    accuracy                           0.89       196
   macro avg       0.89      0.89      0.89       196
weighted avg       0.89      0.89      0.89       196



<b> Implement stacking technique </b>

In [32]:
# import package
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

In [33]:
# create base learners
bl1 = KNeighborsClassifier(n_neighbors=5)
bl2 = LogisticRegression(random_state=123) 
ml = RandomForestClassifier(random_state=123)

In [34]:
# create a stacking model
stackcl = StackingClassifier(classifiers=[bl1, bl2], meta_classifier=ml)

In [35]:
# fit the model
model = stackcl.fit(X_train, y_train)

In [36]:
# make predictions
pred = model.predict(X_test)

In [37]:
# confusion matrix
print(confusion_matrix(y_test, pred))

[[99  8]
 [18 71]]


In [38]:
# classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

           0       0.85      0.93      0.88       107
           1       0.90      0.80      0.85        89

    accuracy                           0.87       196
   macro avg       0.87      0.86      0.86       196
weighted avg       0.87      0.87      0.87       196



<b> Compare the results across all three techniques and select the best technique </b>

From a business perspective, the boosting algorithm has generated more balanced results where the recall value of both creditworthy and not creditworthy customers (90%) is similar.