In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV

In [3]:
# Load the dataset
cc_apps = pd.read_csv("../Data/cc_approvals.data", header=None) 
cc_apps.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,g,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,g,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,g,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,g,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,s,0,+


#### Preprocess the Data

In [5]:
# Replace the '?'s with NaN in dataset
cc_apps_nans_replaced = cc_apps.replace("?", np.nan)

In [6]:
# Create a copy of the NaN replacement DataFrame
cc_apps_imputed = cc_apps_nans_replaced.copy()

In [7]:
# Iterate over each column of cc_apps_nans_replaced 
# and impute the most frequent value for object data types 
# and the mean for numeric data types
for col in cc_apps_imputed.columns:
    # Check if the column is of object type
    if cc_apps_imputed[col].dtypes == "object":
        # Impute with the most frequent value
        cc_apps_imputed[col] = cc_apps_imputed[col].fillna(
            cc_apps_imputed[col].value_counts().index[0]
        )
    else:
        cc_apps_imputed[col] = cc_apps_imputed[col].fillna(cc_apps_imputed[col].mean())

In [8]:
# Dummify the categorical features
cc_apps_encoded = pd.get_dummies(cc_apps_imputed, drop_first=True)

#### Prepare the Data for Modeling

In [9]:
# Extract the last column as target variable
X = cc_apps_encoded.iloc[:, :-1].values
y = cc_apps_encoded.iloc[:, [-1]].values

In [10]:
# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [11]:
# Instantiate StandardScaler and use it to rescale X_train and X_test
scaler = StandardScaler()
rescaledX_train = scaler.fit_transform(X_train)
rescaledX_test = scaler.transform(X_test)

#### Prepare the Model and Train

In [12]:
# Instantiate a LogisticRegression classifier with default parameter values
logreg = LogisticRegression()

In [13]:
# Fit logreg to the train set
logreg.fit(rescaledX_train, y_train)

  y = column_or_1d(y, warn=True)


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


In [14]:
# Use logreg to predict instances from the training set
y_train_pred = logreg.predict(rescaledX_train)

In [15]:
# Print the confusion matrix of the logreg model
print(confusion_matrix(y_train, y_train_pred))

[[203   1]
 [  1 257]]


#### Finding the best Scoring Model

In [16]:
# Define the grid of values for tol and max_iter
tol = [0.01, 0.001, 0.0001]
max_iter = [100, 150, 200]

In [17]:
# Create a dictionary where tol and max_iter are keys 
# and the lists of their values are the corresponding values
param_grid = dict(tol=tol, max_iter=max_iter)

In [18]:
# Instantiate GridSearchCV with the required parameters
grid_model = GridSearchCV(estimator=logreg, param_grid=param_grid, cv=5)

In [19]:
# Fit grid_model to the data
grid_model_result = grid_model.fit(rescaledX_train, y_train)

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [20]:
# Summarize results
best_train_score, best_train_params = grid_model_result.best_score_, grid_model_result.best_params_
print("Best: %f using %s" % (best_train_score, best_train_params))

Best: 0.818256 using {'max_iter': 100, 'tol': 0.01}


In [21]:
# Extract the best model and evaluate it on the test set
best_model = grid_model_result.best_estimator_
best_score =  best_model.score(rescaledX_test, y_test)

In [22]:
print("Accuracy of logistic regression classifier: ", best_score)

Accuracy of logistic regression classifier:  0.7982456140350878
