# Logistic Regression

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE

In [2]:
# Read the raw credit-card table and separate the label column from the features.
df = pd.read_csv('data/UCI_Credit_Card.csv')
y = df['default.payment.next.month']
X = df.drop(columns=y.name)

# No test_size is passed, so scikit-learn's default split proportion applies;
# the seed is fixed so the same rows land in each split on every run.
(X_train, X_test,
 y_train, y_test) = train_test_split(X, y, random_state=0)

In [3]:
def get_metrics(model, X_tr=None, y_tr=None, X_te=None, y_te=None):
    """Score a fitted classifier on a train split and a test split.

    Parameters
    ----------
    model : object
        Fitted estimator exposing ``predict``.
    X_tr, y_tr, X_te, y_te : optional
        Features / labels of the train and test splits. Any argument left as
        ``None`` falls back to the notebook-level ``X_train``, ``y_train``,
        ``X_test`` or ``y_test`` as bound at call time, so ``get_metrics(model)``
        behaves exactly as before. Passing the data explicitly lets a model be
        scored against the split it was actually trained on, even after the
        globals have been rebound by later cells.

    Returns
    -------
    dict
        ``{'train_accuracy', 'test_accuracy', 'train_f1', 'test_f1',
        'train_precision', 'test_precision', 'train_recall', 'test_recall'}``
        mapped to the corresponding score, in that insertion order.
    """
    # Resolve fallbacks lazily so the *current* global split is used.
    X_tr = X_train if X_tr is None else X_tr
    y_tr = y_train if y_tr is None else y_tr
    X_te = X_test if X_te is None else X_te
    y_te = y_test if y_te is None else y_te

    # Predict once per split and reuse the predictions for every metric.
    splits = (
        ('train', y_tr, model.predict(X_tr)),
        ('test', y_te, model.predict(X_te)),
    )
    scorers = (
        ('accuracy', accuracy_score),
        ('f1', f1_score),
        ('precision', precision_score),
        ('recall', recall_score),
    )

    metrics = {}
    for metric_name, scorer in scorers:
        for split_name, y_true, y_hat in splits:
            metrics[f'{split_name}_{metric_name}'] = scorer(y_true, y_hat)
    return metrics

## Baseline Model

In [4]:
# Baseline: fit on the raw, untouched features. C=1e12 makes the penalty
# negligible and fit_intercept=False omits the bias term.
logreg0 = LogisticRegression(
    solver='liblinear',
    C=1e12,
    fit_intercept=False,
).fit(X_train, y_train)  # fit() returns the estimator itself
get_metrics(logreg0)

{'train_accuracy': 0.7775555555555556,
 'test_accuracy': 0.7825333333333333,
 'train_f1': 0.0003994407829039345,
 'test_f1': 0.0012247397428046538,
 'train_precision': 0.3333333333333333,
 'test_precision': 1.0,
 'train_recall': 0.00019984012789768185,
 'test_recall': 0.0006127450980392157}

## Data Cleaning

In [5]:
# Column groups used by the cleaning / encoding steps.
to_drop = ['ID']
categoricals = ['SEX', 'EDUCATION', 'MARRIAGE']
# The repayment-status columns skip PAY_1 (PAY_0, then PAY_2..PAY_6); the
# bill and payment amount columns are numbered 1..6.
continuous = (
    ['LIMIT_BAL', 'AGE', 'PAY_0']
    + [f'PAY_{month}' for month in range(2, 7)]
    + [f'BILL_AMT{month}' for month in range(1, 7)]
    + [f'PAY_AMT{month}' for month in range(1, 7)]
)
target = ['default.payment.next.month']

In [6]:
# Remove identifier columns and floor the repayment-status columns at 0
# (every value <= 0 becomes 0; positive values are kept as they are).
#
# Both steps rebind X_train / X_test to new frames instead of mutating the
# objects returned by train_test_split in place, and both are idempotent:
# re-running this cell is a no-op rather than a KeyError on the already
# dropped column. NOTE: errors='ignore' also silences a misspelt name in
# `to_drop`, so keep that list in sync with the data.
pay_status_cols = ['PAY_0', 'PAY_2', 'PAY_3', 'PAY_4', 'PAY_5', 'PAY_6']

X_train = X_train.drop(columns=to_drop, errors='ignore')
X_test = X_test.drop(columns=to_drop, errors='ignore')

# Vectorized equivalent of mapping `0 if x <= 0 else x` over each column.
X_train[pay_status_cols] = X_train[pay_status_cols].clip(lower=0)
X_test[pay_status_cols] = X_test[pay_status_cols].clip(lower=0)

In [7]:
# Same estimator settings as the baseline, refit on the cleaned features
# (ID removed, PAY_* floored at 0).
logreg1 = LogisticRegression(
    solver='liblinear',
    C=1e12,
    fit_intercept=False,
).fit(X_train, y_train)  # fit() returns the estimator itself
get_metrics(logreg1)

{'train_accuracy': 0.7775111111111112,
 'test_accuracy': 0.7825333333333333,
 'train_f1': 0.0,
 'test_f1': 0.0012247397428046538,
 'train_precision': 0.0,
 'test_precision': 1.0,
 'train_recall': 0.0,
 'test_recall': 0.0006127450980392157}

## Creating Dummy Variables

In [8]:
# One-hot encode the categorical columns with a level set learned from the
# TRAINING split only and shared by both splits.
#
# Calling get_dummies independently on each split is fragile: a level that is
# missing from one split yields a different set of columns, and drop_first
# may then drop a different reference level in train and test. Pinning the
# categories makes both frames come out with identical columns in identical
# order. A level seen only in the test split becomes NaN and is encoded as
# all zeros.
category_levels = {
    col: sorted(X_train[col].map(str).unique()) for col in categoricals
}

X_train = X_train.assign(**{
    col: X_train[col].map(str).astype(pd.CategoricalDtype(categories=levels))
    for col, levels in category_levels.items()
})
X_test = X_test.assign(**{
    col: X_test[col].map(str).astype(pd.CategoricalDtype(categories=levels))
    for col, levels in category_levels.items()
})

X_train = pd.get_dummies(X_train, drop_first=True, dtype=int)
X_test = pd.get_dummies(X_test, drop_first=True, dtype=int)

In [9]:
# Same estimator settings again, refit now that SEX / EDUCATION / MARRIAGE
# have been expanded into dummy columns.
logreg2 = LogisticRegression(
    solver='liblinear',
    C=1e12,
    fit_intercept=False,
).fit(X_train, y_train)  # fit() returns the estimator itself
get_metrics(logreg2)

{'train_accuracy': 0.7775111111111112,
 'test_accuracy': 0.7825333333333333,
 'train_f1': 0.0,
 'test_f1': 0.0012247397428046538,
 'train_precision': 0.0,
 'test_precision': 1.0,
 'train_recall': 0.0,
 'test_recall': 0.0006127450980392157}

## Transforming Continuous Variables

In [10]:
# Rescale every feature with min/max statistics learned from the training
# split only; the test split is transformed with those same statistics.
#
# The scaler returns a bare array, so the frames are rebuilt with their
# original column labels and row index. This keeps feature names available
# downstream and keeps X_* row-aligned with y_train / y_test, which still
# carry the index produced by train_test_split.
scaler = MinMaxScaler()
X_train = pd.DataFrame(
    scaler.fit_transform(X_train),
    columns=X_train.columns,
    index=X_train.index,
)
X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
    index=X_test.index,
)

In [11]:
# Same estimator settings, refit on the min-max scaled features.
logreg3 = LogisticRegression(
    solver='liblinear',
    C=1e12,
    fit_intercept=False,
).fit(X_train, y_train)  # fit() returns the estimator itself
get_metrics(logreg3)

{'train_accuracy': 0.8154666666666667,
 'test_accuracy': 0.8192,
 'train_f1': 0.4327868852459016,
 'test_f1': 0.4278481012658228,
 'train_precision': 0.6839378238341969,
 'test_precision': 0.6869918699186992,
 'train_recall': 0.31654676258992803,
 'test_recall': 0.31066176470588236}

In [12]:
# log(1 + x) of every (already min-max scaled) feature value.
# np.log1p is applied to the whole frame at once: it replaces a per-element
# Python lambda through DataFrame.applymap (deprecated in recent pandas),
# keeps the frame's labels, and is more accurate for values close to 0.
# NOTE: this cell rebinds X_train / X_test, so run it exactly once per
# kernel session; a second run would apply the transform twice.
X_train = np.log1p(X_train)
X_test = np.log1p(X_test)

In [13]:
# Same estimator settings, refit on the log(1 + x)-transformed features.
logreg4 = LogisticRegression(
    solver='liblinear',
    C=1e12,
    fit_intercept=False,
).fit(X_train, y_train)  # fit() returns the estimator itself
get_metrics(logreg4)

{'train_accuracy': 0.8159555555555555,
 'test_accuracy': 0.8194666666666667,
 'train_f1': 0.438660702182459,
 'test_f1': 0.432523051131601,
 'train_precision': 0.6818373367045933,
 'test_precision': 0.6843501326259946,
 'train_recall': 0.32334132693844925,
 'test_recall': 0.3161764705882353}

## Addressing Class Imbalance

In [14]:
# Label counts for the full dataset, then for the train and test splits.
tuple(
    labels.value_counts()
    for labels in (df[target[0]], y_train, y_test)
)

(0    23364
 1     6636
 Name: default.payment.next.month, dtype: int64,
 0    17496
 1     5004
 Name: default.payment.next.month, dtype: int64,
 0    5868
 1    1632
 Name: default.payment.next.month, dtype: int64)

In [15]:
# Oversample the training split only; the test split is left untouched.
# SMOTE draws random synthetic samples, so it is seeded (0, the same seed as
# the train/test split) to make the resampled data and every metric computed
# after this cell reproducible across runs.
X_train, y_train = SMOTE(random_state=0).fit_resample(X_train, y_train)

In [16]:
# Same estimator settings, refit on the SMOTE-resampled training data. The
# train_* scores returned here are therefore computed on the resampled set.
logreg5 = LogisticRegression(
    solver='liblinear',
    C=1e12,
    fit_intercept=False,
).fit(X_train, y_train)  # fit() returns the estimator itself
get_metrics(logreg5)

{'train_accuracy': 0.7019890260631001,
 'test_accuracy': 0.7770666666666667,
 'train_f1': 0.6576943277310925,
 'test_f1': 0.5292792792792792,
 'train_precision': 0.7725169648365207,
 'test_precision': 0.4895833333333333,
 'train_recall': 0.5725880201188843,
 'test_recall': 0.5759803921568627}