# Model Building
- Try several "simple" models for classification (pycaret or manual?)
- Compare with a neural net model
- Test: use sklearn's `class_weight` parameter instead of resampling (logistic regression / random forest)
- Use an ExtraTrees model instead of a random forest
- Try creating extra features with squares/cubes (see sklearn `PolynomialFeatures`)
- Possibly, test my favorite model with outlier removal.
- Credit for imbalanced-learn usage examples: https://imbalanced-learn.org/stable/common_pitfalls.html

In [5]:
# import libraries and load data
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

# scaling, sampling, and modeling tools
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.preprocessing import StandardScaler
from imblearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate

# basic models for testing
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier

# Forward slash works on Windows, macOS and Linux alike; the previous 'data\creditcard.csv'
# relied on the invalid escape sequence "\c" and only resolved on Windows.
df = pd.read_csv('data/creditcard.csv')

In [6]:
# Standardize "Amount" and "Time" so they are on a scale comparable to the remaining features.
# A single scaler is fit on both columns at once (StandardScaler standardizes each column
# independently), so `std_scaler` keeps the fitted statistics for both columns instead of
# having the "Amount" fit overwritten by the "Time" fit.
# NOTE(review): the scaler is fit on the full dataset before the train/test split, so test-set
# statistics leak into the training features. Consider fitting it on the training data only
# (e.g. as a pipeline step); see the imbalanced-learn "common pitfalls" page linked in the intro.
raw_columns = ['Amount', 'Time']

# Guard keeps the cell re-runnable: once the raw columns have been replaced, running it again is a no-op.
if set(raw_columns).issubset(df.columns):
    std_scaler = StandardScaler()
    scaled = std_scaler.fit_transform(df[raw_columns].to_numpy())
    # Drop the raw columns and append the scaled ones (same resulting column order as before).
    df = df.drop(columns=raw_columns).assign(
        std_scaled_amount=scaled[:, 0],
        std_scaled_time=scaled[:, 1],
    )

In [7]:
# train/test split BEFORE under or over sampling
X = df.drop('Class', axis=1)
y = df['Class']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y, random_state=42)

# Class balance check: the split is stratified on y, so each subset should show the same
# fraud share as the full data. All three lines are computed from the data (the "Original"
# figures were previously hard-coded), so they stay correct if the input file changes.
for subset_name, target in (("Training", y_train), ("Test", y_test), ("Original", y)):
    classcount = target.value_counts()
    perc_pos = (classcount[1] / len(target)) * 100
    print("{} Data contains {} fraudulent transactions which are {:.3f}% of all transactions".format(
        subset_name, classcount[1], perc_pos))

Training Data contains 344 fraudulent transactions which are 0.173% of all transactions
Test Data contains 148 fraudulent transactions which are 0.173% of all transactions
Original Data contains 492 fraudulent transactions which are 0.173% of all transactions


In [8]:
# Three ways of handling class imbalance are compared for logistic regression:
# class weighting, random undersampling and SMOTE oversampling.
subsample = RandomUnderSampler(random_state=42)
oversample = SMOTE(random_state=42)
logreg = LogisticRegression()
logregwt = LogisticRegression(class_weight='balanced')

# Cross-validate the class-weighted logistic regression on the (non-resampled) training data
cv_results = cross_validate(
    logregwt,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Weighted Logistic Regression, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Weighted Logistic Regression, balanced accuracy mean +/- std. dev.: 0.950 +/- 0.010


In [12]:
# Random undersampling followed by an unweighted logistic regression.
# The sampler is a pipeline step, so it is applied inside cross-validation
# rather than to the whole training set up front.
pipeline = make_pipeline(subsample, logreg)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Subsampled Logistic Regression, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Subsampled Logistic Regression, balanced accuracy mean +/- std. dev.: 0.945 +/- 0.011


In [13]:
# SMOTE oversampling followed by an unweighted logistic regression
pipeline = make_pipeline(oversample, logreg)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Oversampled Logistic Regression, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Oversampled Logistic Regression, balanced accuracy mean +/- std. dev.: 0.949 +/- 0.010


In [15]:
# Class-weighted SVC, cross-validated on the (non-resampled) training data
svcwt = SVC(random_state=42, class_weight='balanced')

cv_results = cross_validate(
    svcwt,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Weighted SVC, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Weighted SVC, balanced accuracy mean +/- std. dev.: 0.878 +/- 0.026


In [16]:
# Random undersampling followed by an unweighted SVC
svc = SVC(random_state=42)
pipeline = make_pipeline(subsample, svc)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Subsampled SVC, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Subsampled SVC, balanced accuracy mean +/- std. dev.: 0.940 +/- 0.013


In [17]:
# SMOTE oversampling followed by an unweighted SVC
pipeline = make_pipeline(oversample, svc)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Oversampled SVC, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Oversampled SVC, balanced accuracy mean +/- std. dev.: 0.940 +/- 0.012


Mental note: SVC took a while to run, especially with oversampling! The scikit-learn docs recommend LinearSVC for larger datasets.

In [18]:
# Baseline KNN on the original training data, with no resampling,
# to see whether neighbors-based models need rebalancing at all.
knn = KNeighborsClassifier(n_jobs=-1)

cv_results = cross_validate(
    knn,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "KNN original data, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

KNN original data, balanced accuracy mean +/- std. dev.: 0.888 +/- 0.014


In [19]:
# Random undersampling followed by KNN
pipeline = make_pipeline(subsample, knn)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Subsampled KNN, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Subsampled KNN, balanced accuracy mean +/- std. dev.: 0.935 +/- 0.015


In [20]:
# SMOTE oversampling followed by KNN
pipeline = make_pipeline(oversample, knn)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Oversampled KNN, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Oversampled KNN, balanced accuracy mean +/- std. dev.: 0.928 +/- 0.014


In [21]:
# Class-weighted decision tree, cross-validated on the (non-resampled) training data
dtreewt = DecisionTreeClassifier(random_state=42, class_weight='balanced')

cv_results = cross_validate(
    dtreewt,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Decision Tree weighted, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Decision Tree weighted, balanced accuracy mean +/- std. dev.: 0.862 +/- 0.019


In [22]:
# Random undersampling followed by an unweighted decision tree
dtree = DecisionTreeClassifier(random_state=42)
pipeline = make_pipeline(subsample, dtree)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Subsampled Decision Tree, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Subsampled Decision Tree, balanced accuracy mean +/- std. dev.: 0.907 +/- 0.016


In [23]:
# SMOTE oversampling followed by an unweighted decision tree
pipeline = make_pipeline(oversample, dtree)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Oversampled Decision Tree, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Oversampled Decision Tree, balanced accuracy mean +/- std. dev.: 0.896 +/- 0.009


In [24]:
# Class-weighted extra-trees ensemble, cross-validated on the (non-resampled) training data
extreewt = ExtraTreesClassifier(random_state=42, class_weight='balanced')

cv_results = cross_validate(
    extreewt,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Extra Trees weighted, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Extra Trees weighted, balanced accuracy mean +/- std. dev.: 0.891 +/- 0.017


In [25]:
# Random undersampling followed by an unweighted extra-trees ensemble
extree = ExtraTreesClassifier(random_state=42)
pipeline = make_pipeline(subsample, extree)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Subsampled Extra Trees, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Subsampled Extra Trees, balanced accuracy mean +/- std. dev.: 0.942 +/- 0.008


In [26]:
# SMOTE oversampling followed by an unweighted extra-trees ensemble
pipeline = make_pipeline(oversample, extree)

cv_results = cross_validate(
    pipeline,
    X_train,
    y_train,
    scoring="balanced_accuracy",
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
)

print(
    "Oversampled Extra Trees, balanced accuracy mean +/- std. dev.: "
    "{:.3f} +/- {:.3f}".format(
        cv_results['test_score'].mean(),
        cv_results['test_score'].std(),
    )
)

Oversampled Extra Trees, balanced accuracy mean +/- std. dev.: 0.914 +/- 0.013


What else do I want to know from these models? Do I care about checking "important" features? Should I just go on to parameter tuning with one, or do a few and see if tuning improves one more than others?