In [1]:
import pandas as pd
import pickle
pd.set_option('display.max_columns', 500)

df = None
with open('data/feature.pk','rb') as f:
    df = pickle.load(f)

df.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,V10,V11,V12,V13,V14,V15,V16,V17,V18,V19,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,0.090794,-0.5516,-0.617801,-0.99139,-0.311169,1.468177,-0.470401,0.207971,0.025791,0.403993,0.251412,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,-0.166974,1.612727,1.065235,0.489095,-0.143772,0.635558,0.463917,-0.114805,-0.183361,-0.145783,-0.069083,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,0.207643,0.624501,0.066084,0.717293,-0.165946,2.345865,-2.890083,1.109969,-0.121359,-2.261857,0.52498,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,-0.054952,-0.226487,0.178228,0.507757,-0.287924,-0.631418,-1.059647,-0.684093,1.965775,-1.232622,-0.208038,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,0.753074,-0.822843,0.538196,1.345852,-1.11967,0.175121,-0.451449,-0.237033,-0.038195,0.803487,0.408542,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [5]:
from sklearn.model_selection import train_test_split

target_col = 'Class'
test_size = 0.2
X = df.drop(target_col, axis=1)
y = df[target_col]
    
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)


In [33]:

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, cohen_kappa_score, matthews_corrcoef, balanced_accuracy_score, classification_report

def getMetrics(y_pred,y_test,method_name=""):
    
    # Calculate performance metrics
    accuracy = accuracy_score(y_test, y_pred)
    precision_0 = precision_score(y_test, y_pred, pos_label=0)
    precision_1 = precision_score(y_test, y_pred, pos_label=1)
    recall_0 = recall_score(y_test, y_pred, pos_label=0)
    recall_1 = recall_score(y_test, y_pred, pos_label=1)
    f1_0 = f1_score(y_test, y_pred, pos_label=0)
    f1_1 = f1_score(y_test, y_pred, pos_label=1)
    roc_auc = roc_auc_score(y_test, y_pred)
    cohen_kappa = cohen_kappa_score(y_test, y_pred)
    matthews_corr = matthews_corrcoef(y_test, y_pred)
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)
    g_mean = (recall_0*recall_1)**0.5
    classification_error = 1 - accuracy
    sensitivity_0 = recall_0
    sensitivity_1 = recall_1
    specificity_0 = 1 - recall_0
    specificity_1 = 1 - recall_1
    
    # Return dictionary with performance metrics
    return {
        'method_name': method_name,
        'accuracy': accuracy,
        'precision_0': precision_0,
        'precision_1': precision_1,
        'recall_0': recall_0,
        'recall_1': recall_1,
        'f1_0': f1_0,
        'f1_1': f1_1,
        'roc_auc': roc_auc,
        'cohen_kappa': cohen_kappa,
        'matthews_corr': matthews_corr,
        'balanced_accuracy': balanced_accuracy,
        'g_mean': g_mean,
        'classification_error': classification_error,
        'sensitivity_0': sensitivity_0,
        'sensitivity_1': sensitivity_1,
        'specificity_0': specificity_0,
        'specificity_1': specificity_1
    }


In [37]:
def compareModels(ans):
    n = len(ans)
    name_str = 'method_name'
    for score_name in [v for v in ans[0] if v != name_str]:
        print(f'====== {score_name} ======')
        ans = sorted(ans, key = lambda x: x[score_name], reverse = True)
        print('score:',end=" ")
        for i in range(n):
            text = f"{ans[i][score_name]} ({ans[i][name_str]})"
            print(text,end='\t')
        print()
        print('improvement(from next/worst):',end=" ")
        for i in range(n-1):
            text = f"{round(100*((ans[i][score_name]/ans[i+1][score_name])-1),6)}%, {round(100*((ans[i][score_name]/ans[n-1][score_name])-1),6)}% ({ans[i][name_str]})"
            print(text,end='\t')
        print()

- Model Selection
    - Linear models:
        - linear regression, logistic regression, ridge regression, lasso regression
    - Tree-based models:
        - decision trees, random forest, gradient boosting trees, XGBosst, LightGBM, CatBoost
    - Neural Networks:
        - multi-layer perceptron, convolutional neural networks, recurrent neural networks, autoencoders, generative adversarial networks
    - Supposert Vector Machine:
        - Linear SVM, Non-Linear SVM
    - Clusteering models:
        - K-Means, Hierarchical, DBSCAN, Gaussian Mixture models
    - Ensemble Models:
        - Bagging, Boosting, Stacking
    - Bayesian Models:
        - Naive Bayes, Bayesian Networks, Gaussian Processes
    - Evolutionary models:
        - genetic algorithms, particle swarm optimization
    - k-Nearest Neighbors

# Linear models
- linear regression, logistic regression, ridge regression, lasso regression

### Linear regression

The model tries to find the best fit line or hyperplane. If there is only one variable, it estimates $b_0$ and $b_1$ considering that
$$y = b_0 + b_1 x_1$$
in a way that it minimizes the sum of squared residuals.

If there are more variables, it considers:
$$y = b_0 + b_1 x_1 + ... + b_n x_n$$
Note that it assumes that the variables are independent.

Also, the output is not categorical, it's numeric. So it may need adjustment.

In [19]:
from sklearn.linear_model import LinearRegression
import numpy as np

def apply_fit_predict_model(modelGenerator, X_train, y_train, X_test):
    model = modelGenerator()
    model.fit(X_train, y_train)
    return model.predict(X_test)

def apply_linear_regression(X, y, X_test):

    return np.array(list(map(lambda num: 1 if num >= 0.5 else 0, apply_fit_predict_model(lambda : LinearRegression(),X,y,X_test))))


lin_reg_y_pred = apply_linear_regression(X_train,y_train,X_test)

### Logistic regression
This model is used to predict the probability of a binary outcome. It uses the logistic mathematical function (or sigmoid function) to transform a linear combination of the independent vaiables into a probability between 0 and 1. The formula is:
$$p = \frac{1}{a+e^{-(b_0+b_1 x_1+...+b_n x_n)}}$$
Once the probability is calculated, a threshold is used to classify it as 0 or 1 (e.g. 0.5 as we have done above).

In [20]:
from sklearn.linear_model import LogisticRegression


def logistic_regression(X_train, y_train, X_test):
    lr_y_pred = apply_fit_predict_model(lambda : LogisticRegression(), X_train, y_train, X_test)

    return lr_y_pred

lr_y_pred = logistic_regression(X_train, y_train, X_test)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Ridge Regression

Ridge is similar to linear regression but it tries to overcome overfitting by adding a cost function. The cost function to be minimizes is:
$$J(w) = MSE(w) + alpha * sum(w^2)$$
It tries to find coefficient with smaller values, trying to reduce theirvariance and making the model more stable. The alpha parameter may be tunned. It's useful in cases in which there may be many featues and some may be irrelevant or redundant.

In [24]:
from sklearn.linear_model import Ridge
import numpy as np
def ridge_regression(X_train, y_train, X_test, alpha=1.0):
    ridge_y_pred = apply_fit_predict_model(lambda: Ridge(alpha=alpha), X_train, y_train, X_test)

    return np.array(list(map(lambda x: 1 if x >= 0.5 else 0,ridge_y_pred)))

ridge_y_pred = ridge_regression(X_train, y_train, X_test, alpha=1.0)

### Lasso (Least absolute shrinkage and selection operator) Regression

Lasso is similar to ridge but the cost function is
$$J(w) = MSE(w) + alpha * sum(|w|)$$
The cost functino on Lasso tends to shrink some coefficients all the way to zero more effectively. It, as Ridge, tries to perform feature selection and produce sparse models, with only the most important features.

In [30]:
from sklearn.linear_model import Lasso

def lasso_regression(X_train, y_train, X_test, alpha=1.0):
    lasso_y_pred = apply_fit_predict_model(lambda: Lasso(alpha=alpha), X_train, y_train, X_test)

    return np.array(list(map(lambda x: 1 if x >= 0.5 else 0,lasso_y_pred)))

lasso_y_pred = lasso_regression(X_train, y_train, X_test, alpha=1.0)


### Linear models comparison

In [39]:
ans = {'linear':lin_reg_y_pred, 'logistic':lr_y_pred, 'ridge':ridge_y_pred, 'lasso':lasso_y_pred}

lst = []
for model_name in ans:
    lst += [getMetrics(ans[model_name],y_test,method_name=model_name)]

compareModels(lst)

score: 0.9739021859557181 (logistic)	0.9337794347818441 (linear)	0.9337794347818441 (ridge)	0.9033114679141093 (lasso)	
improvement(from next/worst): 4.296812%, 7.81466% (logistic)	0.0%, 3.372919% (linear)	3.372919%, 3.372919% (ridge)	
score: 0.9661454721952573 (logistic)	0.8905711882429495 (linear)	0.8905711882429495 (ridge)	0.8424905682975028 (lasso)	
improvement(from next/worst): 8.486046%, 14.677304% (logistic)	0.0%, 5.706962% (linear)	5.706962%, 5.706962% (ridge)	
score: 0.9898785425101214 (lasso)	0.9874598260938147 (linear)	0.9874598260938147 (ridge)	0.9818872907669796 (logistic)	
improvement(from next/worst): 0.244943%, 0.813867% (lasso)	0.0%, 0.567533% (linear)	0.567533%, 0.567533% (ridge)	
score: 0.9916299559471365 (lasso)	0.9887929515418502 (linear)	0.9887929515418502 (ridge)	0.9821145374449339 (logistic)	
improvement(from next/worst): 0.286916%, 0.968871% (lasso)	0.0%, 0.680004% (linear)	0.680004%, 0.680004% (ridge)	
score: 0.9657224094355518 (logistic)	0.8789841336703174 (l

The logistic model seemed to be the better even though lasso had a better precision for 1 (while logistic got the worst precision). But lasso also got the highest classification error (due to its poor performance on 0).

# Tree-based models
- decision trees, random forest, gradient boosting trees, XGBoost, LightGBM, CatBoost

### Decision tree

This method tries to subdivide the feature space into numerous regions and assign a classification to each region. It does that by recursively selecting a features and a split value. The feature and split value selection are based on some impurity measure, like Gini impurity of entropy. The goal is that after splitting the new regions are more "pure" (in terms of the target variable).

This process can be visualized as a tree where each node is composed by a selected feature and a split value and the leafs are the classification. It's easy to interpret and handle both numeric and categorical variables. However, it has some problems as overfitting and is sensitive to small changes in data.


In [40]:
from sklearn.tree import DecisionTreeClassifier

def apply_decision_tree(X_train, y_train, X_test):
    return apply_fit_predict_model(lambda: DecisionTreeClassifier(), X_train, y_train, X_test)

dt_y_pred = apply_decision_tree(X_train, y_train, X_test)

### Random Forest

This is an ensemble method, that is, produces many models and combines them to form a single one. In this case, it generates random subsets of the training set and trains a decision tree for each. Then, the outputs are combined to make a final prediction.

It's an ensemble method that uses bagging to create the random subsets of data. Bagging creates data with replacement which means that some points may be selected multiple times and some may not be selected at all.

It has the advantages of decision tree: handles numerical and categorical data, as well as missing values and noisy data. But, also, it has a low risk of overfitting.

In [43]:
from sklearn.ensemble import RandomForestClassifier

def apply_random_forest(X_train, y_train, X_test):
    return apply_fit_predict_model(lambda: RandomForestClassifier(), X_train, y_train, X_test)
random_forest_y_pred = apply_random_forest(X_train, y_train, X_test)

### Gradient boosting trees

This method builds a series of trees in which each subsequent tree correct the errors of the previous tree. The algorithm is:
1. builds a decision tree
2. the error is calculated for each observation
3. a second tree is built to predict the errors
4. performs 2 and 3 repeatedly
5. the final prediction is made by combining the predictions of all trees

The algorithm performs gradient descent optimization to minimize the loss function (error) and update the predictions.

A problem is that it's expensive computationally and may require hyperparameter tuning.

In [42]:
from sklearn.ensemble import GradientBoostingClassifier

def apply_gradient_boosting(X_train, y_train, X_test):
    return apply_fit_predict_model(lambda: GradientBoostingClassifier(), X_train, y_train, X_test)
gb_y_pred = apply_gradient_boosting(X_train, y_train, X_test)

### XGBoost (eXtreme Gradient Boosting)

It's a popular ensemble method. It's based in gradient boosting but its' designed to be more efficient.

For example, it uses a technique called "regularization" to prevent ovverfitting by adding a penalty to the loss function that encourages simpler trees.
It uses "shrinkage" to reduce the contribution of each individual tree making the model more robust to outliers and reducing overfitting again.


XGBoost is also good for handling missing data since it can learn how to impute missing values.


In [44]:
from xgboost import XGBClassifier

def apply_xgboost(X_train, y_train, X_test):
    return apply_fit_predict_model(lambda: XGBClassifier(), X_train, y_train, X_test)
    
xgb_y_pred = apply_xgboost(X_train, y_train, X_test)

### LightGBM (Light Gradient Boosting Machine)

This model is also based on gradient boosting and was developed by Microsoft. It was built to be fast, scalable and handle large datasets.
It follows the approach of gradient boosting but uses gradient-based one-sided sampling to build the trees more efficiently.

Instead of selecting a random subset of the data, each sample is assigned a weight (an importance, based on its gradient value) and LightGBM focus on samples that are more informative. So it can build more accurate trees with less data.

LightGBM applies a gradient-based exclusive feature bundling technique to group categorical features together based on their importance.

In [49]:
from lightgbm import LGBMClassifier

def apply_lightgbm(X_train, y_train, X_test):
    return apply_fit_predict_model(lambda: LGBMClassifier(), X_train, y_train, X_test)
lgbm_y_pred = apply_lightgbm(X_train, y_train, X_test)

### CatBoost (Categorical Boosting)

This method comes with an open-source library released by a Russian internet company in 2017.

It's very effective on categorical data since it can handle them without preprocessing. It also can handle missing values without the need for imputation. It includes support for text features, useful for natural language processing. it's fast with parallel computation and GPU acceleration. It prevents overfitting by applying ordered boosting, random permutation and gradient-based feature importance. 

In [48]:
from catboost import CatBoostClassifier

def apply_catboost(X_train, y_train, X_test):
    return apply_fit_predict_model(lambda: CatBoostClassifier(), X_train, y_train, X_test)

catboost_y_pred = apply_catboost(X_train, y_train, X_test)

Learning rate set to 0.140561
0:	learn: 0.4161384	total: 478ms	remaining: 7m 57s
1:	learn: 0.2570333	total: 1.03s	remaining: 8m 33s
2:	learn: 0.1801662	total: 1.39s	remaining: 7m 40s
3:	learn: 0.1421400	total: 1.68s	remaining: 6m 57s
4:	learn: 0.1158392	total: 1.98s	remaining: 6m 34s
5:	learn: 0.1033104	total: 2.27s	remaining: 6m 15s
6:	learn: 0.0929931	total: 2.44s	remaining: 5m 46s
7:	learn: 0.0851909	total: 2.62s	remaining: 5m 24s
8:	learn: 0.0792788	total: 2.74s	remaining: 5m 1s
9:	learn: 0.0730083	total: 2.91s	remaining: 4m 48s
10:	learn: 0.0696011	total: 3.04s	remaining: 4m 33s
11:	learn: 0.0669351	total: 3.33s	remaining: 4m 33s
12:	learn: 0.0644683	total: 3.46s	remaining: 4m 22s
13:	learn: 0.0613024	total: 3.61s	remaining: 4m 14s
14:	learn: 0.0585287	total: 3.75s	remaining: 4m 5s
15:	learn: 0.0567194	total: 3.88s	remaining: 3m 58s
16:	learn: 0.0546738	total: 4.01s	remaining: 3m 52s
17:	learn: 0.0527617	total: 4.18s	remaining: 3m 47s
18:	learn: 0.0509587	total: 4.32s	remaining: 3

### Tree-based model comparison

In [56]:
ans = {'dt':dt_y_pred, 'rf':random_forest_y_pred, 'gb':gb_y_pred, 'xgb':xgb_y_pred, 'lgbm': lgbm_y_pred,'catb': catboost_y_pred }

lst = []
for model_name in ans:
    lst += [getMetrics(ans[model_name],y_test,method_name=model_name)]

compareModels(lst)

score: 0.999903276295658 (rf)	0.9998593109755025 (xgb)	0.9995515537344143 (catb)	0.9994987953502278 (lgbm)	0.9986106958830874 (dt)	0.9870741958742943 (gb)	
improvement(from next/worst): 0.004397%, 1.299708% (rf)	0.03079%, 1.295254% (xgb)	0.005278%, 1.264075% (catb)	0.088934%, 1.25873% (lgbm)	1.168757%, 1.168757% (dt)	
score: 1.0 (rf)	1.0 (xgb)	0.9999647284976103 (catb)	0.9997179374173645 (lgbm)	0.9994175579794556 (dt)	0.9812145269682092 (gb)	
improvement(from next/worst): 0.0%, 1.914512% (rf)	0.003527%, 1.914512% (xgb)	0.024686%, 1.910918% (catb)	0.030055%, 1.885766% (lgbm)	1.855153%, 1.855153% (dt)	
score: 0.9998069735202766 (rf)	0.9997192588433464 (xgb)	0.999280714373432 (lgbm)	0.9991406976132438 (catb)	0.9978096306161071 (dt)	0.9930535815804434 (gb)	
improvement(from next/worst): 0.008774%, 0.680063% (rf)	0.043886%, 0.67123% (xgb)	0.014014%, 0.627069% (lgbm)	0.133399%, 0.61297% (catb)	0.478932%, 0.478932% (dt)	
score: 0.999806167400881 (rf)	0.9997180616740088 (xgb)	0.999277533039647

  text = f"{round(100*((ans[i][score_name]/ans[i+1][score_name])-1),6)}%, {round(100*((ans[i][score_name]/ans[n-1][score_name])-1),6)}% ({ans[i][name_str]})"
  text = f"{round(100*((ans[i][score_name]/ans[i+1][score_name])-1),6)}%, {round(100*((ans[i][score_name]/ans[n-1][score_name])-1),6)}% ({ans[i][name_str]})"


Random forest had the best performance.

# Neural Networks:
- multi-layer perceptron, convolutional neural networks, recurrent neural networks, autoencoders, generative adversarial networks

### Multi-layer perceptron

In [57]:
from sklearn.neural_network import MLPClassifier

def apply_mlp(X_train, y_train, X_test):
    return apply_fit_predict_model(lambda: MLPClassifier(), X_train, y_train, X_test)
mlp_y_pred = apply_mlp(X_train, y_train, X_test)



### Convolutional neural network

In [None]:
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

def apply_cnn(X_train, y_train, X_test):
    # Reshape data for CNN
    X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], X_train.shape[2], 1)
    X_test = X_test.reshape(X_test.shape[0], X_test.shape[1], X_test.shape[2], 1)

    # Define CNN model
    model = Sequential()
    model.add(Conv2D(32, kernel_size=(3, 3), activation='relu', input_shape=(X_train.shape[1], X_train.shape[2], 1)))
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # Compile and fit the model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.fit(X_train, y_train, epochs=10, batch_size=32)

    # Predict using the model
    return model.predict_classes(X_test)

cnn_y_pred = apply_cnn(X_train, y_train, X_test)

### Recurrent neural network

### Autoencoders

### Generative adversarial networks (GANs)

# Supposert Vector Machine:
- Linear SVM, Non-Linear SVM

# Clusteering models:
- K-Means, Hierarchical, DBSCAN, Gaussian Mixture models

# Bayesian Models:
- Naive Bayes, Bayesian Networks, Gaussian Processes

# Ensemble Models:
- Bagging, Boosting, Stacking

# Evolutionary models:
- genetic algorithms, particle swarm optimization

# k-Nearest Neighbors