In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('creditcard.csv')

In [None]:
bad_percentage = np.round(100-df[df['Class']==0].shape[0]*100/(df.shape[0]), 4)
print(f'The bad percentage of the data is: {bad_percentage}%')

In [None]:
training_set, test_set = train_test_split(df, test_size = 0.25, stratify = df['Class'], random_state=42)
training_set, validation_set = train_test_split(training_set, test_size = 0.25, stratify = training_set['Class'], random_state=42)

In [None]:
training_set = training_set.drop(['Time'], axis=1)
validation_set = validation_set.drop(['Time'], axis=1)
test_set = test_set.drop(['Time'], axis=1)

In [None]:
training_set[training_set['Class']==1].shape

In [None]:
validation_set[validation_set['Class']==1].shape

In [None]:
test_set[test_set['Class']==1].shape

In [None]:
training_set.to_csv('dev.csv', index = False)
validation_set.to_csv('oos.csv', index = False)
test_set.to_csv('oot.csv', index = False)

In [33]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score, confusion_matrix, precision_score, recall_score
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import h5py
import joblib
from tqdm import tqdm

In [36]:
def load_data(dev_path, oos_path, oot_path):
    dev = pd.read_csv(dev_path)
    oos = pd.read_csv(oos_path)
    oot = pd.read_csv(oot_path)
    return dev, oos, oot

def standardize_data(dev, oos, oot):
    scaler = StandardScaler()
    dev_scaled = scaler.fit_transform(dev.drop(['Class'], axis=1))
    oos_scaled = scaler.transform(oos.drop(['Class'], axis=1))
    oot_scaled = scaler.transform(oot.drop(['Class'], axis=1))
    return dev_scaled, oos_scaled, oot_scaled, scaler

def split_data(dev, oos):
    dev_F = dev[dev['Class'] == 1].drop(columns=['Class'])
    dev_NF = dev[dev['Class'] == 0].drop(columns=['Class'])
    oos_F = oos[oos['Class'] == 1].drop(columns=['Class'])
    oos_NF = oos[oos['Class'] == 0].drop(columns=['Class'])
    return dev_F, dev_NF, oos_F, oos_NF

def build_autoencoder(input_dim, layer_ratios, n_features, activation='relu'):
    model = Sequential()
    model.add(Dense(int(float(layer_ratios[0]) * n_features), activation=activation, input_shape=(input_dim,)))
    model.add(Dropout(0.1))
    model.add(Dense(int(float(layer_ratios[1]) * n_features), activation=activation))
    model.add(Dense(int(float(layer_ratios[2]) * n_features), activation='linear'))  # Bottleneck layer
    model.add(Dense(int(float(layer_ratios[1]) * n_features), activation=activation))
    model.add(Dense(int(float(layer_ratios[0]) * n_features), activation=activation))
    model.add(Dropout(0.1))
    model.add(Dense(input_dim, activation='sigmoid'))
    model.compile(optimizer='adam', loss='mse')
    return model

def train_autoencoder(train_data, val_data, model, epochs=50, batch_size=256):
    early_stopping = EarlyStopping(monitor='val_loss', patience=5)
    model.fit(train_data, train_data, epochs=epochs, batch_size=batch_size, 
              validation_data=(val_data, val_data), callbacks=[early_stopping], verbose=0)
    return model

def get_feature_importance(model, data):
    predictions = model.predict(data)
    reconstruction_error = np.mean((predictions - data) ** 2, axis=0)
    return reconstruction_error

def feature_selection(dev_F, dev_NF, oos_F, oos_NF, n_features, feature_threshold, ratios, activation):
    # Train on fraud examples
    model_F = build_autoencoder(dev_F.shape[1], ratios, n_features, activation)
    model_F = train_autoencoder(dev_F, oos_F, model_F)
    importance_F = get_feature_importance(model_F, dev_F)
    
    # Train on non-fraud examples
    model_NF = build_autoencoder(dev_NF.shape[1], ratios,n_features, activation)
    model_NF = train_autoencoder(dev_NF, oos_NF, model_NF)
    importance_NF = get_feature_importance(model_NF, dev_NF)
    
    # Determine features to drop
    features_to_drop = determine_features_to_drop(importance_F, importance_NF, feature_threshold)
    return features_to_drop

def determine_features_to_drop(importance_F, importance_NF, feature_threshold):
    top_features_NF = np.argsort(importance_NF)[-int(len(importance_NF) * feature_threshold):]
    bottom_features_F = np.where(importance_F <= 0)[0]
    features_to_drop = np.union1d(top_features_NF, bottom_features_F)
    return features_to_drop

def drop_features(data, features_to_drop, all_features):
    data = pd.DataFrame(data, columns = all_features)
    return data.drop(columns=features_to_drop)

def train_final_autoencoder(train_data, val_data, ratios, activation, n_features):
    model = build_autoencoder(train_data.shape[1], ratios, n_features, activation)
    model = train_autoencoder(train_data, val_data, model)
    return model

def encode_data(model, data):
    encoder = Sequential(model.layers[:4])  # Extract encoder part
    encoded_data = encoder.predict(data)
    return encoded_data

def grid_search_logistic(X_train, y_train, X_test, y_test):
    param_grid = {'C': [0.1], 'solver': ['lbfgs']}
    lr = LogisticRegression()
    grid = GridSearchCV(lr, param_grid, cv=5, scoring='f1')
    grid.fit(X_train, y_train)
    best_model = grid.best_estimator_
    y_pred = best_model.predict_proba(X_test)
    
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    print(confusion_matrix(y_test, y_pred))
    return best_model, f1, precision, recall

def save_results(log_file, best_params, features_dropped, encoded_dev, encoded_oos, encoded_oot, encoder_model, logistic_model):
    pd.DataFrame(log_file).to_csv('pipeline_log.csv', index=False)
    with open('best_params.txt', 'w') as f:
        f.write(str(best_params))
    with open('features_dropped.txt', 'w') as f:
        f.write(str(features_dropped))
    pd.DataFrame(encoded_dev).to_csv('encoded_dev.csv', index=False)
    pd.DataFrame(encoded_oos).to_csv('encoded_oos.csv', index=False)
    pd.DataFrame(encoded_oot).to_csv('encoded_oot.csv', index=False)
    encoder_model.save('encoder_model.h5')
    joblib.dump(logistic_model, 'logistic_model.pkl')

def pipeline(dev_path, oos_path, oot_path, hyperparameters):
    dev, oos, oot = load_data(dev_path, oos_path, oot_path)
    dev_F, dev_NF, oos_F, oos_NF = split_data(dev, oos)
    all_features = dev_F.columns
    n_features = dev_F.shape[1]
    dev_scaled, oos_scaled, oot_scaled, scaler = standardize_data(dev, oos, oot)
    dev_F = scaler.transform(dev_F)
    oos_F = scaler.transform(oos_F)
    dev_NF = scaler.transform(dev_NF)
    oos_NF = scaler.transform(oos_NF)
    
    features_to_drop = feature_selection(dev_F, dev_NF, oos_F, oos_NF, n_features, 
                                         hyperparameters['feature_threshold'], 
                                         hyperparameters['ratios'], 
                                         hyperparameters['activation'])
    features_to_drop = [all_features[i] for i in features_to_drop]
    new_dev = drop_features(dev_scaled, features_to_drop, all_features)
    new_dev_F = drop_features(dev_F, features_to_drop, all_features)
    new_dev_NF = drop_features(dev_NF, features_to_drop, all_features)
    new_oos_F = drop_features(oos_F, features_to_drop, all_features)
    new_oos_NF = drop_features(oos_NF, features_to_drop, all_features)
    new_oos = drop_features(oos_scaled, features_to_drop, all_features)
    new_oot = drop_features(oot_scaled, features_to_drop, all_features)
    
    if hyperparameters['train_on'] == 'abnormal':
        train_on = new_dev_NF
        val_on = new_oos_NF
    else:
        train_on = new_dev_F
        val_on = new_oos_F
    
    final_autoencoder = train_final_autoencoder(train_on, val_on, hyperparameters['ratios'], hyperparameters['activation'], n_features)
    encoded_dev = encode_data(final_autoencoder, new_dev)
    encoded_oos = encode_data(final_autoencoder, new_oos)
    encoded_oot = encode_data(final_autoencoder, new_oot)
    
    best_logistic_model, best_f1, precision, recall = grid_search_logistic(encoded_dev, dev['Class'], encoded_oot, oot['Class'])
    print(f'f1_score = {best_f1}')
    print(f'precision = {precision}')
    print(f'recall = {recall}')
    print(confusion_matrix)
    save_results(hyperparameters, best_logistic_model.get_params(), features_to_drop, 
                 encoded_dev, encoded_oos, encoded_oot, final_autoencoder, best_logistic_model)

In [37]:
if __name__ == "__main__":
    hyperparameters = {
        'train_on': 'normal',
        'feature_threshold': 0.1,
        'ratios': [0.82, 0.52, 0.22],
        'activation': 'selu'
    }
    pipeline('dev.csv', 'oos.csv', 'oot.csv', hyperparameters)

[[71065    14]
 [   54    69]]
f1_score = 0.6699029126213593
precision = 0.8313253012048193
recall = 0.5609756097560976
<function confusion_matrix at 0x0000020286BA7420>


  saving_api.save_model(
