# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score
from keras.models import Sequential, load_model
from keras.layers import Dense, Dropout
from keras.callbacks import EarlyStopping
import h5py
import joblib
from tqdm import tqdm

def load_data(dev_path, oos_path, oot_path):
    """Read the three CSV datasets from disk.

    Args:
        dev_path: Path of the development CSV.
        oos_path: Path of the second CSV (named "oos" throughout this file).
        oot_path: Path of the third CSV (named "oot" throughout this file).

    Returns:
        A ``(dev, oos, oot)`` tuple of DataFrames, read in that order.
    """
    # The generator is consumed left to right, so files are read in the
    # same order as the arguments.
    return tuple(pd.read_csv(path) for path in (dev_path, oos_path, oot_path))

def standardize_data(dev, oos, oot):
    """Standardize the feature columns of all three datasets.

    The scaler is fitted on ``dev`` only and then applied unchanged to
    ``oos`` and ``oot``. The ``'target'`` column is removed from every
    frame before scaling, so all three inputs must contain it.

    Args:
        dev: DataFrame the scaler is fitted on.
        oos: DataFrame transformed with the fitted scaler.
        oot: DataFrame transformed with the fitted scaler.

    Returns:
        ``(dev_scaled, oos_scaled, oot_scaled, scaler)`` where the first
        three items are the scaler outputs (feature columns only, without
        ``'target'``) and ``scaler`` is the fitted ``StandardScaler``.
    """
    def _features(frame):
        return frame.drop(columns=['target'])

    scaler = StandardScaler()
    dev_scaled = scaler.fit_transform(_features(dev))
    oos_scaled, oot_scaled = (
        scaler.transform(_features(frame)) for frame in (oos, oot)
    )
    return dev_scaled, oos_scaled, oot_scaled, scaler

def split_data(dev, oos):
    """Partition ``dev`` and ``oos`` by the value of their ``'target'`` column.

    Args:
        dev: DataFrame containing a ``'target'`` column.
        oos: DataFrame containing a ``'target'`` column.

    Returns:
        ``(dev_F, dev_NF, oos_F, oos_NF)``: rows with ``target == 1`` (F)
        and ``target == 0`` (NF) of each frame, with the ``'target'``
        column removed.
    """
    def _rows_with_label(frame, label):
        selected = frame.loc[frame['target'] == label]
        return selected.drop(columns=['target'])

    return (
        _rows_with_label(dev, 1),
        _rows_with_label(dev, 0),
        _rows_with_label(oos, 1),
        _rows_with_label(oos, 0),
    )

def build_autoencoder(input_dim, layer_ratios, activation='relu', output_activation='linear'):
    """Build and compile a symmetric dense autoencoder.

    Layer order (relied on by code that slices ``model.layers``):
    Dense(outer) -> Dropout -> Dense(inner) -> Dense(bottleneck, linear)
    -> Dense(inner) -> Dense(outer) -> Dropout -> Dense(input_dim).

    Args:
        input_dim: Number of input (and reconstructed output) features.
        layer_ratios: Sequence whose first three entries give the outer,
            inner and bottleneck layer widths as fractions of ``input_dim``.
        activation: Activation of the hidden (non-bottleneck) layers.
        output_activation: Activation of the reconstruction layer. Defaults
            to ``'linear'`` because a sigmoid output is confined to (0, 1)
            and therefore cannot reproduce standardized inputs, which take
            negative values and values above 1.

    Returns:
        The compiled Keras ``Sequential`` model (Adam optimizer, MSE loss).
    """
    def _units(ratio):
        # int() truncates toward zero, so a narrow input could otherwise
        # request a zero-width layer, which Keras rejects.
        return max(1, int(ratio * input_dim))

    outer_units = _units(layer_ratios[0])
    inner_units = _units(layer_ratios[1])
    bottleneck_units = _units(layer_ratios[2])

    model = Sequential()
    model.add(Dense(outer_units, activation=activation, input_shape=(input_dim,)))
    model.add(Dropout(0.1))
    model.add(Dense(inner_units, activation=activation))
    model.add(Dense(bottleneck_units, activation='linear'))  # Bottleneck layer
    model.add(Dense(inner_units, activation=activation))
    model.add(Dense(outer_units, activation=activation))
    model.add(Dropout(0.1))
    model.add(Dense(input_dim, activation=output_activation))
    model.compile(optimizer='adam', loss='mse')
    return model

def train_autoencoder(data, model, epochs=50, batch_size=256, validation_split=0.1, patience=5):
    """Fit ``model`` to reconstruct ``data`` with early stopping.

    Args:
        data: Training matrix; used as both input and reconstruction target.
        model: Compiled Keras model to train (modified in place).
        epochs: Maximum number of training epochs.
        batch_size: Mini-batch size.
        validation_split: Fraction of ``data`` held out by Keras to compute
            ``val_loss``.
        patience: Number of epochs without ``val_loss`` improvement before
            training stops.

    Returns:
        The same ``model`` instance after training.
    """
    # restore_best_weights asks Keras to roll back to the best-val_loss
    # epoch when early stopping fires; otherwise the model keeps the
    # weights from `patience` epochs after the best one.
    early_stopping = EarlyStopping(
        monitor='val_loss',
        patience=patience,
        restore_best_weights=True,
    )
    model.fit(
        data,
        data,
        epochs=epochs,
        batch_size=batch_size,
        validation_split=validation_split,
        callbacks=[early_stopping],
        verbose=0,
    )
    return model

def get_feature_importance(model, data):
    """Return the per-feature mean squared reconstruction error.

    Args:
        model: Object exposing ``predict(data)`` that returns a
            reconstruction with the same shape as ``data``.
        data: 2-D matrix of samples (rows) by features (columns).

    Returns:
        The squared difference between reconstruction and input, averaged
        over axis 0 (one value per feature column).
    """
    residual = model.predict(data) - data
    squared_residual = residual ** 2
    return np.mean(squared_residual, axis=0)

def feature_selection(dev_F, dev_NF, oos_F, oos_NF, feature_threshold, ratios, activation):
    """Pick features to drop from class-specific autoencoder errors.

    One autoencoder is trained on the fraud rows and another on the
    non-fraud rows; each is scored on its own training rows, and the two
    per-feature error vectors are handed to ``determine_features_to_drop``.

    Args:
        dev_F: Development rows of the fraud class (features only).
        dev_NF: Development rows of the non-fraud class (features only).
        oos_F: Accepted but not used by this function.
        oos_NF: Accepted but not used by this function.
        feature_threshold: Forwarded to ``determine_features_to_drop``.
        ratios: Layer-width ratios forwarded to ``build_autoencoder``.
        activation: Hidden activation forwarded to ``build_autoencoder``.

    Returns:
        Whatever ``determine_features_to_drop`` returns for the two error
        vectors.
    """
    def _error_profile(subset):
        # Build, train and score an autoencoder on a single class subset.
        autoencoder = build_autoencoder(subset.shape[1], ratios, activation)
        autoencoder = train_autoencoder(subset, autoencoder)
        return get_feature_importance(autoencoder, subset)

    # Fraud model first, then non-fraud, matching the training order callers see.
    fraud_profile = _error_profile(dev_F)
    non_fraud_profile = _error_profile(dev_NF)

    return determine_features_to_drop(fraud_profile, non_fraud_profile, feature_threshold)

def determine_features_to_drop(importance_F, importance_NF, feature_threshold):
    """Select the positions of features to remove.

    A feature is dropped when it is among the ``feature_threshold``
    fraction of features with the largest ``importance_NF`` value, or when
    its ``importance_F`` value is less than or equal to zero.

    Args:
        importance_F: Per-feature scores from the fraud model.
        importance_NF: Per-feature scores from the non-fraud model.
        feature_threshold: Fraction of features to take from the top of
            ``importance_NF``; the resulting count is truncated to an int
            and clamped to ``[0, number of features]``.

    Returns:
        Sorted 1-D integer array of unique feature positions to drop.
    """
    importance_F = np.asarray(importance_F)
    importance_NF = np.asarray(importance_NF)

    n_features = len(importance_NF)
    n_top = min(max(int(n_features * feature_threshold), 0), n_features)

    # Slice from an explicit start index: the idiom ranked[-n_top:] turns
    # into ranked[0:] when n_top == 0 and would select *every* feature.
    ranked = np.argsort(importance_NF)
    top_features_NF = ranked[n_features - n_top:]

    bottom_features_F = np.where(importance_F <= 0)[0]
    return np.union1d(top_features_NF, bottom_features_F)

def drop_features(data, features_to_drop):
    """Remove the given feature columns from ``data``.

    Args:
        data: Either a pandas object exposing ``drop`` (columns are removed
            by label, as before) or an array-like 2-D matrix (columns are
            removed by integer position).
        features_to_drop: Column labels for pandas input, or integer
            column positions for array input.

    Returns:
        A new object of the same kind as ``data`` without those columns.
    """
    if hasattr(data, 'drop'):
        return data.drop(columns=features_to_drop)

    # Plain arrays (e.g. scaler output) have no .drop(); delete the columns
    # by position instead. The explicit int dtype keeps an empty selection
    # valid as an index array.
    positions = np.asarray(features_to_drop, dtype=int)
    return np.delete(np.asarray(data), positions, axis=1)

def train_final_autoencoder(data, ratios, activation):
    """Build an autoencoder sized to ``data`` and train it on ``data``.

    Args:
        data: 2-D training matrix; its second dimension sets the input width.
        ratios: Layer-width ratios forwarded to ``build_autoencoder``.
        activation: Hidden activation forwarded to ``build_autoencoder``.

    Returns:
        The model returned by ``train_autoencoder``.
    """
    n_features = data.shape[1]
    untrained = build_autoencoder(n_features, ratios, activation)
    return train_autoencoder(data, untrained)

def encode_data(model, data):
    """Project ``data`` through the first four layers of ``model``.

    Args:
        model: Trained Keras model whose leading four layers form the
            encoder (for models from ``build_autoencoder`` these end at
            the bottleneck layer).
        data: Input matrix to encode.

    Returns:
        The predictions of a ``Sequential`` model made of those four layers.
    """
    encoder_layers = model.layers[:4]  # Extract encoder part
    return Sequential(encoder_layers).predict(data)

def grid_search_logistic(X_train, y_train, X_val, y_val):
    """Tune a logistic regression by grid search and score it on validation data.

    The search covers ``C`` and ``solver`` with 5-fold cross-validation on
    the training data, optimizing F1.

    Args:
        X_train: Training feature matrix.
        y_train: Training labels.
        X_val: Validation feature matrix.
        y_val: Validation labels.

    Returns:
        ``(best_model, f1)``: the best estimator found by the search and
        its F1 score on the validation data.
    """
    search = GridSearchCV(
        LogisticRegression(),
        {'C': [0.01, 0.1, 1, 10, 100], 'solver': ['liblinear', 'saga']},
        cv=5,
        scoring='f1',
    )
    search.fit(X_train, y_train)

    winner = search.best_estimator_
    validation_f1 = f1_score(y_val, winner.predict(X_val))
    return winner, validation_f1

def save_results(log_file, best_params, features_dropped, encoded_dev, encoded_oos, encoded_oot, encoder_model, logistic_model):
    """Write all pipeline artefacts to fixed file names in the working directory.

    Args:
        log_file: Data accepted by ``pd.DataFrame``; written to
            ``pipeline_log.csv``.
        best_params: Written as ``str(best_params)`` to ``best_params.txt``.
        features_dropped: Written as ``str(features_dropped)`` to
            ``features_dropped.txt``.
        encoded_dev: Written to ``encoded_dev.csv``.
        encoded_oos: Written to ``encoded_oos.csv``.
        encoded_oot: Written to ``encoded_oot.csv``.
        encoder_model: Object with a ``save`` method; saved to
            ``encoder_model.h5``.
        logistic_model: Dumped with joblib to ``logistic_model.pkl``.
    """
    pd.DataFrame(log_file).to_csv('pipeline_log.csv', index=False)

    # Plain-text dumps of the string representation of each object.
    text_outputs = (
        ('best_params.txt', best_params),
        ('features_dropped.txt', features_dropped),
    )
    for filename, payload in text_outputs:
        with open(filename, 'w') as handle:
            handle.write(str(payload))

    # Encoded matrices, one CSV per dataset, without the index column.
    encoded_outputs = (
        ('encoded_dev.csv', encoded_dev),
        ('encoded_oos.csv', encoded_oos),
        ('encoded_oot.csv', encoded_oot),
    )
    for filename, matrix in encoded_outputs:
        pd.DataFrame(matrix).to_csv(filename, index=False)

    encoder_model.save('encoder_model.h5')
    joblib.dump(logistic_model, 'logistic_model.pkl')

def pipeline(dev_path, oos_path, oot_path, hyperparameters):
    """Run the whole workflow: load, scale, select features, encode, classify, save.

    Args:
        dev_path: Path of the development CSV.
        oos_path: Path of the CSV used to validate the classifier.
        oot_path: Path of the third CSV; it is scaled, reduced and encoded
            but not used to fit or score the classifier.
        hyperparameters: Mapping with the keys ``'feature_threshold'``,
            ``'ratios'`` and ``'activation'``.

    Returns:
        ``(best_logistic_model, best_f1)``: the tuned classifier and its F1
        score on the encoded ``oos`` data.

    Raises:
        ValueError: If feature selection removes every feature.
    """
    dev, oos, oot = load_data(dev_path, oos_path, oot_path)
    # NOTE(review): the fitted scaler is not persisted by save_results.
    dev_scaled, oos_scaled, oot_scaled, scaler = standardize_data(dev, oos, oot)

    # standardize_data returns the scaled features without column labels or
    # the target column, but split_data filters on 'target' and
    # drop_features drops by label, so rebuild labelled frames here.
    feature_names = dev.drop(columns=['target']).columns

    def _as_frame(scaled):
        return pd.DataFrame(np.asarray(scaled), columns=feature_names)

    def _with_target(frame, source):
        # Attach the target positionally so row alignment does not depend
        # on the source frame's index.
        labelled = frame.copy()
        labelled['target'] = source['target'].to_numpy()
        return labelled

    dev_frame = _as_frame(dev_scaled)
    oos_frame = _as_frame(oos_scaled)
    oot_frame = _as_frame(oot_scaled)

    dev_F, dev_NF, oos_F, oos_NF = split_data(
        _with_target(dev_frame, dev),
        _with_target(oos_frame, oos),
    )

    # Hand plain arrays to the Keras-facing helpers so the per-feature
    # errors, and therefore the returned features, are column positions.
    features_to_drop = feature_selection(
        dev_F.to_numpy(),
        dev_NF.to_numpy(),
        oos_F.to_numpy(),
        oos_NF.to_numpy(),
        hyperparameters['feature_threshold'],
        hyperparameters['ratios'],
        hyperparameters['activation'],
    )

    # Translate positions into column labels for the label-based drop.
    drop_labels = list(feature_names[np.asarray(features_to_drop, dtype=int)])
    new_dev = drop_features(dev_frame, drop_labels).to_numpy()
    new_oos = drop_features(oos_frame, drop_labels).to_numpy()
    new_oot = drop_features(oot_frame, drop_labels).to_numpy()

    if new_dev.shape[1] == 0:
        raise ValueError(
            "Feature selection removed every feature; "
            "adjust 'feature_threshold' or check the input data."
        )

    final_autoencoder = train_final_autoencoder(
        new_dev, hyperparameters['ratios'], hyperparameters['activation']
    )
    encoded_dev = encode_data(final_autoencoder, new_dev)
    encoded_oos = encode_data(final_autoencoder, new_oos)
    encoded_oot = encode_data(final_autoencoder, new_oot)

    best_logistic_model, best_f1 = grid_search_logistic(
        encoded_dev, dev['target'], encoded_oos, oos['target']
    )

    save_results(hyperparameters, best_logistic_model.get_params(), features_to_drop,
                 encoded_dev, encoded_oos, encoded_oot, final_autoencoder, best_logistic_model)

    # Previously the validation score was computed and then discarded.
    return best_logistic_model, best_f1
    
if __name__ == "__main__":
    # Script entry point: run the pipeline on the CSV files in the working
    # directory with a fixed set of hyperparameters.
    hyperparameters = dict(
        feature_threshold=0.1,
        ratios=[0.8, 0.5, 0.2],
        activation='relu',
    )
    pipeline('dev.csv', 'oos.csv', 'oot.csv', hyperparameters)