## **Csalásfelderítés lehetőségei gépi tanuló modellek segítségével - Autoencoder**

****

### **Könyvtárak, függvények, osztályok importálása**

In [None]:
import os
import sys
import importlib
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from datetime import datetime
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc, confusion_matrix, classification_report

# Report how many GPU devices TensorFlow lists in this environment.
print("\nNum GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

### **Saját modulok importálása**

In [None]:
# Climb from the working directory towards the filesystem root until a
# directory named "DataScience" is reached; its "utils" sub-folder holds
# the project's helper modules.
current_dir = os.getcwd()
while os.path.basename(current_dir) != "DataScience":
    parent_dir = os.path.dirname(current_dir)
    # dirname() of the root is the root itself: nothing left to search.
    if parent_dir == current_dir:
        raise FileNotFoundError("A \"DataScience\" mappa nem található a mappa-hierarchiában.")
    current_dir = parent_dir
PATH = os.path.join(current_dir, "utils")

# Make the helper modules importable, then reload them so that edits made
# to them while the kernel is running are picked up.
sys.path.append(PATH)
import methods
import metrics
importlib.reload(methods)
importlib.reload(metrics)

### **Adathalmaz beolvasása**

In [None]:
# Load the dataset through the project helper module `methods`.
# NOTE(review): get_original_data=False presumably selects an already
# pre-processed variant of the data — confirm in methods.read_paysim.
DATA = methods.read_paysim(get_original_data=False)

# Preview the first five rows.
DATA.head(5)

### **Adathalmaz felosztása**

In [None]:
# One seed for every shuffle/split in this cell, so the train/validation/test
# partition is identical across notebook runs (the shuffles were previously
# unseeded while train_test_split was seeded).
RANDOM_STATE = 1

# The training size is 70% of the row count of the *whole* dataset; it is
# applied below as a positional cut on the shuffled non-fraud rows.
TRAINING_SIZE = int(len(DATA)*0.7)
VALIDATE_SIZE = 1/4

non_fraud, fraud = DATA[ DATA["isfraud"]==0 ], DATA[ DATA["isfraud"]==1 ]
non_fraud = non_fraud.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# The autoencoder is trained on non-fraud rows only; every fraud row goes
# to the test set together with the remaining non-fraud rows.
X_train = non_fraud.iloc[:TRAINING_SIZE].drop("isfraud", axis=1)
# reset_index: the non-fraud part carries freshly reset labels while the fraud
# part still carries its labels from DATA, so the concatenated labels can
# collide. A clean RangeIndex keeps later label-based alignment unambiguous.
X_test = (
    pd.concat([non_fraud.iloc[TRAINING_SIZE:], fraud])
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)

X_train, X_val = train_test_split(X_train, test_size=VALIDATE_SIZE, random_state=RANDOM_STATE)
X_test, y_test = X_test.drop("isfraud", axis=1), X_test["isfraud"]

print(f"Shapes:\nNon-fraud: {non_fraud.shape}\nFraud: {fraud.shape}")
print(f"Shapes:\nTrain: {X_train.shape}\nValidation: {X_val.shape}\nTest: {X_test.shape}")

### **Kategorikus oszlopok enkódolása**

In [None]:
categorical_columns = ["transaction_type", "sender_receiver_type"]
# Every remaining feature column is treated as numerical.
numerical_columns = [col for col in X_train.columns if col not in categorical_columns]

# Named `frequency_encoder` rather than `encoder`: further down the notebook
# `encoder` is bound to the Keras encoder sub-model returned by
# build_autoencoder(), which would silently replace this fitted transformer.
frequency_encoder = CountFrequencyEncoder(
    encoding_method="frequency",
    variables=categorical_columns
)
# Fit on the training split only; validation and test reuse the training
# frequencies.
X_train_encoded = frequency_encoder.fit_transform(X_train)
X_test_encoded = frequency_encoder.transform(X_test)
X_val_encoded = frequency_encoder.transform(X_val)

In [None]:
# Preview the first five rows of the encoded training set.
X_train_encoded.head(5)

#### Indexek ellenőrzése

In [None]:
# Encoding must not reorder or drop rows: print, for train, validation and
# test in turn, whether the encoded frame still has the source frame's index.
for encoded_frame, source_frame in (
    (X_train_encoded, X_train),
    (X_val_encoded, X_val),
    (X_test_encoded, X_test),
):
    print((encoded_frame.index == source_frame.index).all())

### **Numerikus oszlopok skálázása**

In [None]:
# Standardise the numerical columns; the scaler is fitted on the training
# split only and then applied unchanged to validation and test.
scaler = StandardScaler()
scaler.fit(X_train_encoded[numerical_columns])


def _scale_numerical_part(encoded_frame):
    """Return (scaled numerical frame, categorical + scaled numerical frame)."""
    scaled_values = scaler.transform(encoded_frame.drop(columns=categorical_columns))
    scaled_frame = pd.DataFrame(
        scaled_values, columns=numerical_columns, index=encoded_frame.index)
    # The frequency-encoded categorical columns are kept as they are and
    # placed in front of the scaled numerical columns.
    combined_frame = pd.concat([encoded_frame[categorical_columns], scaled_frame], axis=1)
    return scaled_frame, combined_frame


X_train_scaled, X_train_transformed = _scale_numerical_part(X_train_encoded)
X_val_scaled, X_val_transformed = _scale_numerical_part(X_val_encoded)
X_test_scaled, X_test_transformed = _scale_numerical_part(X_test_encoded)

In [None]:
# Preview the first five rows of the fully transformed training set.
X_train_transformed.head(5)

#### Indexek ellenőrzése

In [None]:
# Scaling and re-concatenation must leave the row index untouched: print,
# per split, whether the transformed frame still has the source frame's index.
for prefix, transformed_frame, source_frame in (
    ("Train index: ", X_train_transformed, X_train),
    ("Test index: ", X_test_transformed, X_test),
    ("Val index: ", X_val_transformed, X_val),
):
    index_unchanged = (transformed_frame.index == source_frame.index).all()
    print(f"{prefix}{index_unchanged}")

### **TensorFlow és Keras segítségével Autoencoder szerkezetének definiálása**

In [None]:
# Network input width: one unit per column of the transformed training frame.
INPUT_DIMENSION = X_train_transformed.shape[1]
# Width of the bottleneck layer.
LATENT_DIMENSION = 3

# Training hyper-parameters.
BATCH_SIZE = 256
EPOCHS = 15

print(f"Input dimension: {INPUT_DIMENSION}")

def build_autoencoder(optimizer_param,
                      hidden_activation_function_param,
                      output_activation_function_param,
                      add_dropout=False,
                      dropout_rate=None,
                      hidden_units=6):
    """Build and compile a small dense autoencoder.

    Layout:
      encoder: Input(INPUT_DIMENSION) -> Dense(hidden_units)
               -> BatchNormalization -> [Dropout] -> Dense(LATENT_DIMENSION)
      decoder: Input(LATENT_DIMENSION) -> Dense(hidden_units)
               -> Dense(INPUT_DIMENSION)

    The module-level constants INPUT_DIMENSION and LATENT_DIMENSION are read
    when the function is called.

    Args:
        optimizer_param: optimizer passed to ``compile``.
        hidden_activation_function_param: activation of the hidden Dense
            layers, including the bottleneck.
        output_activation_function_param: activation of the final Dense layer.
        add_dropout: when true, a Dropout layer is inserted in the encoder
            after batch normalisation.
        dropout_rate: rate of that Dropout layer; required when
            ``add_dropout`` is true.
        hidden_units: width of the hidden Dense layer of both the encoder and
            the decoder (default 6, the previously hard-coded value).

    Returns:
        Tuple ``(autoencoder, encoder, decoder)``. ``autoencoder`` is the
        compiled stack of the same ``encoder`` and ``decoder`` objects that
        are returned alongside it.

    Raises:
        ValueError: if ``add_dropout`` is true but ``dropout_rate`` is None.
    """
    # Fail early with a clear message instead of an error raised from
    # inside the Dropout layer constructor.
    if add_dropout and dropout_rate is None:
        raise ValueError("dropout_rate must be given when add_dropout=True")

    layers = tf.keras.layers

    # encoder
    encoder = tf.keras.models.Sequential(name="encoder")
    encoder.add(layers.Input(shape=(INPUT_DIMENSION,)))
    encoder.add(layers.Dense(hidden_units, activation=hidden_activation_function_param))
    encoder.add(layers.BatchNormalization())
    if add_dropout:
        encoder.add(layers.Dropout(dropout_rate))
    encoder.add(layers.Dense(
        LATENT_DIMENSION, activation=hidden_activation_function_param, name="bottleneck"))

    # decoder
    decoder = tf.keras.models.Sequential(name="decoder")
    decoder.add(layers.Input(shape=(LATENT_DIMENSION,)))
    decoder.add(layers.Dense(hidden_units, activation=hidden_activation_function_param))
    decoder.add(layers.Dense(INPUT_DIMENSION, activation=output_activation_function_param))

    # full autoencoder
    autoencoder = tf.keras.models.Sequential([encoder, decoder])

    # Reconstruction quality: MSE is the training loss, MAE is tracked as an
    # additional metric.
    autoencoder.compile(
        optimizer=optimizer_param,
        loss="mse",
        metrics=["mae"]
    )

    return autoencoder, encoder, decoder

### **Autoencoder létrehozása**

In [None]:
# Adam optimizer with a learning rate of 0.0001.
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001)

# ReLU hidden layers, linear output layer, no dropout.
autoencoder, encoder, decoder = build_autoencoder(
    optimizer_param=adam_optimizer,
    hidden_activation_function_param="relu",
    output_activation_function_param="linear",
    add_dropout=False,
)
autoencoder.summary()

### **Szükséges mappák, callback-ek definiálása**

In [None]:
# Directory that receives the model checkpoints.
saved_models_dirname = "saved_models"
os.makedirs(saved_models_dirname, exist_ok=True)

# Minute-resolution timestamp shared by the checkpoint file name and the
# plot directory of this run.
yyyymmddHHMM = datetime.now().strftime("%Y%m%d%H%M")
checkpoint_basename = f"{yyyymmddHHMM}_batch{BATCH_SIZE}_epochs{EPOCHS}_autoencoder.keras"
filename = os.path.join(saved_models_dirname, checkpoint_basename)

# Per-run directory for the result figures.
autoencoder_result_plots_dir = "autoencoder_result_plots"
current_autoencoder_dir = os.path.join(autoencoder_result_plots_dir, yyyymmddHHMM)
os.makedirs(current_autoencoder_dir, exist_ok=True)

# Write the full model (not just weights) to `filename` whenever the
# validation loss improves.
cb_save_model = tf.keras.callbacks.ModelCheckpoint(
    filepath=filename,
    monitor="val_loss",
    mode="min",
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)

# Stop after two epochs without a validation-loss improvement and restore
# the best weights.
cb_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss",
    patience=2,
    restore_best_weights=True,
)

cb = [cb_save_model, cb_early_stopping]

print(f"Batch size: {BATCH_SIZE}\nEpochs: {EPOCHS}")

### **Autoencoder modell betöltése vagy tanítása**

In [None]:
# Switch between loading a previously saved model and training a new one.
LOAD_MODEL = True
LOAD_MODEL_FILENAME = "202503011335_batch256_epochs15_autoencoder.keras"

if LOAD_MODEL:
    print(f"Modell betöltés fájlból...\n{LOAD_MODEL_FILENAME}")
    # NOTE(review): a loaded model was trained in an earlier run, while the
    # encoder, scaler and data split above are recomputed in this run —
    # confirm they match the ones the saved model was trained with.
    # The path is built from the same directory variable the checkpoint
    # callback writes to, instead of a hard-coded "saved_models/" prefix.
    autoencoder = tf.keras.models.load_model(
        os.path.join(saved_models_dirname, LOAD_MODEL_FILENAME))
else:
    print("Modell tanítás megkezdése...")
    # Input and target are the same frame: the network learns to
    # reconstruct its input.
    history = autoencoder.fit(
        X_train_transformed, X_train_transformed,
        shuffle=True,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=cb,
        validation_data=(X_val_transformed, X_val_transformed)
    )
    methods.plot_history(history)

### **Rekonstrukciók készítése**

In [None]:
# Reconstruct the test and the training set with the autoencoder.
X_test_predicted = autoencoder.predict(X_test_transformed)
X_train_predicted = autoencoder.predict(X_train_transformed)

# NOTE(review): metrics.mse is a project helper; presumably it returns one
# reconstruction error per row — confirm in the metrics module.
test_mse = metrics.mse(X_test_transformed, X_test_predicted)
test_error = pd.DataFrame({ "Reconstruction_error": test_mse, "True_class": y_test })

# The training set was built from non-fraud rows only, hence the constant
# class 0.
train_mse = metrics.mse(X_train_transformed, X_train_predicted)
train_error = pd.DataFrame({ "Reconstruction_error": train_mse, "True_class": 0 })

### **Metrikák, küszöbértékek kiszámítása**

In [None]:
# The reconstruction error is used as the anomaly score: the higher the
# error, the more likely the transaction is flagged as fraud.
precision, recall, threshold = precision_recall_curve(
    test_error.True_class, test_error.Reconstruction_error)

average_precision = average_precision_score(
    test_error.True_class, test_error.Reconstruction_error)

# precision/recall carry one trailing element without a matching threshold,
# hence the [:-1] slices.
# Where precision and recall are both 0 the plain formula is 0/0 = NaN, and
# np.argmax would then return the position of that NaN instead of the real
# maximum. Define F1 as 0 at such thresholds.
precision_recall_sum = precision[:-1] + recall[:-1]
f1_score = np.divide(
    2*precision[:-1]*recall[:-1],
    precision_recall_sum,
    out=np.zeros_like(precision_recall_sum),
    where=precision_recall_sum > 0,
)

# NOTE: the threshold is chosen on the same test set that is evaluated
# afterwards, so the metrics reported for it are optimistic.
best_index = np.argmax(f1_score)
best_threshold = threshold[best_index]

best_precision = precision[best_index]
best_recall = recall[best_index]
max_f1_score = f1_score[best_index]

In [None]:
# Summary statistics of the training-set reconstruction errors.
train_error.describe()

In [None]:
# Summary statistics of the test-set reconstruction errors.
test_error.describe()

### **Konfúziós mátrix**

In [None]:
# A transaction is predicted as fraud (1) when its reconstruction error
# reaches the threshold with the highest F1 score.
predicted_y_test = (test_error.Reconstruction_error >= best_threshold).astype(int)

cm = confusion_matrix(y_test, predicted_y_test)
print(classification_report(y_test, predicted_y_test))
confusion_matrix_plot = methods.plot_confusion_matrix(cm)

# Figures are written to disk only for a freshly trained model.
if not LOAD_MODEL:
    confusion_matrix_plot.savefig(os.path.join(current_autoencoder_dir, "confusion_matrix.png"))

### **Legnagyobb F1 score**

In [None]:
# Plot the F1 score over the candidate thresholds via the project helper,
# passing the selected threshold and its F1 score as well.
f1_score_plot = methods.plot_f1_score(threshold, best_threshold, f1_score, max_f1_score)

# Figures are written to disk only for a freshly trained model.
if not LOAD_MODEL:
    f1_score_plot.savefig(os.path.join(current_autoencoder_dir, "max_f1_score.png"))

### **PR görbe**

In [None]:
# Plot the precision-recall curve via the project helper, passing the
# average precision and the selected operating point as well.
pr_curve = methods.plot_pr_curve(precision, recall, average_precision, best_precision, best_recall, max_f1_score)

# Figures are written to disk only for a freshly trained model.
if not LOAD_MODEL:
    pr_curve.savefig(os.path.join(current_autoencoder_dir, "PR_curve.png"))

### **ROC görbe**

In [None]:
# ROC curve and its area, again using the reconstruction error as the score.
fpr, tpr, thresholds = roc_curve(test_error.True_class, test_error.Reconstruction_error)
roc_auc = auc(fpr, tpr)

roc_auc_curve = methods.plot_roc_curve(fpr, tpr, roc_auc)

# Figures are written to disk only for a freshly trained model.
if not LOAD_MODEL:
    roc_auc_curve.savefig(os.path.join(current_autoencoder_dir, "ROC_curve.png"))

### **Metrikák kiíratása**

In [None]:
# Print the classification metrics through the project helper.
# NOTE(review): both file names are passed together with LOAD_MODEL;
# presumably the helper reports whichever one applies — confirm in
# metrics.print_metrics.
metrics.print_metrics(y_test, predicted_y_test,LOAD_MODEL,
                      FILENAME=filename,
                      LOAD_MODEL_FILENAME=LOAD_MODEL_FILENAME)
print(f"ROC-AUC score: {roc_auc}")
print(f"Best threshold: {best_threshold}")

### **Teszt adathalmaz MSE eloszlása**

In [None]:
# Kernel density estimate of the test-set reconstruction errors.
plt.figure(figsize=(10,5))
sns.kdeplot( test_mse, fill=True, color="red", alpha=1.0 )
plt.xlabel("MSE (Mean Squared Error)")
plt.ylabel("Density")
plt.title("Test MSE eloszlás")

# Save before show(); figures are written only for a freshly trained model.
if not LOAD_MODEL:
    plt.savefig(os.path.join(current_autoencoder_dir, "test_mse_distribution.png"))
plt.show()

### **Tanító adathalmaz MSE eloszlása**

In [None]:
# Kernel density estimate of the training-set reconstruction errors.
plt.figure(figsize=(10,5))
sns.kdeplot( train_mse, fill=True, color="green", alpha=1.0 )
plt.xlabel("MSE (Mean Squared Error)")
plt.ylabel("Density")
plt.title("Train MSE eloszlás")

# Save before show(); figures are written only for a freshly trained model.
if not LOAD_MODEL:
    plt.savefig(os.path.join(current_autoencoder_dir, "train_mse_distribution.png"))
plt.show()