## **Csalás felderítés lehetőségei gépi tanuló modellek segítségével - FNN**

****

### **Könyvtárak, függvények, osztályok importálása**

In [None]:
import os
import sys
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

from datetime import datetime
from sklearn.preprocessing import StandardScaler
from feature_engine.encoding import CountFrequencyEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc, confusion_matrix, classification_report

# Report how many GPU devices TensorFlow can see
gpu_devices = tf.config.list_physical_devices('GPU')
print("\nNum GPUs Available: ", len(gpu_devices))

### **Saját modulok importálása**

In [None]:
# Walk upwards from the working directory until a folder named "DataScience"
# is reached; its "utils" sub-folder is then put on the import path so that
# the `methods` and `metrics` modules can be imported.
current_dir = os.getcwd()
while os.path.basename(current_dir) != "DataScience":
    parent_dir = os.path.dirname(current_dir)

    # dirname() of a filesystem root returns the root itself, so an unchanged
    # path means the whole hierarchy was searched without a match.
    if parent_dir == current_dir:
        raise FileNotFoundError("A \"DataScience\" mappa nem található a mappa-hierarchiában.")

    current_dir = parent_dir

PATH = os.path.join(current_dir, "utils")

# Re-running this cell must not keep appending the same entry to sys.path
if PATH not in sys.path:
    sys.path.append(PATH)

import methods
import metrics

# reload() re-executes the modules, so the current state of their source
# files is used even if they were already imported earlier in this session.
importlib.reload(methods)
importlib.reload(metrics)

### **Adathalmaz beolvasása**

In [None]:
# Load the dataset through the project helper, then separate the "isfraud"
# target column from the remaining feature columns.
DATA = methods.read_paysim(get_original_data=False)

y = DATA["isfraud"]
X = DATA.drop(columns="isfraud")

# Preview the first rows of the feature frame
X.head(5)

### **Adathalmaz felosztása**

In [None]:
# 30 % of all rows are held out for testing; a quarter of the remaining rows
# (0.25 * 0.70 = 17.5 % of the full dataset) is used for validation.
TEST_SIZE = 0.30
VALIDATE_SIZE = 1/4

# First split: test set vs. everything else, stratified on the target
X_temp, X_test, y_temp, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=1,
    stratify=y,
)

# Second split: training vs. validation, stratified on the remaining targets
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=VALIDATE_SIZE,
    random_state=1,
    stratify=y_temp,
)

shape_summary = f"Shapes:\nTrain: {X_train.shape}\nValidation: {X_val.shape}\nTest: {X_test.shape}"
print(shape_summary)

### **Kategorikus oszlopok enkódolása**

In [None]:
# Columns treated as categorical; every other column counts as numerical
categorical_columns = ["transaction_type", "sender_receiver_type"]
numerical_columns = [
    column for column in X_train.columns
    if column not in categorical_columns
]

# The encoder is fitted on the training split only and then applied
# unchanged to the validation and test splits.
encoder = CountFrequencyEncoder(
    variables=categorical_columns,
    encoding_method="frequency",
)
X_train_encoded = encoder.fit_transform(X_train, y_train)
X_val_encoded = encoder.transform(X_val)
X_test_encoded = encoder.transform(X_test)

In [None]:
# Preview the first rows of the frequency-encoded training features
X_train_encoded.head(5)

#### Indexek ellenőrzése

In [None]:
# Encoding must not reorder or drop rows: compare each encoded frame's index
# with the index of the split it was produced from (train, val, test).
for encoded_frame, source_frame in (
    (X_train_encoded, X_train),
    (X_val_encoded, X_val),
    (X_test_encoded, X_test),
):
    print((encoded_frame.index == source_frame.index).all())

### **Numerikus oszlopok skálázása**

In [None]:
def _scale_numerical(encoded_frame):
    """Return the numerical columns of ``encoded_frame`` scaled by ``standard_scaler``.

    The result keeps the numerical column names and the row index of the input.
    """
    scaled_values = standard_scaler.transform(encoded_frame[numerical_columns])
    return pd.DataFrame(scaled_values, columns=numerical_columns, index=encoded_frame.index)


# Fit on the training split only; validation and test reuse the fitted scaler
standard_scaler = StandardScaler()
standard_scaler.fit(X_train_encoded[numerical_columns])

X_train_scaled = _scale_numerical(X_train_encoded)
X_val_scaled = _scale_numerical(X_val_encoded)
X_test_scaled = _scale_numerical(X_test_encoded)

# Final feature frames: encoded categorical columns first, scaled numerical columns after
X_train_transformed = pd.concat([X_train_encoded[categorical_columns], X_train_scaled], axis=1)
X_val_transformed = pd.concat([X_val_encoded[categorical_columns], X_val_scaled], axis=1)
X_test_transformed = pd.concat([X_test_encoded[categorical_columns], X_test_scaled], axis=1)

In [None]:
# Preview the first rows of the encoded and scaled training features
X_train_transformed.head(5)

#### Indexek ellenőrzése

In [None]:
# Scaling and concatenation must leave the row index of every split untouched
train_index_matches = (X_train_transformed.index == X_train.index).all()
test_index_matches = (X_test_transformed.index == X_test.index).all()
val_index_matches = (X_val_transformed.index == X_val.index).all()

print(f"Train index: {train_index_matches}")
print(f"Test index: {test_index_matches}")
print(f"Val index: {val_index_matches}")

### **Mintasúlyok létrehozása**

In [None]:
# Training rows with label 1 get weight 10, every other row gets weight 1

positive_mask = y_train == 1
sample_weights = np.where(positive_mask, 10, 1)

### **Tensorflow és Keras segítségével az FNN szerkezetének definiálása**

In [None]:
# Network dimensions and training hyperparameters
INPUT_DIMENSION = X_train_transformed.shape[1]  # one input per feature column
OUTPUT_DIMENSION = 1
NEURONS = 6
BATCH_SIZE = 256
EPOCHS = 15

def build_fnn(num_layers,
              optimizer_param,
              hidden_activation_function_param,
              output_activation_function_param,
              add_dropout=False,
              dropout_rate=None):
    """Build and compile a feed-forward network with a single-output head.

    Architecture, in order:
      * an input layer of width ``INPUT_DIMENSION``;
      * ``num_layers - 1`` repetitions of Dense(``NEURONS``) ->
        BatchNormalization -> (optional) Dropout;
      * one more Dense(``NEURONS``) -> BatchNormalization pair, which never
        gets a Dropout layer;
      * a Dense output layer of width ``OUTPUT_DIMENSION``.

    Because the last hidden pair is always added, any ``num_layers`` below 1
    still yields a network with one hidden layer.

    Args:
        num_layers: Number of hidden Dense layers.
        optimizer_param: Optimizer passed to ``compile``.
        hidden_activation_function_param: Activation of every hidden Dense layer.
        output_activation_function_param: Activation of the output layer.
        add_dropout: Whether to add Dropout after the repeated hidden blocks.
        dropout_rate: Rate handed to the Dropout layers; required when
            ``add_dropout`` is true.

    Returns:
        The compiled ``Sequential`` model (binary cross-entropy loss; accuracy,
        AUC, precision and recall metrics).

    Raises:
        ValueError: If ``add_dropout`` is true but ``dropout_rate`` is ``None``.
    """
    # Fail early with a clear message instead of an opaque error raised from
    # inside the Dropout layer constructor.
    if add_dropout and dropout_rate is None:
        raise ValueError("dropout_rate must be provided when add_dropout=True")

    layers = tf.keras.layers

    fnn = tf.keras.models.Sequential()
    fnn.add(layers.Input(shape=(INPUT_DIMENSION,)))

    for _ in range(num_layers - 1):
        fnn.add(layers.Dense(NEURONS, activation=hidden_activation_function_param))
        fnn.add(layers.BatchNormalization())
        if add_dropout:
            fnn.add(layers.Dropout(dropout_rate))

    # Last hidden block: intentionally kept without Dropout, as in the loop-free tail
    fnn.add(layers.Dense(NEURONS, activation=hidden_activation_function_param))
    fnn.add(layers.BatchNormalization())
    fnn.add(layers.Dense(OUTPUT_DIMENSION, activation=output_activation_function_param))

    fnn.compile(
        optimizer=optimizer_param,
        loss="binary_crossentropy",
        metrics=["accuracy", "auc", "precision", "recall"]
    )

    return fnn

### **FNN létrehozása**

In [None]:
# Two hidden layers, ReLU hidden activations, sigmoid output, no dropout
optimizer = tf.keras.optimizers.AdamW(learning_rate=1e-5)
fnn = build_fnn(
    num_layers=2,
    optimizer_param=optimizer,
    hidden_activation_function_param="relu",
    output_activation_function_param="sigmoid",
    add_dropout=False,
)
fnn.summary()

### **Szükséges mappák, callback-ek definiálása**

In [None]:
# Output locations: one folder for model checkpoints, one for result plots
saved_models_dirname = "saved_models"
fnn_result_plots_dir = "fnn_result_plots"
os.makedirs(saved_models_dirname, exist_ok=True)

# The same minute-resolution timestamp names the checkpoint file and the
# plot sub-folder of this run.
yyyymmddHHMM = datetime.now().strftime("%Y%m%d%H%M")
model_basename = f"{yyyymmddHHMM}_batch{BATCH_SIZE}_epochs{EPOCHS}_fnn.keras"
filename = os.path.join(saved_models_dirname, model_basename)

current_fnn_dir = os.path.join(fnn_result_plots_dir, yyyymmddHHMM)
os.makedirs(current_fnn_dir, exist_ok=True)

# Checkpoint: write the full model whenever the validation loss improves
cb_save_model = tf.keras.callbacks.ModelCheckpoint(
    filepath=filename,
    monitor='val_loss',
    mode='min',
    save_best_only=True,
    save_weights_only=False,
    verbose=1,
)
# Early stopping: patience of 2 epochs on the validation loss, restoring the best weights
cb_early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)
cb = [cb_save_model, cb_early_stopping]

print(f"Batch size: {BATCH_SIZE}\nEpochs: {EPOCHS}")

### **FNN modell betöltése vagy tanítása**

In [None]:
# Switch between loading a previously saved model and training a new one
LOAD_MODEL = True
LOAD_MODEL_FILENAME = "202504241513_batch256_epochs15_fnn.keras"

if LOAD_MODEL:
    print(f"Modell betöltés fájlból...\n{LOAD_MODEL_FILENAME}")
    # Build the path from the same directory name the checkpoint callback
    # writes to, instead of a second hard-coded "saved_models/" string.
    fnn = tf.keras.models.load_model(os.path.join(saved_models_dirname, LOAD_MODEL_FILENAME))
else:
    print("Modell tanítás megkezdése...")
    history = fnn.fit(
        X_train_transformed, y_train,
        shuffle=True,
        epochs=EPOCHS,
        batch_size=BATCH_SIZE,
        callbacks=cb,
        validation_data=(X_val_transformed, y_val),
        sample_weight=sample_weights
    )
    # The training-history figure is saved into this run's plot folder
    model_loss = methods.plot_history(history)
    model_loss.savefig(os.path.join(current_fnn_dir, "model_loss.png"))

### **Előrejelzések készítése**

In [None]:
# Model outputs for the test split and for the validation split (in that order)
y_test_predicted, y_val_predicted = (
    fnn.predict(features)
    for features in (X_test_transformed, X_val_transformed)
)

### **Metrikák, küszöbértékek kiszámítása**

In [None]:
# Precision and recall for every candidate threshold, computed on the validation split
precision, recall, threshold = precision_recall_curve(y_val, y_val_predicted)

# precision/recall are sliced with [:-1] so they line up with `threshold`.
# Where precision + recall == 0 the F1 ratio would be 0/0 = NaN, and
# np.argmax returns the position of a NaN, which would select a meaningless
# threshold. F1 is therefore defined as 0 at those points.
pr_sum = precision[:-1] + recall[:-1]
f1_score = np.divide(
    2 * precision[:-1] * recall[:-1],
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)

# NOTE(review): average precision is computed on the test split, while the
# precision/recall arrays above come from the validation split - confirm this
# mix is intended wherever they are reported together.
average_precision = average_precision_score(y_test, y_test_predicted)

# Threshold with the highest validation F1 score
best_index = np.argmax(f1_score)
best_threshold = threshold[best_index]

best_precision = precision[best_index]
best_recall = recall[best_index]
max_f1_score = f1_score[best_index]

### **ROC görbe**

In [None]:
# ROC curve and its area, computed from the test-split predictions
fpr, tpr, thresholds = roc_curve(y_test, y_test_predicted)
roc_auc = auc(fpr, tpr)

roc_auc_curve = methods.plot_roc_curve(fpr, tpr, roc_auc)

# The figure is written to disk only when the model was trained in this run
if not LOAD_MODEL:
    roc_auc_curve.savefig(os.path.join(current_fnn_dir, "ROC_curve.png"))

### **Osztályokhoz rendelés, és $classification\_report$**

In [None]:
# A test sample is assigned class 1 when its score reaches the selected threshold
reaches_threshold = y_test_predicted >= best_threshold
y_test_predicted_best_threshold = reaches_threshold.astype(int)

test_report = classification_report(y_test, y_test_predicted_best_threshold)
print(f"Test:\n{test_report}")

### **Metrikák kiíratása**

In [None]:
# Report the metrics of the thresholded test predictions through the project
# helper, followed by the ROC-AUC score and the selected threshold.
metrics.print_metrics(
    y_test,
    y_test_predicted_best_threshold,
    LOAD_MODEL,
    FILENAME=filename,
    LOAD_MODEL_FILENAME=LOAD_MODEL_FILENAME,
)
print(f"ROC-AUC score: {roc_auc}")
print(f"Best threshold: {best_threshold}")

### **Konfúziós mátrix**

In [None]:
# Confusion matrix of the thresholded test predictions
cm = confusion_matrix(y_test, y_test_predicted_best_threshold)

confusion_matrix_best_threshold = methods.plot_confusion_matrix(cm)

# The figure is written to disk only when the model was trained in this run
if not LOAD_MODEL:
    confusion_matrix_best_threshold.savefig(os.path.join(current_fnn_dir, "confusion_matrix.png"))

### **Legnagyobb F1 score**

In [None]:
# F1 score over the candidate thresholds, with the selected threshold and its F1 value
f1_score_plot = methods.plot_f1_score(threshold, best_threshold, f1_score, max_f1_score)

# The figure is written to disk only when the model was trained in this run
if not LOAD_MODEL:
    f1_score_plot.savefig(os.path.join(current_fnn_dir, "max_f1_score.png"))

### **PR görbe**

In [None]:
# Precision-recall curve, passed together with the average precision and the
# precision/recall/F1 values at the selected threshold.
pr_curve = methods.plot_pr_curve(precision, recall, average_precision, best_precision, best_recall, max_f1_score)

# The figure is written to disk only when the model was trained in this run
if not LOAD_MODEL:
    pr_curve.savefig(os.path.join(current_fnn_dir, "PR_curve.png"))