## **Csalásfelderítés lehetőségei gépi tanuló modellek segítségével - LightGBM**

****

### **Könyvtárak, függvények, osztályok importálása**

In [None]:
import time
import os
import sys
import importlib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

from datetime import datetime
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, auc, confusion_matrix, classification_report

import lightgbm
from sklearn.model_selection import GridSearchCV

### **Saját modulok importálása**

In [None]:
# Walk upward from the working directory until a folder named "DataScience"
# is reached; the project's helper modules are expected in its "utils" subfolder.
current_dir = os.getcwd()
while os.path.basename(current_dir) != "DataScience":
    parent_dir = os.path.dirname(current_dir)

    # dirname() of a filesystem root returns the root itself, so an unchanged
    # path means the whole hierarchy was searched without a match.
    if parent_dir == current_dir:
        raise FileNotFoundError("A \"DataScience\" mappa nem található a mappa-hierarchiában.")

    current_dir = parent_dir

PATH = os.path.join(current_dir, "utils")

# Notebook cells get re-executed; avoid stacking duplicate entries on sys.path.
if PATH not in sys.path:
    sys.path.append(PATH)

import methods
import metrics

# Re-execute the helper modules so that edits made since the first import
# take effect in the running kernel.
importlib.reload(methods)
importlib.reload(metrics)

### **Adathalmaz beolvasása**

In [None]:
# Load the PaySim data through the project helper.
# NOTE(review): get_original_data=False presumably selects a preprocessed
# variant of the dataset — confirm in methods.read_paysim.
DATA = methods.read_paysim(get_original_data=False)

# Target vector and feature matrix ("isfraud" is the label column).
y = DATA["isfraud"]
X = DATA.drop(columns="isfraud")

# Preview of the first five feature rows (rendered as the cell output).
X.head(5)

### **Adathalmaz felosztása**

In [None]:
TEST_SIZE = 0.30
VALIDATE_SIZE = 1/4  # applied to the non-test remainder, not to the full dataset

# Step 1: hold out the test set. Step 2: split what is left into train and
# validation. Both splits are stratified on the label and use a fixed seed.
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=1,
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp,
    test_size=VALIDATE_SIZE,
    stratify=y_temp,
    random_state=1,
)

# The intermediate train+validation pool is not needed any more.
del X_temp, y_temp

print(f"Shapes:\nTrain: {X_train.shape}\nValidation: {X_val.shape}\nTest: {X_test.shape}")

### **Numerikus oszlopok skálázása**

In [None]:
categorical_columns = ["transaction_type", "sender_receiver_type"]
# Every remaining feature column is treated as numerical.
numerical_columns = [name for name in X.columns if name not in categorical_columns]

# Fit the scaler on the training split only; validation and test are
# transformed with the statistics learned from training.
standard_scaler = StandardScaler().fit(X_train[numerical_columns])


def _scale_and_merge(frame):
    """Return *frame* with standardized numerical columns.

    The untouched categorical columns come first, followed by the scaled
    numerical columns; the original row index is preserved.
    """
    scaled_numericals = pd.DataFrame(
        standard_scaler.transform(frame[numerical_columns]),
        index=frame.index,
        columns=numerical_columns,
    )
    return pd.concat([frame[categorical_columns], scaled_numericals], axis=1)


X_train_transformed = _scale_and_merge(X_train)
X_val_transformed = _scale_and_merge(X_val)
X_test_transformed = _scale_and_merge(X_test)

### **Kategorikus oszlopok $category$ típussá konvertálása**

In [None]:
# Cast the categorical features to pandas "category" dtype.
# The dtype is derived once from the training split and reused for the
# validation and test splits, so all three frames share the same category set
# and category-to-code mapping even when a value is absent from one split.
# Values that never occur in training become NaN in the other splits.
for col in categorical_columns:
    train_as_category = X_train_transformed[col].astype("category")
    shared_dtype = train_as_category.dtype

    X_train_transformed[col] = train_as_category
    X_test_transformed[col] = X_test_transformed[col].astype(shared_dtype)
    X_val_transformed[col] = X_val_transformed[col].astype(shared_dtype)

In [None]:
# First five rows of the transformed training frame (rendered as the cell output).
X_train_transformed.iloc[:5]

#### Indexek ellenőrzése

In [None]:
# The transformed frames were rebuilt from scratch, so verify that their row
# indices still match the raw splits element by element.
train_index_ok = (X_train_transformed.index == X_train.index).all()
test_index_ok = (X_test_transformed.index == X_test.index).all()
val_index_ok = (X_val_transformed.index == X_val.index).all()

print(f"Train index: {train_index_ok}")
print(f"Test index: {test_index_ok}")
print(f"Val index: {val_index_ok}")

### **Szükséges mappák létrehozása**

In [None]:
# Minute-resolution timestamp that tags every artifact written by this run.
yyyymmddHHMM = datetime.now().strftime("%Y%m%d%H%M")

saved_models_dirname = "saved_models"
lightgbm_result_plots_dir = "lightgbm_result_plots"
# Plots of this run go into their own timestamped subfolder.
current_lightgbm_dir = os.path.join(lightgbm_result_plots_dir, yyyymmddHHMM)

# makedirs creates missing parents too, so the plot root needs no separate call.
for directory in (saved_models_dirname, current_lightgbm_dir):
    os.makedirs(directory, exist_ok=True)

# Target path of the model that is pickled when training runs.
filename = os.path.join(saved_models_dirname, f"{yyyymmddHHMM}_lightgbm.pkl")

### **GridSearchCV és LightGBM modell definiálása**

In [None]:
# Run switches:
#   LOAD_MODEL           - load a pickled model instead of building a new one
#   LOAD_MODEL_FILENAME  - file (inside "saved_models") to load when LOAD_MODEL is set
#   GRIDSEARCH           - run GridSearchCV before building the model
LOAD_MODEL = True
LOAD_MODEL_FILENAME = "202504162048_lightgbm.pkl"
GRIDSEARCH = False

# Parameters that are always passed when the model is constructed.
lgbm_fix_params = {
    "random_state": 1, "objective": "binary",
    "metric": "binary_logloss",
}
# Baseline hyperparameters; the best values found by GridSearchCV (when it
# runs) are merged on top of these.
# NOTE(review): per the LightGBM docs, row subsampling (bagging) is only
# enabled when "subsample_freq" is non-zero; it is not set here, so the
# "subsample" value may have no effect — confirm the intent.
lgbm_params = {
    "learning_rate": 0.001, "colsample_bytree": 0.7,
    "subsample": 0.7, "n_estimators": 100,
    "num_leaves": 16, "max_depth": 4,
    "reg_alpha": 0.8, "reg_lambda": 0.8,
}
# Container handed to lightgbm.record_evaluation to collect metric values
# during training.
evals = {}

if LOAD_MODEL:
    # Load a previously saved model from disk.
    # NOTE(review): joblib.load unpickles the file — only load model files
    # that come from a trusted source.
    print(f"Modell betöltés fájlból...\n{LOAD_MODEL_FILENAME}")
    lgbm = joblib.load(f"saved_models/{LOAD_MODEL_FILENAME}")
    print("Modell betöltve")

else:
    # Build a new model, optionally after a hyperparameter grid search.
    if GRIDSEARCH:

        # Hyperparameter grid to search; it is currently empty, so only the
        # baseline configuration above gets cross-validated.
        gridsearch_params = {}

        grid_search = GridSearchCV(
            estimator=lightgbm.LGBMClassifier(**lgbm_fix_params, **lgbm_params),
            param_grid=gridsearch_params,
            cv=3,
            scoring=["precision", "recall", "f1", "roc_auc"],
            refit="f1",  # the best configuration is chosen by F1
            return_train_score=True,
            n_jobs=6
        )

        print(f"Info: GridSearchCV elkezdése a következő hiperparaméterekkel:\n{gridsearch_params}")
        # perf_counter is monotonic, so the elapsed time cannot be distorted
        # by system clock adjustments; only the fit call itself is timed.
        grid_search_start = time.perf_counter()
        grid_search.fit(X_train_transformed, y_train)
        grid_search_end = time.perf_counter()
        print("Info: GridSearchCV befejezve")
        print(f"Eltelt idő: {grid_search_end - grid_search_start} s")

        print(f"\nInfo: GridSearch legjobb hiperparaméterek kombinációk:\n{grid_search.best_params_}")
        print(f"Alap hiperparaméterek:\n{lgbm_fix_params}")

        # Grid-search winners override the baseline values.
        lgbm_params = {**lgbm_params, **grid_search.best_params_}

    print(f"Info: LightGBM modell létrehozása a következő hiperparaméterekkel\n{lgbm_params}")

    lgbm = lightgbm.LGBMClassifier(
        **lgbm_fix_params,
        **lgbm_params
    )

    # Print every parameter of the estimator, including library defaults.
    print("\nTeljes hiperparaméter lista:")
    for param_name, param_value in lgbm.get_params().items():
        print(f"{param_name}: {param_value}")

### **LightGBM modell betöltése vagy tanítása**

In [None]:
# Keyword arguments forwarded to lgbm.fit():
#   - metrics are evaluated on both the training and the validation split,
#   - training stops early after 10 rounds without improvement,
#   - per-iteration metric values are recorded into `evals`.
lgbm_fit_params = dict(
    eval_set=[(X_train_transformed, y_train), (X_val_transformed, y_val)],
    eval_metric="binary_logloss",
    callbacks=[
        lightgbm.early_stopping(stopping_rounds=10, verbose=True),
        lightgbm.record_evaluation(evals),
    ],
    categorical_feature=categorical_columns,
)

if LOAD_MODEL:
    # A model loaded from disk is used as-is.
    print("Info: Beolvasott modell nem kerül újra tanításra")
else:
    print("Modell tanítás megkezdése...")
    lgbm.fit(X_train_transformed, y_train, **lgbm_fit_params)

    # Persist the freshly trained model under this run's timestamped filename.
    print(f"Modell mentése a következő elérési útra: {filename}")
    joblib.dump(lgbm, filename)

### **Metrika változása tanítás alatt (tanulási görbe)**

In [None]:
# The recorded metric history only exists when the model was trained in this run.
if LOAD_MODEL:
    print("Info: Modell betöltésnél nem ábrázolható a tanítás közbeni metrika változása.\nA következő képen lehet megtekinteni: \"metric_during_training.png\"")
else:
    lightgbm.plot_metric(evals)
    training_metric_path = os.path.join(current_lightgbm_dir, "metric_during_training.png")
    plt.savefig(training_metric_path)

### **Feature Importance**

In [None]:
# Draw the model's feature importances.
lightgbm.plot_importance(lgbm)
plt.tight_layout()

# The figure is only written to disk for a model trained in this run.
if not LOAD_MODEL:
    feature_importance_path = os.path.join(current_lightgbm_dir, "feature_importance.png")
    plt.savefig(feature_importance_path)

#### Maximális fa mélység lekérése (optimalizáláshoz kell)

In [None]:
# Dump the trained booster and hand its per-tree structures to the project
# helper that reports the maximum tree depth.
booster_dump = lgbm.booster_.dump_model()
tree_info = booster_dump["tree_info"]

max_depth = methods.lgbm_get_max_tree_depth(tree_info)
print(f"Maximális fa mélység: {max_depth}")

### **Előrejelzések készítése**

In [None]:
# Keep only the second predict_proba column for each split.
# NOTE(review): presumably this is the probability of the fraud class —
# confirm against lgbm.classes_.
val_class_probabilities = lgbm.predict_proba(X_val_transformed)
test_class_probabilities = lgbm.predict_proba(X_test_transformed)

X_val_predicted_proba = val_class_probabilities[:, 1]
X_test_predicted_proba = test_class_probabilities[:, 1]

### **Metrikák, küszöbértékek kiszámítása**

In [None]:
# Precision/recall for every candidate threshold, computed on the validation
# split so that the threshold is not tuned on the test data.
precision, recall, threshold = precision_recall_curve(y_val, X_val_predicted_proba)

# precision and recall hold one more element than threshold; dropping the
# last one aligns them with the thresholds.
precision_at_threshold = precision[:-1]
recall_at_threshold = recall[:-1]
precision_recall_sum = precision_at_threshold + recall_at_threshold

# F1 = 2PR / (P + R). Where precision and recall are both zero this is 0/0,
# which would produce NaN — and np.argmax treats NaN as the maximum, so a NaN
# entry would be selected as the "best" threshold. Define F1 as 0 there.
f1_score = np.divide(
    2 * precision_at_threshold * recall_at_threshold,
    precision_recall_sum,
    out=np.zeros_like(precision_recall_sum, dtype=float),
    where=precision_recall_sum > 0,
)

# NOTE(review): average precision is computed on the TEST split, while the
# precision/recall arrays above come from the VALIDATION split. Both are later
# passed to the same PR plot — confirm that this mix is intended.
average_precision = average_precision_score(y_test, X_test_predicted_proba)

# Threshold that maximizes F1 on the validation split, and the metrics at it.
best_index = np.argmax(f1_score)
best_threshold = threshold[best_index]

best_precision = precision[best_index]
best_recall = recall[best_index]
max_f1_score = f1_score[best_index]

### **ROC görbe**

In [None]:
# ROC curve and its area, evaluated on the test split.
fpr, tpr, thresholds = roc_curve(y_test, X_test_predicted_proba)
roc_auc = auc(fpr, tpr)

# The project helper draws the curve and returns an object exposing savefig.
roc_auc_curve = methods.plot_roc_curve(fpr, tpr, roc_auc)

# Plots are only saved for a model trained in this run.
if not LOAD_MODEL:
    roc_plot_path = os.path.join(current_lightgbm_dir, "ROC_curve.png")
    roc_auc_curve.savefig(roc_plot_path)

### **Osztályokhoz rendelés, és $classification\_report$**

In [None]:
# Turn test probabilities into 0/1 labels using the threshold selected earlier.
meets_threshold = X_test_predicted_proba >= best_threshold
y_test_predicted_best_threshold = meets_threshold.astype(int)

test_report = classification_report(y_test, y_test_predicted_best_threshold)
print(f"Test:\n{test_report}")

### **Metrikák kiíratása**

In [None]:
# Let the project helper print the test-set metrics; it also receives the
# load flag and both model filenames.
metrics.print_metrics(
    y_test,
    y_test_predicted_best_threshold,
    LOAD_MODEL,
    FILENAME=filename,
    LOAD_MODEL_FILENAME=LOAD_MODEL_FILENAME,
)

# Two summary lines, one per print argument.
print(f"ROC-AUC score: {roc_auc}", f"Best threshold: {best_threshold}", sep="\n")

### **Konfúziós mátrix**

In [None]:
# Confusion matrix of the thresholded test predictions.
cm = confusion_matrix(y_test, y_test_predicted_best_threshold)
confusion_matrix_best_threshold = methods.plot_confusion_matrix(cm)

# Plots are only saved for a model trained in this run.
if not LOAD_MODEL:
    confusion_matrix_path = os.path.join(current_lightgbm_dir, "confusion_matrix.png")
    confusion_matrix_best_threshold.savefig(confusion_matrix_path)

### **Legnagyobb F1 score**

In [None]:
# F1 as a function of the threshold, with the selected optimum passed along.
f1_score_plot = methods.plot_f1_score(
    threshold,
    best_threshold,
    f1_score,
    max_f1_score,
)

# Plots are only saved for a model trained in this run.
if not LOAD_MODEL:
    f1_plot_path = os.path.join(current_lightgbm_dir, "max_f1_score.png")
    f1_score_plot.savefig(f1_plot_path)

### **PR görbe**

In [None]:
# Precision-recall curve, with the average precision and the best-F1
# operating point passed to the plotting helper.
pr_curve = methods.plot_pr_curve(
    precision,
    recall,
    average_precision,
    best_precision,
    best_recall,
    max_f1_score,
)

# Plots are only saved for a model trained in this run.
if not LOAD_MODEL:
    pr_plot_path = os.path.join(current_lightgbm_dir, "PR_curve.png")
    pr_curve.savefig(pr_plot_path)