In [1]:
import sys
import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# 1. Make the project's `src` directory importable.
#    The project root is taken to be the parent of the current working
#    directory, so this only resolves correctly when the kernel's cwd is a
#    direct sub-folder of the project.
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
src_path = os.path.join(project_root, 'src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

# 2. Import the project's own modules.
#    Retrying the very same imports after a failure cannot succeed, so on
#    ImportError we report where we looked and re-raise the original error
#    instead of producing a second, chained one.
try:
    import config
    from utils import setup_logger, FlagDataset, BaselineLSTM
except ImportError as e:
    print(f"Error importing modules: {e}")
    print(f"    (project modules were expected in: {src_path})")
    raise

# 3. Initialise the logger shared by the cells below.
logger = setup_logger()



In [2]:

def prepare_data(label_path, data_root, output_dir, batch_size, seq_len):
    """
    Load the label table, split it and build the train/validation DataLoaders.

    Args:
        label_path: Path of the label CSV. It must contain the columns
            'clean_csv_filename' and 'label'.
        data_root: Directory handed to FlagDataset as `csv_dir`.
        output_dir: Directory where 'classes.npy' (the fitted class names)
            is written. Created if it does not exist yet.
        batch_size: Batch size of both DataLoaders.
        seq_len: Sequence length handed to FlagDataset.

    Returns:
        A dict with the keys 'train' and 'val' (DataLoaders), 'weights'
        (balanced class weights computed on the training split) and
        'num_classes', or None when the label file is missing, lacks a
        required column, or holds fewer than 32 usable rows.
    """
    logger.info("\n[1] DATA PREPARATION...")
    logger.info(f"    Labels: {label_path}")
    logger.info(f"    CSV Data: {data_root}")

    if not os.path.exists(label_path):
        logger.error(f"ERROR: Label file not found: {label_path}")
        return None

    df_labels = pd.read_csv(label_path)

    # Report a malformed label file the same way as a missing one instead of
    # failing later with a bare KeyError.
    missing_columns = [c for c in ('clean_csv_filename', 'label') if c not in df_labels.columns]
    if missing_columns:
        logger.error(f"ERROR: Label file is missing required column(s): {missing_columns}")
        return None

    # Keep only the rows that have a generated CSV.
    df_labels = df_labels.dropna(subset=['clean_csv_filename'])

    logger.info(f"    -> Total samples: {len(df_labels)}")
    if len(df_labels) < 32:
        logger.error("ERROR: Not enough data for training (<32 samples)!")
        return None

    # Label encoding: class index = position in the sorted class names.
    le = LabelEncoder()
    df_labels = df_labels.assign(label_idx=le.fit_transform(df_labels['label']))

    # Persist the class names so that predictions can be decoded later.
    os.makedirs(output_dir, exist_ok=True)
    classes_path = os.path.join(output_dir, 'classes.npy')
    np.save(classes_path, le.classes_)
    logger.info(f"    -> Classes saved to: {classes_path}")
    logger.info(f"    -> Classes: {le.classes_}")

    # Split: 15% is held out as the test part (not used here; evaluate_model
    # rebuilds it with the same test_size and random_state). The remaining 85%
    # is split again with 0.176, i.e. roughly another 15% of the total (0.176 * 0.85 ~ 0.15).
    train_val, _ = train_test_split(df_labels, test_size=0.15, stratify=df_labels['label'], random_state=42)
    train, val = train_test_split(train_val, test_size=0.176, stratify=train_val['label'], random_state=42)

    # Balanced class weights, computed on the training split only.
    y_train = train['label_idx'].values
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)

    # Datasets. Note: data_root is the directory that holds the CSV files.
    train_ds = FlagDataset(train, csv_dir=data_root, seq_len=seq_len)
    val_ds = FlagDataset(val, csv_dir=data_root, seq_len=seq_len)

    # DataLoaders: only the training data is shuffled.
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    return {
        'train': train_loader,
        'val': val_loader,
        'weights': class_weights,
        'num_classes': len(le.classes_)
    }


def _run_epoch(model, loader, criterion, device, optimizer=None, clip_value=None):
    """Run one pass over `loader`; train when `optimizer` is given, otherwise only evaluate.

    Returns:
        (avg_loss, accuracy_percent): the mean of the per-batch losses and the
        accuracy in percent. Both are 0 when the loader yields no batch.
    """
    training = optimizer is not None
    model.train(training)  # model.train(False) is equivalent to model.eval()

    loss_sum, n_batches, correct, total = 0.0, 0, 0, 0
    with torch.set_grad_enabled(training):
        for X, y in loader:
            X, y = X.to(device), y.to(device)
            if training:
                optimizer.zero_grad()

            outputs = model(X)
            loss = criterion(outputs, y)

            if training:
                loss.backward()
                if clip_value is not None:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), clip_value)
                optimizer.step()

            loss_sum += loss.item()
            n_batches += 1
            predicted = outputs.argmax(dim=1)
            total += y.size(0)
            correct += (predicted == y).sum().item()

    avg_loss = loss_sum / n_batches if n_batches > 0 else 0.0
    accuracy = 100 * correct / total if total > 0 else 0
    return avg_loss, accuracy


def train_engine(model, data_package, model_name="baseline_lstm"):
    """
    Train `model` with class-weighted cross-entropy and keep the best checkpoint.

    Args:
        model: The torch module to train; it is moved to CUDA when available,
            otherwise to the CPU.
        data_package: Dict providing the 'train' and 'val' DataLoaders and the
            'weights' array used as per-class loss weights (the structure
            returned by prepare_data).
        model_name: Used in the log header and as the checkpoint file prefix.

    Behaviour:
        * Adam with config.LEARNING_RATE, gradient-norm clipping at
          config.CLIP_VALUE, and ReduceLROnPlateau driven by the validation loss.
        * Whenever the validation accuracy improves, the state dict is saved to
          '<config.OUTPUT_DIR>/<model_name>_best.pth'.
        * Training stops early once the validation loss has not improved for
          config.EARLY_STOPPING_PATIENCE consecutive epochs.

    Returns:
        None. Returns early, after logging an error, when the train or the
        validation loader has no batches.
    """
    logger.info(f"\n[2] TRAINING {model_name.upper()} START...")
    logger.info(f"    -> Config: BS={config.BATCH_SIZE}, Seq={config.SEQUENCE_LENGTH}, In={config.INPUT_SIZE}")
    logger.info(f"    -> Model: Hidden={config.HIDDEN_SIZE}, Layers={config.NUM_LAYERS}")

    train_loader = data_package['train']
    val_loader = data_package['val']

    # Fail early with a clear message instead of dividing by zero after a full epoch.
    if len(train_loader) == 0 or len(val_loader) == 0:
        logger.error("ERROR: Train or validation loader is empty, nothing to train on!")
        return

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"    -> Device: {device}")

    model = model.to(device)

    # Loss & optimizer
    weights_tensor = torch.tensor(data_package['weights'], dtype=torch.float).to(device)
    criterion = nn.CrossEntropyLoss(weight=weights_tensor)

    optimizer = optim.Adam(model.parameters(), lr=config.LEARNING_RATE)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)

    save_path = os.path.join(config.OUTPUT_DIR, f'{model_name}_best.pth')

    best_val_acc = 0.0
    best_val_loss = float('inf')
    patience_counter = 0

    for epoch in range(config.EPOCHS):
        avg_t_loss, t_acc = _run_epoch(
            model, train_loader, criterion, device,
            optimizer=optimizer, clip_value=config.CLIP_VALUE,
        )
        avg_v_loss, v_acc = _run_epoch(model, val_loader, criterion, device)

        # The scheduler is stepped before logging, so the logged LR is the one
        # that will be used in the next epoch.
        scheduler.step(avg_v_loss)

        logger.info(
            f"Epoch {epoch + 1}/{config.EPOCHS} | "
            f"Loss: {avg_t_loss:.4f}/{avg_v_loss:.4f} | "
            f"Acc: {t_acc:.1f}%/{v_acc:.1f}% | "
            f"LR: {optimizer.param_groups[0]['lr']:.6f}"
        )

        # Checkpoint: selected by validation accuracy.
        if v_acc > best_val_acc:
            best_val_acc = v_acc
            torch.save(model.state_dict(), save_path)
            logger.info(f"    -> Model saved! (Acc: {v_acc:.1f}%)")

        # Early stopping: driven by validation loss (not by accuracy).
        if avg_v_loss < best_val_loss:
            best_val_loss = avg_v_loss
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= config.EARLY_STOPPING_PATIENCE:
                logger.info(f"\n[STOP] Early Stopping triggered after {config.EARLY_STOPPING_PATIENCE} epochs.")
                break

    logger.info("\n[INFO] Training finished.")


# Baseline model new

In [1]:

if __name__ == "__main__":
    # Make sure the output directory exists (no separate exists() check, so
    # there is no window between checking and creating).
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)

    # Load the data using the parameters from config.
    data = prepare_data(
        label_path=config.LABEL_FILE,
        data_root=config.DATA_ROOT,
        output_dir=config.OUTPUT_DIR,
        batch_size=config.BATCH_SIZE,       # from config
        seq_len=config.SEQUENCE_LENGTH      # from config
    )

    # prepare_data returns None when the data could not be prepared.
    if data is not None:
        # Build the model. Only the input size and the number of classes are
        # passed here; per the original author's note, BaselineLSTM in utils.py
        # reads the hidden size from config itself.
        model = BaselineLSTM(
            input_size=config.INPUT_SIZE,   # from config
            num_classes=data['num_classes']
        )

        # Start training
        train_engine(model, data, model_name="baseline_lstm")

2025-12-12 20:01:20,099 - INFO - 
[1] DATA PREPARATION...
2025-12-12 20:01:20,099 - INFO -     Labels: /home/bence/PycharmProjects/Melytanulas/data/ground_truth_labels.csv
2025-12-12 20:01:20,100 - INFO -     CSV Data: /home/bence/PycharmProjects/Melytanulas/data
2025-12-12 20:01:20,105 - INFO -     -> Total samples: 552
2025-12-12 20:01:20,107 - INFO -     -> Classes saved to: /home/bence/PycharmProjects/Melytanulas/notebook/output/classes.npy
2025-12-12 20:01:20,107 - INFO -     -> Classes: ['Bearish Normal' 'Bearish Pennant' 'Bearish Wedge' 'Bullish Normal'
 'Bullish Pennant' 'Bullish Wedge']
2025-12-12 20:01:20,115 - INFO - 
[2] TRAINING BASELINE_LSTM START...
2025-12-12 20:01:20,115 - INFO -     -> Config: BS=32, Seq=50, In=4
2025-12-12 20:01:20,115 - INFO -     -> Model: Hidden=64, Layers=2
2025-12-12 20:01:20,116 - INFO -     -> Device: cpu
2025-12-12 20:01:28,024 - INFO - Epoch 1/1000 | Loss: 1.7915/1.7882 | Acc: 9.6%/14.5% | LR: 0.001000
2025-12-12 20:01:28,027 - INFO -     ->

In [3]:
import os
import pandas as pd
import numpy as np
import torch
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


def evaluate_model():
    """
    Evaluate the saved baseline LSTM on the regenerated test split.

    Steps:
        1. Locate the label file (config.LABEL_FILE, falling back to
           'ground_truth_labels.csv' under config.DATA_ROOT), the checkpoint
           'baseline_lstm_best.pth' and 'classes.npy' in config.OUTPUT_DIR.
        2. Rebuild the test split exactly as prepare_data does (same dropna,
           test_size=0.15, stratification and random_state=42), encoding the
           labels with the class order stored in 'classes.npy'.
        3. Load the model weights, predict on the test set and log a
           classification report.

    Returns:
        None. Every failure (missing file, unknown label, empty test set,
        checkpoint that cannot be loaded) is logged and ends the function early.
    """
    logger.info("--- BASELINE MODEL KIÉRTÉKELÉSE ---")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    logger.info(f"Eszköz: {device}")

    # Paths derived from config: the model and the class names are expected in
    # the OUTPUT directory.
    model_path = os.path.join(config.OUTPUT_DIR, 'baseline_lstm_best.pth')
    classes_path = os.path.join(config.OUTPUT_DIR, 'classes.npy')

    # The label file location comes from config.
    label_path = config.LABEL_FILE

    # 1. Sanity checks
    if not os.path.exists(label_path):
        # Fallback: look for the label file in the DATA directory as well.
        alt_label_path = os.path.join(config.DATA_ROOT, "ground_truth_labels.csv")
        if os.path.exists(alt_label_path):
            label_path = alt_label_path
            logger.info(f"Label fájl megtalálva a DATA mappában: {label_path}")
        else:
            logger.error(f"HIBA: Nincs label fájl itt: {config.LABEL_FILE} (sem a DATA mappában)")
            return

    if not os.path.exists(model_path):
        logger.error(f"HIBA: Nincs elmentett modell itt: {model_path}. Futtasd le előbb a 02-training.py-t!")
        return

    if not os.path.exists(classes_path):
        logger.error(f"HIBA: Nincs classes.npy fájl itt: {classes_path}")
        return

    # 2. Rebuild the test set with the same logic and random_state as training.
    df_labels = pd.read_csv(label_path)

    # Same cleaning step as in training.
    df_labels = df_labels.dropna(subset=['clean_csv_filename'])

    # NOTE(review): allow_pickle=True unpickles the file content; only load a
    # classes.npy that was written by this project's own training run.
    class_names = np.load(classes_path, allow_pickle=True)

    # Encode the labels with the class order saved at training time, so that
    # the indices are guaranteed to mean the same thing as the model outputs.
    label_to_idx = {name: idx for idx, name in enumerate(class_names)}
    known_mask = df_labels['label'].isin(list(label_to_idx))
    if not known_mask.all():
        unknown = df_labels.loc[~known_mask, 'label'].unique().tolist()
        logger.error(f"HIBA: A label fájl olyan címkéket tartalmaz, amelyek nincsenek a classes.npy-ban: {unknown}")
        return
    df_labels = df_labels.assign(label_idx=df_labels['label'].map(label_to_idx).astype('int64'))

    # Split on exactly the rows prepare_data splits, so the held-out 15% is
    # the same set of samples that training never saw.
    _, test_df = train_test_split(df_labels, test_size=0.15, stratify=df_labels['label'], random_state=42)

    # Optional asset filter. It is applied AFTER the split: prepare_data does
    # not filter by asset, so filtering before the split would produce a
    # different partition and could move training samples into the test set.
    if hasattr(config, 'ALLOWED_ASSETS'):
        mask = test_df['clean_csv_filename'].apply(lambda x: any(a in str(x) for a in config.ALLOWED_ASSETS))
        test_df = test_df[mask].reset_index(drop=True)

    logger.info(f"Test Set mérete: {len(test_df)} minta")
    if len(test_df) == 0:
        logger.error("HIBA: A test set üres, nincs mit kiértékelni.")
        return

    # Dataset and loader. The CSV files live in config.DATA_ROOT; the sequence
    # length is passed explicitly so it matches the one used for training.
    test_ds = FlagDataset(test_df, csv_dir=config.DATA_ROOT, seq_len=config.SEQUENCE_LENGTH)
    test_loader = DataLoader(test_ds, batch_size=config.BATCH_SIZE, shuffle=False)

    # 3. Load the model
    model = BaselineLSTM(input_size=config.INPUT_SIZE, num_classes=len(class_names))

    try:
        model.load_state_dict(torch.load(model_path, map_location=device))
        model.to(device)
        model.eval()
    except Exception as e:
        logger.error(f"HIBA a modell betöltésekor: {e}")
        return

    # 4. Prediction
    all_preds = []
    all_targets = []

    logger.info("Predikció futtatása...")
    with torch.no_grad():
        for X, y in test_loader:
            X, y = X.to(device), y.to(device)
            predicted = model(X).argmax(dim=1)

            all_preds.extend(predicted.cpu().tolist())
            all_targets.extend(y.cpu().tolist())

    # 5. Results
    logger.info("VÉGLEGES TEST SET EREDMÉNYEK")

    # Passing `labels` explicitly keeps the report aligned with class_names
    # even when a class does not occur in the test targets or predictions.
    report = classification_report(
        all_targets,
        all_preds,
        labels=np.arange(len(class_names)),
        target_names=[str(name) for name in class_names],
        zero_division=0,
    )
    logger.info("\n" + report)


# Entry point: run the evaluation when this code is executed as the main module.
if __name__ == "__main__":
    evaluate_model()

2025-12-12 20:03:47,988 - INFO - --- BASELINE MODEL KIÉRTÉKELÉSE ---
2025-12-12 20:03:47,989 - INFO - Eszköz: cpu
2025-12-12 20:03:47,996 - INFO - Test Set mérete: 83 minta
2025-12-12 20:03:48,000 - INFO - Predikció futtatása...
2025-12-12 20:03:51,190 - INFO - VÉGLEGES TEST SET EREDMÉNYEK
2025-12-12 20:03:51,196 - INFO - 
                 precision    recall  f1-score   support

 Bearish Normal       0.45      0.77      0.57        22
Bearish Pennant       0.00      0.00      0.00         6
  Bearish Wedge       0.00      0.00      0.00         9
 Bullish Normal       0.70      0.27      0.39        26
Bullish Pennant       0.14      0.25      0.18         8
  Bullish Wedge       0.18      0.25      0.21        12

       accuracy                           0.35        83
      macro avg       0.24      0.26      0.22        83
   weighted avg       0.38      0.35      0.32        83



# Main model

In [3]:
# Project imports for the main model: the same names as in the first cell,
# plus HybridModel. The `src` directory must already be on sys.path (the first
# cell inserts it). No try/except here: retrying the identical imports after an
# ImportError cannot succeed, so the error is simply allowed to surface.
import config
from utils import setup_logger, FlagDataset, BaselineLSTM, HybridModel

# prepare_data and train_engine are reused unchanged from the cells above.

if __name__ == "__main__":
    # Make sure the output directory exists (no separate exists() check, so
    # there is no window between checking and creating).
    os.makedirs(config.OUTPUT_DIR, exist_ok=True)

    # Load the data with the same parameters as for the baseline run.
    data = prepare_data(
        label_path=config.LABEL_FILE,
        data_root=config.DATA_ROOT,
        output_dir=config.OUTPUT_DIR,
        batch_size=config.BATCH_SIZE,
        seq_len=config.SEQUENCE_LENGTH
    )

    # prepare_data returns None when the data could not be prepared.
    if data is not None:
        # --- Switch to the hybrid model ---
        logger.info(">>> HYBRID MODEL (CNN-Transformer) KIVÁLASZTVA <<<")

        model = HybridModel(
            input_size=config.INPUT_SIZE,
            num_classes=data['num_classes']
        )

        # Start training. The distinct model_name matters: it is the checkpoint
        # file prefix, so the baseline checkpoint is not overwritten.
        train_engine(model, data, model_name="hybrid_model")

2025-12-12 20:09:21,142 - INFO - 
[1] DATA PREPARATION...
2025-12-12 20:09:21,142 - INFO -     Labels: /home/bence/PycharmProjects/Melytanulas/data/ground_truth_labels.csv
2025-12-12 20:09:21,142 - INFO -     CSV Data: /home/bence/PycharmProjects/Melytanulas/data
2025-12-12 20:09:21,148 - INFO -     -> Total samples: 552
2025-12-12 20:09:21,151 - INFO -     -> Classes saved to: /home/bence/PycharmProjects/Melytanulas/notebook/output/classes.npy
2025-12-12 20:09:21,151 - INFO -     -> Classes: ['Bearish Normal' 'Bearish Pennant' 'Bearish Wedge' 'Bullish Normal'
 'Bullish Pennant' 'Bullish Wedge']
2025-12-12 20:09:21,155 - INFO - >>> HYBRID MODEL (CNN-Transformer) KIVÁLASZTVA <<<
2025-12-12 20:09:21,160 - INFO - 
[2] TRAINING HYBRID_MODEL START...
2025-12-12 20:09:21,161 - INFO -     -> Config: BS=32, Seq=50, In=4
2025-12-12 20:09:21,161 - INFO -     -> Model: Hidden=64, Layers=2
2025-12-12 20:09:21,161 - INFO -     -> Device: cpu
2025-12-12 20:09:28,573 - INFO - Epoch 1/1000 | Loss: 1.7