In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc, f1_score
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# Create the output folders
def create_directories():
    """Create the ``combined_output`` folder tree and return its paths.

    Returns:
        dict: maps each output category (``'combined_data'``,
        ``'predictions'``, ``'feature_importance'``, ``'learning_curves'``,
        ``'confusion_matrices'``, ``'results'``) to the path of its folder
        under ``combined_output``.
    """
    root = 'combined_output'
    categories = (
        'combined_data',
        'predictions',
        'feature_importance',
        'learning_curves',
        'confusion_matrices',
        'results',
    )
    layout = {category: os.path.join(root, category) for category in categories}
    for folder in layout.values():
        # exist_ok=True keeps repeated runs from failing on existing folders.
        os.makedirs(folder, exist_ok=True)
    return layout

# Data preparation
def prepare_data(source_csv='animal_condition.csv',
                 output_dir=os.path.join('combined_output', 'combined_data')):
    """Clean the raw data, split it, and build the resampled training sets.

    Rows whose ``Dangerous`` label is missing or is not ``Yes``/``No`` (after
    stripping and capitalising) are dropped. The categorical feature columns
    are one-hot encoded, 20% of the rows are set aside as a hold-out set, a
    further 20% of the remainder as an imbalanced test set, and each combined
    sampler is applied to what is left. Every resampled frame is written to
    ``<output_dir>/combined_<name>.csv``.

    Args:
        source_csv: path of the raw CSV file to read.
        output_dir: folder that receives the resampled CSV files; it is
            created when missing.

    Returns:
        tuple: ``(combined_data, X_holdout, y_holdout, X_test_imbalanced,
        y_test_imbalanced, feature_columns, data)`` where ``combined_data``
        maps sampler name to the resampled DataFrame (features plus a
        ``Dangerous`` column holding ``'Yes'``/``'No'``), ``feature_columns``
        is the one-hot column index and ``data`` is the cleaned input frame.
    """
    data = pd.read_csv(source_csv)
    print("Missing values in Dangerous:", data['Dangerous'].isnull().sum())
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered view of the frame that was read.
    data = data.dropna(subset=['Dangerous']).copy()
    data['Dangerous'] = data['Dangerous'].str.strip().str.capitalize()
    valid_values = ['Yes', 'No']
    data = data[data['Dangerous'].isin(valid_values)].copy()
    print("Original class distribution:\n", data['Dangerous'].value_counts())

    feature_cols = ['AnimalName', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']
    data[feature_cols] = data[feature_cols].fillna('Unknown')
    # get_dummies can return boolean columns (pandas >= 2.0); cast to int so
    # the samplers that interpolate between rows receive numeric features.
    X = pd.get_dummies(data[feature_cols], columns=feature_cols).astype(int)
    y = data['Dangerous'].map({'Yes': 1, 'No': 0})

    # Hold-out set (20% of all rows)
    X_orig, X_holdout, y_orig, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)
    # Imbalanced test set (20% of the remaining rows)
    X_train_val, X_test_imbalanced, y_train_val, y_test_imbalanced = train_test_split(
        X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig)

    # Combined over-/under-sampling techniques
    combined_samplers = {
        'smote_tomek': SMOTETomek(
            smote=SMOTE(sampling_strategy=0.5, random_state=42),
            tomek=TomekLinks(sampling_strategy='majority'),
            random_state=42),
        'random_over_under': Pipeline([
            ('oversample', RandomOverSampler(sampling_strategy=0.5, random_state=42)),
            ('undersample', RandomUnderSampler(sampling_strategy=1.0, random_state=42))
        ]),
        'adasyn_nearmiss': Pipeline([
            ('oversample', ADASYN(sampling_strategy=0.5, random_state=42)),
            ('undersample', NearMiss(sampling_strategy=1.0, version=1))
        ])
    }

    # The function must not depend on another cell having created the folder.
    os.makedirs(output_dir, exist_ok=True)

    combined_data = {}
    for name, sampler in combined_samplers.items():
        print(f"\nApplying {name}...")
        # Only the training portion is resampled; the hold-out and test rows
        # stay untouched.
        X_res, y_res = sampler.fit_resample(X_train_val, y_train_val)
        df = pd.DataFrame(X_res, columns=X.columns)
        df['Dangerous'] = y_res.map({1: 'Yes', 0: 'No'})
        # to_csv overwrites an existing file, so no explicit delete is needed.
        csv_path = os.path.join(output_dir, f'combined_{name}.csv')
        df.to_csv(csv_path, index=False)
        print(f"Class distribution after {name}:\n", df['Dangerous'].value_counts())
        combined_data[name] = df

    return combined_data, X_holdout, y_holdout, X_test_imbalanced, y_test_imbalanced, X.columns, data

# Create the output folders
output_dirs = create_directories()

# Categorical feature columns of the raw file. Bound at module level because
# the evaluation code further down uses `feature_cols` whichever branch below
# runs; binding it only inside the cache branch caused a NameError on a fresh
# kernel when the cached CSV files did not exist yet.
feature_cols = ['AnimalName', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']

# Cached resampled datasets, one CSV per sampler
combined_files = {
    'smote_tomek': os.path.join(output_dirs['combined_data'], 'combined_smote_tomek.csv'),
    'random_over_under': os.path.join(output_dirs['combined_data'], 'combined_random_over_under.csv'),
    'adasyn_nearmiss': os.path.join(output_dirs['combined_data'], 'combined_adasyn_nearmiss.csv')
}
combined_data = {}
X_holdout, y_holdout = None, None
X_test_imbalanced, y_test_imbalanced = None, None
feature_columns = None
original_data = None

if all(os.path.exists(file) for file in combined_files.values()):
    print("Loading existing combined datasets...")
    # Rebuild the cleaned original frame and its one-hot encoding once; this
    # does not depend on the sampler, so it stays outside the loop.
    original_data = pd.read_csv('animal_condition.csv')
    original_data = original_data.dropna(subset=['Dangerous']).copy()
    original_data['Dangerous'] = original_data['Dangerous'].str.strip().str.capitalize()
    original_data = original_data[original_data['Dangerous'].isin(['Yes', 'No'])].copy()
    original_data[feature_cols] = original_data[feature_cols].fillna('Unknown')
    X_orig_full = pd.get_dummies(original_data[feature_cols], columns=feature_cols)
    y_orig_full = original_data['Dangerous'].map({'Yes': 1, 'No': 0})

    for name, file in combined_files.items():
        cached = pd.read_csv(file)
        # The cached frame is already one-hot encoded; reindex so its columns
        # match the original encoding (columns absent from the cache become 0).
        data = cached.reindex(columns=X_orig_full.columns, fill_value=0)
        # reindex dropped the label column, so restore it from the same read.
        data['Dangerous'] = cached['Dangerous']
        combined_data[name] = data
        print(f"Class distribution after loading {name}:\n", data['Dangerous'].value_counts())

    # Same seeds and proportions as prepare_data(), so the splits are reproduced.
    X_orig, X_holdout, y_orig, y_holdout = train_test_split(
        X_orig_full, y_orig_full, test_size=0.2, random_state=42, stratify=y_orig_full)
    X_train_val, X_test_imbalanced, y_train_val, y_test_imbalanced = train_test_split(
        X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig)
    feature_columns = X_orig.columns
else:
    combined_data, X_holdout, y_holdout, X_test_imbalanced, y_test_imbalanced, feature_columns, original_data = prepare_data()

# Model definitions: one shallow, regularised estimator per family.
models = {}
models['random_forest'] = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_split=15,
    class_weight='balanced',
    random_state=42,
)
models['logistic_regression'] = LogisticRegression(
    C=0.005,
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)
# NOTE(review): 849/20 looks like a hard-coded class-count ratio from the
# original file, while the models are fitted on resampled data. Confirm the
# direction (negative/positive vs. positive/negative) and whether a fixed
# weight is still wanted.
models['xgboost'] = XGBClassifier(
    max_depth=3,
    reg_lambda=3,
    alpha=1,
    scale_pos_weight=849/20,
    eval_metric='logloss',
    random_state=42,
)

# Training and evaluation
results = []
learning_curve_data = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X_orig_full = pd.get_dummies(original_data[feature_cols], columns=feature_cols)
y_orig_full = original_data['Dangerous'].map({'Yes': 1, 'No': 0})
scaler_orig = StandardScaler()
X_orig_scaled = scaler_orig.fit_transform(X_orig_full)

for sampler_name, data in combined_data.items():
    print(f"\nProcessing {sampler_name} dataset...")
    X = data.drop('Dangerous', axis=1)
    y = data['Dangerous'].map({'Yes': 1, 'No': 0})
    # NOTE(review): the scaler is fitted before the train/test split, so the
    # test rows contribute to the scaling statistics; consider fitting it on
    # X_train only.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
    # The hold-out transform depends only on this dataset's scaler, so it is
    # computed once per dataset rather than once per model.
    X_holdout_scaled = scaler.transform(X_holdout)

    for model_name, model in models.items():
        print(f"Training {model_name} on {sampler_name}...")

        cv_f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1_macro')
        cv_roc_auc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')
        cv_f1_orig = cross_val_score(model, X_orig_scaled, y_orig_full, cv=skf, scoring='f1_macro')
        cv_roc_auc_orig = cross_val_score(model, X_orig_scaled, y_orig_full, cv=skf, scoring='roc_auc')

        model.fit(X_train, y_train)
        has_proba = hasattr(model, "predict_proba")

        y_test_pred = model.predict(X_test)
        y_test_scores = model.predict_proba(X_test)[:, 1] if has_proba else y_test_pred

        test_f1 = f1_score(y_test, y_test_pred, average='macro')
        # ROC AUC is a ranking metric: score it on the class-1 probabilities,
        # not on the thresholded 0/1 predictions.
        test_roc_auc = roc_auc_score(y_test, y_test_scores)
        precision, recall, _ = precision_recall_curve(y_test, y_test_scores)
        pr_auc = auc(recall, precision)

        # Confusion matrix on the resampled test split
        cm = confusion_matrix(y_test, y_test_pred)

        y_holdout_pred = model.predict(X_holdout_scaled)
        y_holdout_scores = model.predict_proba(X_holdout_scaled)[:, 1] if has_proba else y_holdout_pred
        holdout_f1 = f1_score(y_holdout, y_holdout_pred, average='macro')
        holdout_roc_auc = roc_auc_score(y_holdout, y_holdout_scores)

        # Feature importances (tree-based models only). The file is written to
        # the working directory; the relocation loop after this one moves it
        # into output_dirs['feature_importance'].
        if model_name in ['random_forest', 'xgboost']:
            feature_importance = pd.DataFrame({
                'Feature': feature_columns,
                'Importance': model.feature_importances_
            }).sort_values(by='Importance', ascending=False)
            importance_file = f'feature_importance_{sampler_name}_{model_name}.csv'
            feature_importance.to_csv(importance_file, index=False)

        # Predictions on the resampled test split (also relocated afterwards)
        pred_df = pd.DataFrame({
            'true_label': y_test,
            'predicted_label': y_test_pred,
            'proba_yes': y_test_scores
        })
        pred_file = f'predictions_{sampler_name}_{model_name}.csv'
        pred_df.to_csv(pred_file, index=False)

        result = {
            'sampler': sampler_name,
            'model': model_name,
            'cv_f1_mean': cv_f1.mean(),
            'cv_f1_orig': cv_f1_orig.mean(),
            'test_f1': test_f1,
            'test_roc_auc': test_roc_auc,
            'pr_auc': pr_auc,
            'holdout_f1': holdout_f1,
            'holdout_roc_auc': holdout_roc_auc,
            'confusion_matrices': cm.tolist(),  # stored as nested lists
            # Previously computed but never recorded; appended last so the
            # existing columns keep their positions.
            'cv_roc_auc_mean': cv_roc_auc.mean(),
            'cv_roc_auc_orig': cv_roc_auc_orig.mean()
        }
        results.append(result)

        train_sizes, train_scores, valid_scores = learning_curve(
            model, X_scaled, y, cv=skf, scoring='f1_macro', train_sizes=np.linspace(0.1, 1.0, 10))
        # Average over the CV folds once and reuse for the table and the plot.
        train_f1_mean = train_scores.mean(axis=1)
        valid_f1_mean = valid_scores.mean(axis=1)
        for size, tr_score, val_score in zip(train_sizes, train_f1_mean, valid_f1_mean):
            learning_curve_data.append({
                'sampler': sampler_name,
                'model': model_name,
                'train_size': size,
                'train_f1': tr_score,
                'valid_f1': val_score
            })

        # Learning-curve figure (written to the working directory, relocated afterwards)
        curve_file = f'learning_curve_{sampler_name}_{model_name}.png'
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(train_sizes, train_f1_mean, label='Train F1')
        ax.plot(train_sizes, valid_f1_mean, label='Validation F1')
        ax.set_title(f'Learning Curve: {sampler_name} - {model_name}')
        ax.set_xlabel('Training Size')
        ax.set_ylabel('F1 Score (Macro)')
        ax.legend()
        ax.grid(True)
        fig.savefig(curve_file)
        plt.close(fig)

# Move the output files into their folders and draw the confusion matrices
def _relocate_output(filename, destination_dir, expected=True):
    """Move ``filename`` from the working directory into ``destination_dir``.

    Args:
        filename: name of the file in the current working directory.
        destination_dir: folder that receives the file under the same name.
        expected: when False, a missing file is skipped without a warning.
    """
    if os.path.exists(filename):
        # os.replace overwrites an existing destination in a single step,
        # so no separate delete is needed.
        os.replace(filename, os.path.join(destination_dir, filename))
    elif expected:
        print(f"Warning: {filename} does not exist and cannot be moved.")


for res in results:
    # Distinct names so the fitted estimator bound to `model` is not replaced
    # by a string.
    sampler_label = res['sampler']
    model_label = res['model']

    # Predictions
    _relocate_output(f'predictions_{sampler_label}_{model_label}.csv', output_dirs['predictions'])

    # Feature importance: only estimators exposing feature_importances_ have
    # such a file, so the others must not trigger a warning.
    has_importances = hasattr(models.get(model_label), 'feature_importances_')
    _relocate_output(f'feature_importance_{sampler_label}_{model_label}.csv',
                     output_dirs['feature_importance'], expected=has_importances)

    # Learning curve
    _relocate_output(f'learning_curve_{sampler_label}_{model_label}.png', output_dirs['learning_curves'])

    # Confusion-matrix figure (savefig overwrites an existing file)
    cm = np.array(res['confusion_matrices'])
    cm_filename = f'conf_matrix_{sampler_label}_{model_label}.png'
    target_path = os.path.join(output_dirs['confusion_matrices'], cm_filename)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Pred: No', 'Pred: Yes'],
                yticklabels=['Actual: No', 'Actual: Yes'],
                ax=ax)
    ax.set_title(f'Confusion Matrix\n{sampler_label} + {model_label}')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    fig.tight_layout()
    fig.savefig(target_path)
    plt.close(fig)

# Save the final result tables into the results folder
results_df = pd.DataFrame(results)
results_csv_path = os.path.join(output_dirs['results'], 'classification_results.csv')
results_json_path = os.path.join(output_dirs['results'], 'classification_results.json')
learning_curve_path = os.path.join(output_dirs['results'], 'learning_curve_results.csv')

# Clear out the files of a previous run before writing the new ones.
for _stale_path in (results_csv_path, results_json_path, learning_curve_path):
    if os.path.exists(_stale_path):
        os.remove(_stale_path)

# One row per sampler/model pair; the JSON file holds one record per line.
results_df.to_csv(results_csv_path, index=False)
results_df.to_json(results_json_path, orient='records', lines=True)

# One row per sampler/model/training-size combination.
learning_curve_df = pd.DataFrame(learning_curve_data)
learning_curve_df.to_csv(learning_curve_path, index=False)

print("\n📁 Minden fájl sikeresen elmentve strukturált mappákba! 😎")

KeyboardInterrupt: 

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc, f1_score
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings
warnings.filterwarnings('ignore')

# Create the output folders
def create_directories():
    """Ensure every output folder exists and return a name -> path mapping.

    Returns:
        dict: keys ``'combined_data'``, ``'predictions'``,
        ``'feature_importance'``, ``'learning_curves'``,
        ``'confusion_matrices'`` and ``'results'``, each mapped to its folder
        path below ``combined_output``.
    """
    parent = 'combined_output'
    folder_names = [
        'combined_data',
        'predictions',
        'feature_importance',
        'learning_curves',
        'confusion_matrices',
        'results',
    ]
    paths = dict(zip(folder_names, (os.path.join(parent, name) for name in folder_names)))
    for path in paths.values():
        # Safe to call repeatedly: existing folders are left alone.
        os.makedirs(path, exist_ok=True)
    return paths

# Data preparation
def prepare_data(source_csv='animal_condition_processed.csv',
                 output_dir=os.path.join('combined_output', 'combined_data')):
    """Clean the preprocessed data, split it, and build the resampled training sets.

    Rows whose ``Dangerous`` label is missing or is not ``Yes``/``No`` (after
    stripping and capitalising) are dropped. The categorical feature columns
    are one-hot encoded as integers, 20% of the rows are set aside as a
    hold-out set, a further 20% of the remainder as an imbalanced test set,
    and each combined sampler is applied to what is left. Every resampled
    frame is written to ``<output_dir>/combined_<name>.csv``.

    Args:
        source_csv: path of the preprocessed CSV file to read.
        output_dir: folder that receives the resampled CSV files; it is
            created when missing.

    Returns:
        tuple: ``(combined_data, X_holdout, y_holdout, X_test_imbalanced,
        y_test_imbalanced, feature_columns, data)`` where ``combined_data``
        maps sampler name to the resampled DataFrame (features plus a
        ``Dangerous`` column holding ``'Yes'``/``'No'``), ``feature_columns``
        is the one-hot column index and ``data`` is the cleaned input frame.
    """
    # Load the preprocessed data file
    data = pd.read_csv(source_csv)
    print("Missing values in Dangerous:", data['Dangerous'].isnull().sum())
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered view of the frame that was read.
    data = data.dropna(subset=['Dangerous']).copy()
    data['Dangerous'] = data['Dangerous'].str.strip().str.capitalize()
    valid_values = ['Yes', 'No']
    data = data[data['Dangerous'].isin(valid_values)].copy()
    print("Original class distribution:\n", data['Dangerous'].value_counts())

    feature_cols = ['AnimalGroup', 'Animal', 'Species', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']
    data[feature_cols] = data[feature_cols].fillna('unknown')
    X = pd.get_dummies(data[feature_cols], columns=feature_cols).astype(int)
    y = data['Dangerous'].map({'Yes': 1, 'No': 0})

    # Hold-out set (20% of all rows)
    X_orig, X_holdout, y_orig, y_holdout = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)
    # Imbalanced test set (20% of the remaining rows)
    X_train_val, X_test_imbalanced, y_train_val, y_test_imbalanced = train_test_split(
        X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig)

    # Combined over-/under-sampling techniques
    combined_samplers = {
        'smote_tomek': SMOTETomek(
            smote=SMOTE(sampling_strategy=0.5, random_state=42),
            tomek=TomekLinks(sampling_strategy='majority'),
            random_state=42),
        'random_over_under': Pipeline([
            ('oversample', RandomOverSampler(sampling_strategy=0.5, random_state=42)),
            ('undersample', RandomUnderSampler(sampling_strategy=1.0, random_state=42))
        ]),
        'adasyn_nearmiss': Pipeline([
            ('oversample', ADASYN(sampling_strategy=0.5, random_state=42)),
            ('undersample', NearMiss(sampling_strategy=1.0, version=1))
        ])
    }

    # The function must not depend on another cell having created the folder.
    os.makedirs(output_dir, exist_ok=True)

    combined_data = {}
    for name, sampler in combined_samplers.items():
        print(f"\nApplying {name}...")
        # Only the training portion is resampled; the hold-out and test rows
        # stay untouched.
        X_res, y_res = sampler.fit_resample(X_train_val, y_train_val)
        df = pd.DataFrame(X_res, columns=X.columns)
        df['Dangerous'] = y_res.map({1: 'Yes', 0: 'No'})
        # to_csv overwrites an existing file, so no explicit delete is needed.
        csv_path = os.path.join(output_dir, f'combined_{name}.csv')
        df.to_csv(csv_path, index=False)
        print(f"Class distribution after {name}:\n", df['Dangerous'].value_counts())
        combined_data[name] = df

    return combined_data, X_holdout, y_holdout, X_test_imbalanced, y_test_imbalanced, X.columns, data

# Create the output folders
output_dirs = create_directories()

# Categorical feature columns of the preprocessed file. Bound at module level
# because the evaluation code further down uses `feature_cols` whichever
# branch below runs; binding it only inside the cache branch made the cell
# depend on leftover kernel state when the cached CSV files did not exist.
feature_cols = ['AnimalGroup', 'Animal', 'Species', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']

# Cached resampled datasets, one CSV per sampler
combined_files = {
    'smote_tomek': os.path.join(output_dirs['combined_data'], 'combined_smote_tomek.csv'),
    'random_over_under': os.path.join(output_dirs['combined_data'], 'combined_random_over_under.csv'),
    'adasyn_nearmiss': os.path.join(output_dirs['combined_data'], 'combined_adasyn_nearmiss.csv')
}
combined_data = {}
X_holdout, y_holdout = None, None
X_test_imbalanced, y_test_imbalanced = None, None
feature_columns = None
original_data = None

if all(os.path.exists(file) for file in combined_files.values()):
    print("Loading existing combined datasets...")
    # Rebuild the cleaned original frame and its one-hot encoding once; this
    # does not depend on the sampler, so it stays outside the loop.
    original_data = pd.read_csv('animal_condition_processed.csv')
    original_data = original_data.dropna(subset=['Dangerous']).copy()
    original_data['Dangerous'] = original_data['Dangerous'].str.strip().str.capitalize()
    original_data = original_data[original_data['Dangerous'].isin(['Yes', 'No'])].copy()
    original_data[feature_cols] = original_data[feature_cols].fillna('unknown')
    X_orig_full = pd.get_dummies(original_data[feature_cols], columns=feature_cols).astype(int)
    y_orig_full = original_data['Dangerous'].map({'Yes': 1, 'No': 0})

    for name, file in combined_files.items():
        cached = pd.read_csv(file)
        # The cached frame is already one-hot encoded; reindex so its columns
        # match the original encoding (columns absent from the cache become 0).
        data = cached.reindex(columns=X_orig_full.columns, fill_value=0)
        # reindex dropped the label column, so restore it from the same read.
        data['Dangerous'] = cached['Dangerous']
        combined_data[name] = data
        print(f"Class distribution after loading {name}:\n", data['Dangerous'].value_counts())

    # Same seeds and proportions as prepare_data(), so the splits are reproduced.
    X_orig, X_holdout, y_orig, y_holdout = train_test_split(
        X_orig_full, y_orig_full, test_size=0.2, random_state=42, stratify=y_orig_full)
    X_train_val, X_test_imbalanced, y_train_val, y_test_imbalanced = train_test_split(
        X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig)
    feature_columns = X_orig.columns
else:
    combined_data, X_holdout, y_holdout, X_test_imbalanced, y_test_imbalanced, feature_columns, original_data = prepare_data()

# Model definitions: shallow, class-weighted estimators keyed by a short name.
models = {}
models['random_forest'] = RandomForestClassifier(
    n_estimators=100,
    max_depth=3,
    min_samples_split=15,
    class_weight='balanced',
    random_state=42,
)
models['logistic_regression'] = LogisticRegression(
    C=0.005,
    max_iter=1000,
    class_weight='balanced',
    random_state=42,
)
models['decision_tree'] = DecisionTreeClassifier(
    max_depth=3,
    min_samples_split=15,
    class_weight='balanced',
    random_state=42,
)

# Training and evaluation
results = []
learning_curve_data = []
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X_orig_full = pd.get_dummies(original_data[feature_cols], columns=feature_cols).astype(int)
y_orig_full = original_data['Dangerous'].map({'Yes': 1, 'No': 0})
scaler_orig = StandardScaler()
X_orig_scaled = scaler_orig.fit_transform(X_orig_full)

for sampler_name, data in combined_data.items():
    print(f"\nProcessing {sampler_name} dataset...")
    X = data.drop('Dangerous', axis=1)
    y = data['Dangerous'].map({'Yes': 1, 'No': 0})
    # NOTE(review): the scaler is fitted before the train/test split, so the
    # test rows contribute to the scaling statistics; consider fitting it on
    # X_train only.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
    # The hold-out transform depends only on this dataset's scaler, so it is
    # computed once per dataset rather than once per model.
    X_holdout_scaled = scaler.transform(X_holdout)

    for model_name, model in models.items():
        print(f"Training {model_name} on {sampler_name}...")

        cv_f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1_macro')
        cv_roc_auc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')
        cv_f1_orig = cross_val_score(model, X_orig_scaled, y_orig_full, cv=skf, scoring='f1_macro')
        cv_roc_auc_orig = cross_val_score(model, X_orig_scaled, y_orig_full, cv=skf, scoring='roc_auc')

        model.fit(X_train, y_train)
        has_proba = hasattr(model, "predict_proba")

        y_test_pred = model.predict(X_test)
        y_test_scores = model.predict_proba(X_test)[:, 1] if has_proba else y_test_pred

        test_f1 = f1_score(y_test, y_test_pred, average='macro')
        # ROC AUC is a ranking metric: score it on the class-1 probabilities,
        # not on the thresholded 0/1 predictions.
        test_roc_auc = roc_auc_score(y_test, y_test_scores)
        precision, recall, _ = precision_recall_curve(y_test, y_test_scores)
        pr_auc = auc(recall, precision)

        # Confusion matrix on the resampled test split
        cm = confusion_matrix(y_test, y_test_pred)

        y_holdout_pred = model.predict(X_holdout_scaled)
        y_holdout_scores = model.predict_proba(X_holdout_scaled)[:, 1] if has_proba else y_holdout_pred
        holdout_f1 = f1_score(y_holdout, y_holdout_pred, average='macro')
        holdout_roc_auc = roc_auc_score(y_holdout, y_holdout_scores)

        # Feature importances (tree-based models only); to_csv overwrites an
        # existing file, so no explicit delete is needed.
        if model_name in ['random_forest', 'decision_tree']:
            feature_importance = pd.DataFrame({
                'Feature': feature_columns,
                'Importance': model.feature_importances_
            }).sort_values(by='Importance', ascending=False)
            importance_file = f'feature_importance_{sampler_name}_{model_name}.csv'
            target_path = os.path.join(output_dirs['feature_importance'], importance_file)
            feature_importance.to_csv(target_path, index=False)

        # Predictions on the resampled test split
        pred_df = pd.DataFrame({
            'true_label': y_test,
            'predicted_label': y_test_pred,
            'proba_yes': y_test_scores
        })
        pred_file = f'predictions_{sampler_name}_{model_name}.csv'
        target_path = os.path.join(output_dirs['predictions'], pred_file)
        pred_df.to_csv(target_path, index=False)

        result = {
            'sampler': sampler_name,
            'model': model_name,
            'cv_f1_mean': cv_f1.mean(),
            'cv_f1_orig': cv_f1_orig.mean(),
            'test_f1': test_f1,
            'test_roc_auc': test_roc_auc,
            'pr_auc': pr_auc,
            'holdout_f1': holdout_f1,
            'holdout_roc_auc': holdout_roc_auc,
            'confusion_matrices': cm.tolist(),
            # Previously computed but never recorded; appended last so the
            # existing columns keep their positions.
            'cv_roc_auc_mean': cv_roc_auc.mean(),
            'cv_roc_auc_orig': cv_roc_auc_orig.mean()
        }
        results.append(result)

        train_sizes, train_scores, valid_scores = learning_curve(
            model, X_scaled, y, cv=skf, scoring='f1_macro', train_sizes=np.linspace(0.1, 1.0, 10))
        # Average over the CV folds once and reuse for the table and the plot.
        train_f1_mean = train_scores.mean(axis=1)
        valid_f1_mean = valid_scores.mean(axis=1)
        for size, tr_score, val_score in zip(train_sizes, train_f1_mean, valid_f1_mean):
            learning_curve_data.append({
                'sampler': sampler_name,
                'model': model_name,
                'train_size': size,
                'train_f1': tr_score,
                'valid_f1': val_score
            })

        # Learning-curve figure (savefig overwrites an existing file)
        curve_file = f'learning_curve_{sampler_name}_{model_name}.png'
        target_path = os.path.join(output_dirs['learning_curves'], curve_file)
        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(train_sizes, train_f1_mean, label='Train F1')
        ax.plot(train_sizes, valid_f1_mean, label='Validation F1')
        ax.set_title(f'Learning Curve: {sampler_name} - {model_name}')
        ax.set_xlabel('Training Size')
        ax.set_ylabel('F1 Score (Macro)')
        ax.legend()
        ax.grid(True)
        fig.savefig(target_path)
        plt.close(fig)

# Confusion-matrix figures.
# The evaluation loop in this cell already writes predictions, feature
# importances and learning curves straight into their output folders. The
# former "move from the working directory" steps therefore only printed a
# warning per file, and would have replaced a fresh output with a stale
# same-named file if one happened to be left in the working directory. They
# are intentionally not performed here.
for res in results:
    # Distinct names so the fitted estimator bound to `model` is not replaced
    # by a string.
    sampler_label = res['sampler']
    model_label = res['model']

    cm = np.array(res['confusion_matrices'])
    cm_filename = f'conf_matrix_{sampler_label}_{model_label}.png'
    target_path = os.path.join(output_dirs['confusion_matrices'], cm_filename)
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Pred: No', 'Pred: Yes'],
                yticklabels=['Actual: No', 'Actual: Yes'],
                ax=ax)
    ax.set_title(f'Confusion Matrix\n{sampler_label} + {model_label}')
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    fig.tight_layout()
    # savefig overwrites an existing file, so no explicit delete is needed.
    fig.savefig(target_path)
    plt.close(fig)

# Save the final result tables into the results folder
results_df = pd.DataFrame(results)
results_csv_path = os.path.join(output_dirs['results'], 'classification_results.csv')
results_json_path = os.path.join(output_dirs['results'], 'classification_results.json')
learning_curve_path = os.path.join(output_dirs['results'], 'learning_curve_results.csv')

# Clear out the files of a previous run before writing the new ones.
for _old_file in filter(os.path.exists, (results_csv_path, results_json_path, learning_curve_path)):
    os.remove(_old_file)

# One row per sampler/model pair; the JSON file holds one record per line.
results_df.to_csv(results_csv_path, index=False)
results_df.to_json(results_json_path, orient='records', lines=True)

# One row per sampler/model/training-size combination.
learning_curve_df = pd.DataFrame(learning_curve_data)
learning_curve_df.to_csv(learning_curve_path, index=False)

print("\n📁 Minden fájl sikeresen elmentve strukturált mappákba! 😎")

Missing values in Dangerous: 0
Original class distribution:
 Dangerous
Yes    818
No      20
Name: count, dtype: int64

Applying smote_tomek...
Class distribution after smote_tomek:
 Dangerous
Yes    523
No     261
Name: count, dtype: int64

Applying random_over_under...
Class distribution after random_over_under:
 Dangerous
No     261
Yes    261
Name: count, dtype: int64

Applying adasyn_nearmiss...
Class distribution after adasyn_nearmiss:
 Dangerous
No     260
Yes    260
Name: count, dtype: int64

Processing smote_tomek dataset...
Training random_forest on smote_tomek...
Training logistic_regression on smote_tomek...
Training decision_tree on smote_tomek...

Processing random_over_under dataset...
Training random_forest on random_over_under...
Training logistic_regression on random_over_under...
Training decision_tree on random_over_under...

Processing adasyn_nearmiss dataset...
Training random_forest on adasyn_nearmiss...
Training logistic_regression on adasyn_nearmiss...
Training

In [None]:
# 📦 Alap könyvtárak
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# 📈 Modellek és skálázás
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve, auc, confusion_matrix

# ⚖️ Imbalanced data kezelés
from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler, TomekLinks, NearMiss
from imblearn.combine import SMOTETomek
from imblearn.pipeline import Pipeline

# 🌳 XGBoost
from xgboost import XGBClassifier

# 🔕 Warnings kikapcsolása
warnings.filterwarnings("ignore")


Loading existing combined datasets...
Class distribution after loading smote_tomek:
 Dangerous
Yes    543
No     271
Name: count, dtype: int64
Class distribution after loading random_over_under:
 Dangerous
No     271
Yes    271
Name: count, dtype: int64
Class distribution after loading adasyn_nearmiss:
 Dangerous
No     267
Yes    267
Name: count, dtype: int64

Processing smote_tomek dataset...
Training random_forest on smote_tomek with F1 Early Stopping...
Epoch 1: Validation F1 = 0.4899
Epoch 2: Validation F1 = 0.6902
Epoch 3: Validation F1 = 0.7493
Epoch 4: Validation F1 = 0.8740
Epoch 5: Validation F1 = 0.8565
Epoch 6: Validation F1 = 0.8905
Epoch 7: Validation F1 = 0.9196
Epoch 8: Validation F1 = 0.9290
Epoch 9: Validation F1 = 0.9290
Epoch 10: Validation F1 = 0.9290
Epoch 11: Validation F1 = 0.9290
Epoch 12: Validation F1 = 0.9100
Epoch 13: Validation F1 = 0.9196
Early stopping triggered after 13 trees.


AttributeError: 'F1EarlyStopping' object has no attribute 'best_model'