## Random Oversampling

In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
import warnings
warnings.filterwarnings('ignore')

# Adatok bet√∂lt√©se
data = pd.read_csv('animal_condition.csv')

# Hi√°nyz√≥ √©rt√©kek kezel√©se
print("Missing values in Dangerous:", data['Dangerous'].isnull().sum())
data = data.dropna(subset=['Dangerous'])
data['Dangerous'] = data['Dangerous'].str.strip().str.capitalize()
valid_values = ['Yes', 'No']
data = data[data['Dangerous'].isin(valid_values)]
print("Original class distribution:\n", data['Dangerous'].value_counts())

# Jellemz≈ëk √©s c√©lv√°ltoz√≥
feature_cols = ['AnimalName', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']
if data[feature_cols].isnull().sum().any():
    data[feature_cols] = data[feature_cols].fillna('Unknown')

# Eredeti kategorikus √©rt√©kek ment√©se
data_original = data.copy()

# Kategorikus jellemz≈ëk k√≥dol√°sa
le_dict = {}
for col in feature_cols:
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))
    le_dict[col] = le
le_dangerous = LabelEncoder()
data['Dangerous'] = le_dangerous.fit_transform(data['Dangerous'].map({'Yes': 1, 'No': 0}))

# Jellemz≈ëk √©s c√©lv√°ltoz√≥ sz√©tv√°laszt√°sa
X = data[feature_cols]
y = data['Dangerous']

# T√∫lmintav√©telez√©si technik√°k
oversamplers = {
    'random_oversampling': RandomOverSampler(random_state=42),
    'smote': SMOTE(random_state=42),
    'adasyn': ADASYN(random_state=42)
}

# Eredm√©nyek t√°rol√°sa
for name, oversampler in oversamplers.items():
    print(f"\nApplying {name}...")
    X_resampled, y_resampled = oversampler.fit_resample(X, y)
    
    # Visszaalak√≠t√°s kategorikus √©rt√©kekre
    resampled_data = pd.DataFrame(X_resampled, columns=feature_cols)
    for col in feature_cols:
        resampled_data[col] = le_dict[col].inverse_transform(resampled_data[col].astype(int))
    resampled_data['Dangerous'] = le_dangerous.inverse_transform(y_resampled)
    resampled_data['Dangerous'] = resampled_data['Dangerous'].map({1: 'Yes', 0: 'No'})
    
    # Oszt√°lyeloszl√°s ki√≠r√°sa
    print(f"Class distribution after {name}:\n", resampled_data['Dangerous'].value_counts())
    
    # Ment√©s k√ºl√∂nb√∂z≈ë form√°tumokban
    resampled_data.to_csv(f'oversampled_{name}.csv', index=False)
    resampled_data.to_json(f'oversampled_{name}.json', orient='records', lines=True)
    resampled_data.to_parquet(f'oversampled_{name}.parquet', index=False)
    print(f"Saved oversampled data to oversampled_{name}.csv, .json, and .parquet")

Missing values in Dangerous: 2
Original class distribution:
 Dangerous
Yes    849
No      20
Name: count, dtype: int64

Applying random_oversampling...
Class distribution after random_oversampling:
 Dangerous
Yes    849
No     849
Name: count, dtype: int64
Saved oversampled data to oversampled_random_oversampling.csv, .json, and .parquet

Applying smote...
Class distribution after smote:
 Dangerous
Yes    849
No     849
Name: count, dtype: int64
Saved oversampled data to oversampled_smote.csv, .json, and .parquet

Applying adasyn...
Class distribution after adasyn:
 Dangerous
Yes    849
No     840
Name: count, dtype: int64
Saved oversampled data to oversampled_adasyn.csv, .json, and .parquet


In [14]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc, f1_score
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

# Adatok el≈ëk√©sz√≠t√©se
def prepare_data():
    data = pd.read_csv('animal_condition.csv')
    print("Missing values in Dangerous:", data['Dangerous'].isnull().sum())
    data = data.dropna(subset=['Dangerous'])
    data['Dangerous'] = data['Dangerous'].str.strip().str.capitalize()
    valid_values = ['Yes', 'No']
    data = data[data['Dangerous'].isin(valid_values)]
    print("Original class distribution:\n", data['Dangerous'].value_counts())

    feature_cols = ['AnimalName', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']
    data[feature_cols] = data[feature_cols].fillna('Unknown')

    # One-Hot Encoding
    X = pd.get_dummies(data[feature_cols], columns=feature_cols)
    y = data['Dangerous'].map({'Yes': 1, 'No': 0})
    
    # Hold-out set l√©trehoz√°sa (20%)
    X_orig, X_holdout, y_orig, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # K√ºl√∂n tesztk√©szlet az eredeti adaton (20%)
    X_train_val, X_test_imbalanced, y_train_val, y_test_imbalanced = train_test_split(X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig)
    
    oversamplers = {
        'random_oversampling': RandomOverSampler(sampling_strategy=200/679, random_state=42),  # 679 = 764 * 0.8
        'smote': SMOTE(sampling_strategy=200/679, random_state=42),
        'adasyn': ADASYN(sampling_strategy=200/679, random_state=42)
    }
    
    oversampled_data = {}
    for name, oversampler in oversamplers.items():
        X_resampled, y_resampled = oversampler.fit_resample(X_train_val, y_train_val)
        resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
        resampled_data['Dangerous'] = y_resampled.map({1: 'Yes', 0: 'No'})
        oversampled_data[name] = resampled_data
        resampled_data.to_csv(f'oversampled_{name}.csv', index=False)
        print(f"Class distribution after {name}:\n", resampled_data['Dangerous'].value_counts())
    return oversampled_data, X_holdout, y_holdout, X_test_imbalanced, y_test_imbalanced, X.columns, data

# Adatk√©szletek bet√∂lt√©se vagy l√©trehoz√°sa
oversampled_files = {
    'random_oversampling': 'oversampled_random_oversampling.csv',
    'smote': 'oversampled_smote.csv',
    'adasyn': 'oversampled_adasyn.csv'
}
oversampled_data = {}
X_holdout, y_holdout = None, None
X_test_imbalanced, y_test_imbalanced = None, None
feature_columns = None
original_data = None
if all(os.path.exists(file) for file in oversampled_files.values()):
    for name, file in oversampled_files.items():
        data = pd.read_csv(file)
        feature_cols = ['AnimalName', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']
        X = pd.get_dummies(data[feature_cols], columns=feature_cols)
        original_data = pd.read_csv('animal_condition.csv')
        original_data = original_data.dropna(subset=['Dangerous'])
        original_data['Dangerous'] = original_data['Dangerous'].str.strip().str.capitalize()
        original_data = original_data[original_data['Dangerous'].isin(['Yes', 'No'])]
        original_data[feature_cols] = original_data[feature_cols].fillna('Unknown')
        X_orig = pd.get_dummies(original_data[feature_cols], columns=feature_cols)
        X = X.reindex(columns=X_orig.columns, fill_value=0)
        data = X.copy()
        data['Dangerous'] = pd.read_csv(file)['Dangerous']
        oversampled_data[name] = data
    X_orig_full = X_orig
    y_orig_full = original_data['Dangerous'].map({'Yes': 1, 'No': 0})
    X_orig, X_holdout, y_orig, y_holdout = train_test_split(X_orig_full, y_orig_full, test_size=0.2, random_state=42, stratify=y_orig_full)
    X_train_val, X_test_imbalanced, y_train_val, y_test_imbalanced = train_test_split(X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig)
    feature_columns = X_orig.columns
else:
    oversampled_data, X_holdout, y_holdout, X_test_imbalanced, y_test_imbalanced, feature_columns, original_data = prepare_data()

# Modellek
models = {
    'random_forest': RandomForestClassifier(
        class_weight='balanced', random_state=42, n_estimators=100, max_depth=3, min_samples_split=15
    ),
    'logistic_regression': LogisticRegression(
        class_weight='balanced', random_state=42, max_iter=1000, C=0.005
    ),
    'xgboost': XGBClassifier(
        scale_pos_weight=849/20, random_state=42, eval_metric='logloss', max_depth=3, reg_lambda=3, alpha=1
    )
}

# Eredm√©nyek √©s predikci√≥k t√°rol√°sa
results = []
predictions = []
learning_curve_data = []

# Keresztvalid√°ci√≥ az eredeti adaton
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X_orig_full = pd.get_dummies(original_data[feature_cols], columns=feature_cols)
y_orig_full = original_data['Dangerous'].map({'Yes': 1, 'No': 0})
scaler_orig = StandardScaler()
X_orig_scaled = scaler_orig.fit_transform(X_orig_full)

# Keresztvalid√°ci√≥ √©s tesztel√©s
for oversampler_name, data in oversampled_data.items():
    print(f"\nProcessing {oversampler_name} dataset...")
    
    X = data.drop('Dangerous', axis=1)
    y = data['Dangerous'].map({'Yes': 1, 'No': 0})
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
    
    for model_name, model in models.items():
        print(f"Training {model_name} on {oversampler_name}...")
        
        # Keresztvalid√°ci√≥ a t√∫lmintav√©telezett adaton
        cv_f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1_macro')
        cv_roc_auc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')
        
        # Keresztvalid√°ci√≥ az eredeti imbalanced adaton
        cv_f1_orig = cross_val_score(model, X_orig_scaled, y_orig_full, cv=skf, scoring='f1_macro')
        cv_roc_auc_orig = cross_val_score(model, X_orig_scaled, y_orig_full, cv=skf, scoring='roc_auc')
        
        model.fit(X_train, y_train)
        
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        train_f1 = f1_score(y_train, y_train_pred, average='macro')
        test_f1 = f1_score(y_test, y_test_pred, average='macro')
        train_roc_auc = roc_auc_score(y_train, y_train_pred)
        test_roc_auc = roc_auc_score(y_test, y_test_pred)
        
        try:
            y_test_scores = model.predict_proba(X_test)[:, 1]
        except AttributeError:
            y_test_scores = model.predict(X_test)
        precision, recall, _ = precision_recall_curve(y_test, y_test_scores)
        pr_auc = auc(recall, precision)
        
        # Imbalanced tesztk√©szlet teljes√≠tm√©ny
        X_test_imbalanced_scaled = scaler.transform(X_test_imbalanced)
        y_test_imbalanced_pred = model.predict(X_test_imbalanced_scaled)
        test_imbalanced_f1 = f1_score(y_test_imbalanced, y_test_imbalanced_pred, average='macro')
        test_imbalanced_roc_auc = roc_auc_score(y_test_imbalanced, y_test_imbalanced_pred)
        test_imbalanced_report = classification_report(y_test_imbalanced, y_test_imbalanced_pred, target_names=['No', 'Yes'], output_dict=True)
        
        # Hold-out set teljes√≠tm√©ny
        X_holdout_scaled = scaler.transform(X_holdout)
        y_holdout_pred = model.predict(X_holdout_scaled)
        holdout_f1 = f1_score(y_holdout, y_holdout_pred, average='macro')
        holdout_roc_auc = roc_auc_score(y_holdout, y_holdout_pred)
        holdout_report = classification_report(y_holdout, y_holdout_pred, target_names=['No', 'Yes'], output_dict=True)
        
        f1_gap = train_f1 - test_f1
        roc_auc_gap = train_roc_auc - test_roc_auc
        
        report = classification_report(y_test, y_test_pred, target_names=['No', 'Yes'], output_dict=True)
        cm = confusion_matrix(y_test, y_test_pred)
        
        if model_name in ['random_forest', 'xgboost']:
            feature_importance = pd.DataFrame({
                'Feature': feature_columns,
                'Importance': model.feature_importances_
            }).sort_values(by='Importance', ascending=False)
            print(f"Feature Importance for {oversampler_name} - {model_name}:\n", feature_importance.head(10))
            feature_importance.to_csv(f'feature_importance_{oversampler_name}_{model_name}.csv', index=False)
        
        result = {
            'oversampler': oversampler_name,
            'model': model_name,
            'cv_f1_mean': cv_f1.mean(),
            'cv_f1_std': cv_f1.std(),
            'cv_roc_auc_mean': cv_roc_auc.mean(),
            'cv_roc_auc_std': cv_roc_auc.std(),
            'cv_f1_orig': cv_f1_orig.mean(),
            'cv_roc_auc_orig': cv_roc_auc_orig.mean(),
            'train_f1': train_f1,
            'test_f1': test_f1,
            'f1_gap': f1_gap,
            'train_roc_auc': train_roc_auc,
            'test_roc_auc': test_roc_auc,
            'roc_auc_gap': roc_auc_gap,
            'pr_auc': pr_auc,
            'no_precision': report['No']['precision'],
            'no_recall': report['No']['recall'],
            'no_f1': report['No']['f1-score'],
            'yes_precision': report['Yes']['precision'],
            'yes_recall': report['Yes']['recall'],
            'yes_f1': report['Yes']['f1-score'],
            'test_imbalanced_f1': test_imbalanced_f1,
            'test_imbalanced_roc_auc': test_imbalanced_roc_auc,
            'test_imbalanced_no_f1': test_imbalanced_report['No']['f1-score'],
            'holdout_f1': holdout_f1,
            'holdout_roc_auc': holdout_roc_auc,
            'holdout_no_f1': holdout_report['No']['f1-score'],
            'confusion_matrix': cm.tolist()
        }
        results.append(result)
        
        pred_df = pd.DataFrame({
            'true_label': y_test,
            'predicted_label': y_test_pred,
            'proba_yes': y_test_scores
        })
        pred_df.to_csv(f'predictions_{oversampler_name}_{model_name}.csv', index=False)
        
        train_sizes, train_scores, valid_scores = learning_curve(
            model, X_scaled, y, cv=skf, scoring='f1_macro', train_sizes=np.linspace(0.1, 1.0, 10)
        )
        for size, train_score, valid_score in zip(train_sizes, train_scores.mean(axis=1), valid_scores.mean(axis=1)):
            learning_curve_data.append({
                'oversampler': oversampler_name,
                'model': model_name,
                'train_size': size,
                'train_f1': train_score,
                'valid_f1': valid_score,
                'f1_gap': train_score - valid_score
            })
        
        plt.figure(figsize=(8, 6))
        plt.plot(train_sizes, train_scores.mean(axis=1), label='Train F1')
        plt.plot(train_sizes, valid_scores.mean(axis=1), label='Validation F1')
        plt.title(f'Learning Curve: {oversampler_name} - {model_name}')
        plt.xlabel('Training Size')
        plt.ylabel('F1 Score (Macro)')
        plt.legend()
        plt.grid(True)
        plt.savefig(f'learning_curve_{oversampler_name}_{model_name}.png')
        plt.close()

# Eredm√©nyek ment√©se
results_df = pd.DataFrame(results)
results_df.to_csv('classification_results.csv', index=False)
results_df.to_json('classification_results.json', orient='records', lines=True)
learning_curve_df = pd.DataFrame(learning_curve_data)
learning_curve_df.to_csv('learning_curve_results.csv', index=False)

# Eredm√©nyek ki√≠r√°sa
print("\nClassification Results with Holdout and Imbalanced Test:")
print(results_df[['oversampler', 'model', 'cv_f1_mean', 'cv_f1_orig', 'test_f1', 'f1_gap', 'test_roc_auc', 'roc_auc_gap', 'pr_auc', 'no_f1', 'test_imbalanced_f1', 'test_imbalanced_no_f1', 'holdout_f1', 'holdout_no_f1']])


Processing random_oversampling dataset...
Training random_forest on random_oversampling...
Feature Importance for random_oversampling - random_forest:
                                  Feature  Importance
546                   symptoms3_Diarrhea    0.052586
4                     AnimalName_Chicken    0.042808
819          symptoms4_Labored breathing    0.040455
472   symptoms2_Yellow or green dropping    0.039445
245                   symptoms1_Weakness    0.035639
1021         symptoms5_Greenish diarrhea    0.031373
651                symptoms3_Slow growth    0.030415
911                    symptoms4_Wattles    0.028413
916                symptoms4_Weight loss    0.027768
306                 symptoms2_Depression    0.027276
Training logistic_regression on random_oversampling...
Training xgboost on random_oversampling...
Feature Importance for random_oversampling - xgboost:
                         Feature  Importance
191   symptoms1_Severe swellimg    0.345018
4            AnimalName

In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, learning_curve
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve, auc, f1_score
from imblearn.over_sampling import RandomOverSampler, SMOTE, ADASYN
import matplotlib.pyplot as plt
import os
import warnings
warnings.filterwarnings('ignore')

# Glob√°lis feature_cols defin√≠ci√≥
feature_cols = ['AnimalGroup', 'Animal', 'Species', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5']

# Adatok el≈ëk√©sz√≠t√©se
def prepare_data():
    # Az el≈ëk√©sz√≠tett adatf√°jl bet√∂lt√©se
    data = pd.read_csv('animal_condition_processed_no_encoding.csv')
    print("Missing values in Dangerous:", data['Dangerous'].isnull().sum())
    data = data.dropna(subset=['Dangerous'])
    data['Dangerous'] = data['Dangerous'].str.strip().str.capitalize()
    valid_values = ['Yes', 'No']
    data = data[data['Dangerous'].isin(valid_values)]
    print("Original class distribution:\n", data['Dangerous'].value_counts())

    data[feature_cols] = data[feature_cols].fillna('unknown')

    # One-Hot Encoding √©s t√≠puskonverzi√≥ numerikusra
    X = pd.get_dummies(data[feature_cols], columns=feature_cols).astype(int)
    y = data['Dangerous'].map({'Yes': 1, 'No': 0})
    
    # Hold-out set l√©trehoz√°sa (20%)
    X_orig, X_holdout, y_orig, y_holdout = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # K√ºl√∂n tesztk√©szlet az eredeti adaton (20%)
    X_train_val, X_test_imbalanced, y_train_val, y_test_imbalanced = train_test_split(X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig)
    
    # Oversampling strat√©gi√°k defini√°l√°sa
    oversamplers = {
        'random_oversampling': RandomOverSampler(sampling_strategy='auto', random_state=42),
        'smote': SMOTE(sampling_strategy='auto', random_state=42),
        'adasyn': ADASYN(sampling_strategy='auto', random_state=42)
    }
    
    oversampled_data = {}
    for name, oversampler in oversamplers.items():
        print(f"\nApplying {name}...")
        X_resampled, y_resampled = oversampler.fit_resample(X_train_val, y_train_val)
        resampled_data = pd.DataFrame(X_resampled, columns=X.columns)
        resampled_data['Dangerous'] = y_resampled.map({1: 'Yes', 0: 'No'})
        resampled_data.to_csv(f'oversampled_{name}.csv', index=False)
        print(f"Class distribution after {name}:\n", resampled_data['Dangerous'].value_counts())
        oversampled_data[name] = resampled_data
    return oversampled_data, X_holdout, y_holdout, X_test_imbalanced, y_test_imbalanced, X.columns, data

# Adatk√©szletek bet√∂lt√©se vagy l√©trehoz√°sa
oversampled_files = {
    'random_oversampling': 'oversampled_random_oversampling.csv',
    'smote': 'oversampled_smote.csv',
    'adasyn': 'oversampled_adasyn.csv'
}
oversampled_data = {}
X_holdout, y_holdout = None, None
X_test_imbalanced, y_test_imbalanced = None, None
feature_columns = None
original_data = None
if all(os.path.exists(file) for file in oversampled_files.values()):
    print("Loading existing oversampled datasets...")
    for name, file in oversampled_files.items():
        data = pd.read_csv(file)
        original_data = pd.read_csv('animal_condition_processed.csv')
        original_data = original_data.dropna(subset=['Dangerous'])
        original_data['Dangerous'] = original_data['Dangerous'].str.strip().str.capitalize()
        original_data = original_data[original_data['Dangerous'].isin(['Yes', 'No'])]
        original_data[feature_cols] = original_data[feature_cols].fillna('unknown')
        X_orig = pd.get_dummies(original_data[feature_cols], columns=feature_cols).astype(int)
        X = data.drop('Dangerous', axis=1).reindex(columns=X_orig.columns, fill_value=0)
        data = X.copy()
        data['Dangerous'] = pd.read_csv(file)['Dangerous']
        oversampled_data[name] = data
        print(f"Class distribution after loading {name}:\n", data['Dangerous'].value_counts())
    X_orig_full = X_orig
    y_orig_full = original_data['Dangerous'].map({'Yes': 1, 'No': 0})
    X_orig, X_holdout, y_orig, y_holdout = train_test_split(X_orig_full, y_orig_full, test_size=0.2, random_state=42, stratify=y_orig_full)
    X_train_val, X_test_imbalanced, y_train_val, y_test_imbalanced = train_test_split(X_orig, y_orig, test_size=0.2, random_state=42, stratify=y_orig)
    feature_columns = X_orig.columns
else:
    oversampled_data, X_holdout, y_holdout, X_test_imbalanced, y_test_imbalanced, feature_columns, original_data = prepare_data()

# Keresztvalid√°ci√≥ az eredeti adaton
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
X_orig_full = pd.get_dummies(original_data[feature_cols], columns=feature_cols).astype(int)
y_orig_full = original_data['Dangerous'].map({'Yes': 1, 'No': 0})
scaler_orig = StandardScaler()
X_orig_scaled = scaler_orig.fit_transform(X_orig_full)

# Modellek
models = {
    'random_forest': RandomForestClassifier(
        class_weight='balanced', random_state=42, n_estimators=100, max_depth=3, min_samples_split=15
    ),
    'logistic_regression': LogisticRegression(
        class_weight='balanced', random_state=42, max_iter=1000, C=0.005
    ),
    'decision_tree': DecisionTreeClassifier(
        class_weight='balanced', random_state=42, max_depth=3, min_samples_split=15
    )
}

# Eredm√©nyek √©s predikci√≥k t√°rol√°sa
results = []
predictions = []
learning_curve_data = []

# Keresztvalid√°ci√≥ √©s tesztel√©s
for oversampler_name, data in oversampled_data.items():
    print(f"\nProcessing {oversampler_name} dataset...")
    
    X = data.drop('Dangerous', axis=1)
    y = data['Dangerous'].map({'Yes': 1, 'No': 0})
    
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
    
    for model_name, model in models.items():
        print(f"Training {model_name} on {oversampler_name}...")
        
        # Keresztvalid√°ci√≥ a t√∫lmintav√©telezett adaton
        cv_f1 = cross_val_score(model, X_train, y_train, cv=skf, scoring='f1_macro')
        cv_roc_auc = cross_val_score(model, X_train, y_train, cv=skf, scoring='roc_auc')
        
        # Keresztvalid√°ci√≥ az eredeti imbalanced adaton
        cv_f1_orig = cross_val_score(model, X_orig_scaled, y_orig_full, cv=skf, scoring='f1_macro')
        cv_roc_auc_orig = cross_val_score(model, X_orig_scaled, y_orig_full, cv=skf, scoring='roc_auc')
        
        model.fit(X_train, y_train)
        
        y_train_pred = model.predict(X_train)
        y_test_pred = model.predict(X_test)
        train_f1 = f1_score(y_train, y_train_pred, average='macro')
        test_f1 = f1_score(y_test, y_test_pred, average='macro')
        train_roc_auc = roc_auc_score(y_train, y_train_pred)
        test_roc_auc = roc_auc_score(y_test, y_test_pred)
        
        try:
            y_test_scores = model.predict_proba(X_test)[:, 1]
        except AttributeError:
            y_test_scores = model.predict(X_test)
        precision, recall, _ = precision_recall_curve(y_test, y_test_scores)
        pr_auc = auc(recall, precision)
        
        # Imbalanced tesztk√©szlet teljes√≠tm√©ny
        X_test_imbalanced_scaled = scaler.transform(X_test_imbalanced)
        y_test_imbalanced_pred = model.predict(X_test_imbalanced_scaled)
        test_imbalanced_f1 = f1_score(y_test_imbalanced, y_test_imbalanced_pred, average='macro')
        test_imbalanced_roc_auc = roc_auc_score(y_test_imbalanced, y_test_imbalanced_pred)
        test_imbalanced_report = classification_report(y_test_imbalanced, y_test_imbalanced_pred, target_names=['No', 'Yes'], output_dict=True)
        
        # Hold-out set teljes√≠tm√©ny
        X_holdout_scaled = scaler.transform(X_holdout)
        y_holdout_pred = model.predict(X_holdout_scaled)
        holdout_f1 = f1_score(y_holdout, y_holdout_pred, average='macro')
        holdout_roc_auc = roc_auc_score(y_holdout, y_holdout_pred)
        holdout_report = classification_report(y_holdout, y_holdout_pred, target_names=['No', 'Yes'], output_dict=True)
        
        f1_gap = train_f1 - test_f1
        roc_auc_gap = train_roc_auc - test_roc_auc
        
        report = classification_report(y_test, y_test_pred, target_names=['No', 'Yes'], output_dict=True)
        cm = confusion_matrix(y_test, y_test_pred)
        
        if model_name in ['random_forest', 'decision_tree']:
            feature_importance = pd.DataFrame({
                'Feature': feature_columns,
                'Importance': model.feature_importances_
            }).sort_values(by='Importance', ascending=False)
            print(f"Feature Importance for {oversampler_name} - {model_name}:\n", feature_importance.head(10))
            feature_importance.to_csv(f'feature_importance_{oversampler_name}_{model_name}.csv', index=False)
        
        result = {
            'oversampler': oversampler_name,
            'model': model_name,
            'cv_f1_mean': cv_f1.mean(),
            'cv_f1_std': cv_f1.std(),
            'cv_roc_auc_mean': cv_roc_auc.mean(),
            'cv_roc_auc_std': cv_roc_auc.std(),
            'cv_f1_orig': cv_f1_orig.mean(),
            'cv_roc_auc_orig': cv_roc_auc_orig.mean(),
            'train_f1': train_f1,
            'test_f1': test_f1,
            'f1_gap': f1_gap,
            'train_roc_auc': train_roc_auc,
            'test_roc_auc': test_roc_auc,
            'roc_auc_gap': roc_auc_gap,
            'pr_auc': pr_auc,
            'no_precision': report['No']['precision'],
            'no_recall': report['No']['recall'],
            'no_f1': report['No']['f1-score'],
            'yes_precision': report['Yes']['precision'],
            'yes_recall': report['Yes']['recall'],
            'yes_f1': report['Yes']['f1-score'],
            'test_imbalanced_f1': test_imbalanced_f1,
            'test_imbalanced_roc_auc': test_imbalanced_roc_auc,
            'test_imbalanced_no_f1': test_imbalanced_report['No']['f1-score'],
            'holdout_f1': holdout_f1,
            'holdout_roc_auc': holdout_roc_auc,
            'holdout_no_f1': holdout_report['No']['f1-score'],
            'confusion_matrix': cm.tolist()
        }
        results.append(result)
        
        pred_df = pd.DataFrame({
            'true_label': y_test,
            'predicted_label': y_test_pred,
            'proba_yes': y_test_scores
        })
        pred_df.to_csv(f'predictions_{oversampler_name}_{model_name}.csv', index=False)
        
        train_sizes, train_scores, valid_scores = learning_curve(
            model, X_scaled, y, cv=skf, scoring='f1_macro', train_sizes=np.linspace(0.1, 1.0, 10)
        )
        for size, train_score, valid_score in zip(train_sizes, train_scores.mean(axis=1), valid_scores.mean(axis=1)):
            learning_curve_data.append({
                'oversampler': oversampler_name,
                'model': model_name,
                'train_size': size,
                'train_f1': train_score,
                'valid_f1': valid_score,
                'f1_gap': train_score - valid_score
            })
        
        plt.figure(figsize=(8, 6))
        plt.plot(train_sizes, train_scores.mean(axis=1), label='Train F1')
        plt.plot(train_sizes, valid_scores.mean(axis=1), label='Validation F1')
        plt.title(f'Learning Curve: {oversampler_name} - {model_name}')
        plt.xlabel('Training Size')
        plt.ylabel('F1 Score (Macro)')
        plt.legend()
        plt.grid(True)
        plt.savefig(f'learning_curve_{oversampler_name}_{model_name}.png')
        plt.close()

# Eredm√©nyek ment√©se
results_df = pd.DataFrame(results)
results_df.to_csv('classification_results.csv', index=False)
results_df.to_json('classification_results.json', orient='records', lines=True)
learning_curve_df = pd.DataFrame(learning_curve_data)
learning_curve_df.to_csv('learning_curve_results.csv', index=False)

# Eredm√©nyek ki√≠r√°sa
print("\nClassification Results with Holdout and Imbalanced Test:")
print(results_df[['oversampler', 'model', 'cv_f1_mean', 'cv_f1_orig', 'test_f1', 'f1_gap', 'test_roc_auc', 'roc_auc_gap', 'pr_auc', 'no_f1', 'test_imbalanced_f1', 'test_imbalanced_no_f1', 'holdout_f1', 'holdout_no_f1']])

Loading existing oversampled datasets...
Class distribution after loading random_oversampling:
 Dangerous
Yes    543
No     543
Name: count, dtype: int64
Class distribution after loading smote:
 Dangerous
Yes    543
No     543
Name: count, dtype: int64
Class distribution after loading adasyn:
 Dangerous
No     546
Yes    543
Name: count, dtype: int64

Processing random_oversampling dataset...
Training random_forest on random_oversampling...
Feature Importance for random_oversampling - random_forest:
                          Feature  Importance
7                  Animal_cattle    0.063824
168     symptoms1_neck paralysis    0.062046
22             Species_buffaloes    0.058139
653   symptoms3_ruffled feathers    0.036683
38                 Species_sheep    0.035910
262           symptoms1_weakness    0.033116
1109          symptoms5_weakness    0.030241
93      symptoms1_drooping wings    0.027935
666        symptoms3_slow growth    0.026036
1014            symptoms5_lesion    0.025991

In [14]:
# Teljes kieg√©sz√≠tett k√≥d Luca k√©r√©se alapj√°n
# Megjegyz√©s: Az eredeti k√≥d el≈ëtt m√°r szerepelt a k√≥d alapja, itt csak a mappa-struktur√°lt kieg√©sz√≠t√©s l√°that√≥

import os
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# üìÅ Kimeneti mapp√°k l√©trehoz√°sa
output_dirs = {
    'predictions': 'oversampling_output/predictions',
    'feature_importance': 'oversampling_output/feature_importance',
    'learning_curves': 'oversampling_output/learning_curves',
    'confusion_matrices': 'oversampling_output/confusion_matrices',
    'results': 'oversampling_output/results'
}

for dir_path in output_dirs.values():
    os.makedirs(dir_path, exist_ok=True)

# üîÑ M√°r l√©tez≈ë f√°jlok √°thelyez√©se a megfelel≈ë mapp√°kba
for res in results:
    oversampler = res['oversampler']
    model = res['model']

    # üîπ Predikci√≥k
    pred_file = f'predictions_{oversampler}_{model}.csv'
    if os.path.exists(pred_file):
        os.rename(pred_file, os.path.join(output_dirs['predictions'], pred_file))

    # üîπ Feature importance
    importance_file = f'feature_importance_{oversampler}_{model}.csv'
    if os.path.exists(importance_file):
        os.rename(importance_file, os.path.join(output_dirs['feature_importance'], importance_file))

    # üîπ Learning curve
    curve_file = f'learning_curve_{oversampler}_{model}.png'
    if os.path.exists(curve_file):
        os.rename(curve_file, os.path.join(output_dirs['learning_curves'], curve_file))

    # üîπ Konf√∫zios m√°trix ment√©se
    cm = np.array(res['confusion_matrix'])
    plt.figure(figsize=(4, 3))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Pred: No', 'Pred: Yes'],
                yticklabels=['Actual: No', 'Actual: Yes'])
    plt.title(f'Confusion Matrix\n{oversampler} + {model}')
    plt.xlabel('Predicted label')
    plt.ylabel('True label')
    plt.tight_layout()
    cm_filename = f'conf_matrix_{oversampler}_{model}.png'
    plt.savefig(os.path.join(output_dirs['confusion_matrices'], cm_filename))
    plt.close()

# üìÑ V√©gs≈ë eredm√©nyek ment√©se mapp√°ba
results_df.to_csv(os.path.join(output_dirs['results'], 'classification_results.csv'), index=False)
results_df.to_json(os.path.join(output_dirs['results'], 'classification_results.json'), orient='records', lines=True)
learning_curve_df.to_csv(os.path.join(output_dirs['results'], 'learning_curve_results.csv'), index=False)

print("\nüìÅ Minden f√°jl sikeresen elmentve struktur√°lt mapp√°kba! üòé")


üìÅ Minden f√°jl sikeresen elmentve struktur√°lt mapp√°kba! üòé


In [3]:
import pandas as pd

# Adatok bet√∂lt√©se
df = pd.read_csv("animal_condition.csv")

# 1. Minden sz√∂veges oszlop kisbet≈±sre alak√≠t√°sa
text_columns = ['AnimalName', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5', 'Dangerous']
for col in text_columns:
    df[col] = df[col].str.strip().str.lower()

# 2. √öj oszlopok l√©trehoz√°sa: AnimalGroup, Animal, Species
def categorize_animal(animal_name):
    animal_name = animal_name.lower().strip()
    # Alap√©rtelmezett √©rt√©kek
    animal_group = 'unknown'
    animal = animal_name
    species = animal_name

    # Kategoriz√°l√°s
    if animal_name in ['dog', 'dogs']:
        animal_group = 'mammal'
        animal = 'dog'
        species = 'unknown'
    elif animal_name in ['cat']:
        animal_group = 'mammal'
        animal = 'cat'
        species = 'unknown'
    elif animal_name in ['rabbit']:
        animal_group = 'mammal'
        animal = 'rabbit'
        species = 'unknown'
    elif animal_name in ['cow', 'cattle', 'buffaloes']:
        animal_group = 'mammal'
        animal = 'cattle'
        species = animal_name if animal_name != 'cattle' else 'unknown'
    elif animal_name in ['horse', 'donkey', 'mules']:
        animal_group = 'mammal'
        animal = 'equine'
        species = animal_name
    elif animal_name in ['deer', 'reindeer', 'elk', 'wapiti', 'mule deer', 'black-tailed deer', 'sika deer', 'white-tailed deer', 'moos']:
        animal_group = 'mammal'
        animal = 'deer'
        species = animal_name if animal_name != 'deer' else 'unknown'
    elif animal_name in ['lion', 'tiger']:
        animal_group = 'mammal'
        animal = 'big cat'
        species = animal_name
    elif animal_name in ['fox', 'wolves', 'hyaenas']:
        animal_group = 'mammal'
        animal = 'canid'
        species = animal_name
    elif animal_name in ['goat', 'goats', 'sheep']:
        animal_group = 'mammal'
        animal = 'caprine'
        species = animal_name if animal_name != 'goats' else 'goat'
    elif animal_name in ['pig', 'pigs']:
        animal_group = 'mammal'
        animal = 'pig'
        species = 'unknown'
    elif animal_name in ['elephant']:
        animal_group = 'mammal'
        animal = 'elephant'
        species = 'unknown'
    elif animal_name in ['hamster']:
        animal_group = 'mammal'
        animal = 'hamster'
        species = 'unknown'
    elif animal_name in ['monkey']:
        animal_group = 'mammal'
        animal = 'monkey'
        species = 'unknown'
    elif animal_name in ['mammal','mammals']:
        animal_group = 'mammal'
        animal = 'unknown'
        species = 'unknown'
    elif animal_name in ['chicken', 'fowl', 'duck', 'birds', 'other birds']:
        animal_group = 'bird'
        animal = 'poultry' if animal_name in ['chicken', 'fowl', 'duck'] else 'other birds'
        species = animal_name if animal != 'other birds' else 'unknown'
    elif animal_name in ['turtle', 'snake']:
        animal_group = 'reptile'
        animal = animal_name
        species = 'unknown'

    return animal_group, animal, species

# √öj oszlopok hozz√°ad√°sa
df[['AnimalGroup', 'Animal', 'Species']] = df['AnimalName'].apply(lambda x: pd.Series(categorize_animal(x)))

# 3. √ñsszes√≠tett adathalmaz
df_processed = df[['AnimalGroup', 'Animal', 'Species', 'symptoms1', 'symptoms2', 'symptoms3', 'symptoms4', 'symptoms5', 'Dangerous']]

# 4. Feldolgozott adatok ment√©se
df_processed.to_csv("animal_condition_processed.csv", index=False)
print("\nFeldolgozott adatok mentve: animal_condition_processed.csv")


Feldolgozott adatok mentve: animal_condition_processed.csv
