In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_curve, auc, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Accumulates one metrics dict per (dataset, model); filled by run_pipeline.
all_results = []

# Dataset label -> CSV path, in the order the datasets are processed.
datasets = {
    "JM1": "../data/jm1.csv",
    "KC1": "../data/kc1.csv",
    "KC2": "../data/kc2.csv",
    "PC1": "../data/pc1.csv",
    "CM1": "../data/cm1.csv",
}

# Seaborn theme applied to every figure drawn afterwards.
sns.set(style='whitegrid')

def run_pipeline(dataset_name, path):
    """Train six classifiers on one dataset and record their test-set metrics.

    Reads the CSV at ``path``, encodes the target as 0/1, makes a stratified
    80/20 split, standardizes the features, and fits Logistic Regression,
    Random Forest and an MLP on the plain training data plus Random Forest,
    XGBoost and an MLP on SMOTE-resampled training data.

    Args:
        dataset_name: Label stored in the ``Dataset`` field of each result.
            ``'KC2'`` selects the ``problems`` target column; every other
            name selects ``defects``.
        path: CSV location handed to ``pd.read_csv``.

    Returns:
        None. One dict per model (Dataset, Model, Accuracy, Precision,
        Recall, F1-score) is appended to the module-level ``all_results``.
    """
    print(f"\n📁 === Processing {dataset_name} ===")
    df = pd.read_csv(path)
    target_col = 'problems' if dataset_name == 'KC2' else 'defects'
    # Text targets are mapped yes/no -> 1/0; any other dtype is only cast to int.
    if df[target_col].dtype == object:
        df[target_col] = df[target_col].str.lower().map({'yes': 1, 'no': 0})
    df[target_col] = df[target_col].astype(int)

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Split BEFORE scaling: fitting the scaler on all rows would let the
    # test rows' mean/variance leak into the training features.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Fixed seed for the stochastic estimators and SMOTE so reruns are repeatable.
    seed = 42

    lr = LogisticRegression(max_iter=1000).fit(X_train, y_train)
    rf = RandomForestClassifier(random_state=seed).fit(X_train, y_train)
    ann = MLPClassifier(max_iter=500, random_state=seed).fit(X_train, y_train)

    # Oversample the training portion only; the test set is left untouched.
    sm = SMOTE(random_state=seed)
    X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)
    rf_sm = RandomForestClassifier(random_state=seed).fit(X_train_sm, y_train_sm)
    xgb_sm = XGBClassifier(
        use_label_encoder=False, eval_metric='logloss', random_state=seed
    ).fit(X_train_sm, y_train_sm)
    ann_sm = MLPClassifier(max_iter=500, random_state=seed).fit(X_train_sm, y_train_sm)

    models = {
        'Logistic Regression': lr,
        'Random Forest (No SMOTE)': rf,
        'ANN (No SMOTE)': ann,
        'Random Forest (SMOTE)': rf_sm,
        'XGBoost (SMOTE)': xgb_sm,
        'ANN (SMOTE)': ann_sm
    }

    # Every model is scored on the same held-out test split.
    for name, model in models.items():
        y_pred = model.predict(X_test)
        all_results.append({
            'Dataset': dataset_name,
            'Model': name,
            'Accuracy': accuracy_score(y_test, y_pred),
            'Precision': precision_score(y_test, y_pred),
            'Recall': recall_score(y_test, y_pred),
            'F1-score': f1_score(y_test, y_pred)
        })
# Evaluate every dataset; each call appends its rows to all_results.
for ds_name, ds_path in datasets.items():
    run_pipeline(ds_name, ds_path)

df_all_results = pd.DataFrame(all_results)
# Flag only labels ending in "(SMOTE)". A plain 'SMOTE' substring test is
# also true for the "(No SMOTE)" labels and would put those models in the
# wrong group (and the wrong chart panel).
df_all_results['SMOTE'] = df_all_results['Model'].str.endswith('(SMOTE)')
display(df_all_results.round(3))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score']

# The two groups do not depend on the metric, so they are built once.
df_no = df_all_results[~df_all_results['SMOTE']]
df_smote = df_all_results[df_all_results['SMOTE']]

# One figure per metric: left panel without SMOTE, right panel with SMOTE,
# sharing the y-axis so the bars are directly comparable.
for metric in metrics:
    fig, axes = plt.subplots(1, 2, figsize=(18, 6), sharey=True)

    sns.barplot(data=df_no, x='Dataset', y=metric, hue='Model', ax=axes[0])
    axes[0].set_title(f'{metric} före SMOTE')
    axes[0].tick_params(axis='x', rotation=15)

    sns.barplot(data=df_smote, x='Dataset', y=metric, hue='Model', ax=axes[1])
    axes[1].set_title(f'{metric} efter SMOTE')
    axes[1].tick_params(axis='x', rotation=15)

    axes[0].legend(title='Modell')
    axes[1].legend(title='Modell')

    plt.tight_layout()
    plt.show()
# Draw one ROC figure per dataset for models trained on SMOTE-resampled data.
for dataset in df_all_results['Dataset'].unique():
    print(f"\n📊 ROC-kurvor för dataset: {dataset}")
    df = pd.read_csv(datasets[dataset])
    target_col = 'problems' if dataset == 'KC2' else 'defects'
    # Text targets are mapped yes/no -> 1/0; any other dtype is only cast to int.
    if df[target_col].dtype == object:
        df[target_col] = df[target_col].str.lower().map({'yes': 1, 'no': 0})
    df[target_col] = df[target_col].astype(int)

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Split first and fit the scaler on the training rows only, so no
    # test-set statistics leak into the features the models are trained on.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Seeded resampling and estimators so the curves are repeatable between runs.
    X_train_sm, y_train_sm = SMOTE(random_state=42).fit_resample(X_train, y_train)

    models = {
        'Random Forest': RandomForestClassifier(random_state=42).fit(X_train_sm, y_train_sm),
        'XGBoost': XGBClassifier(
            use_label_encoder=False, eval_metric='logloss', random_state=42
        ).fit(X_train_sm, y_train_sm),
        'ANN': MLPClassifier(max_iter=500, random_state=42).fit(X_train_sm, y_train_sm)
    }

    plt.figure(figsize=(10, 7))
    for name, model in models.items():
        # Use the class-1 probability as the ranking score when available.
        if hasattr(model, "predict_proba"):
            y_scores = model.predict_proba(X_test)[:, 1]
        else:
            y_scores = model.decision_function(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

    # Diagonal reference line for a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--', label='Slump (AUC = 0.5)')
    plt.title(f'ROC-kurvor – {dataset}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (Recall)')
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()
# Compute a ROC AUC for every row in all_results by retraining the model
# that the row's label describes, then attach the scores as an 'AUC' column.
auc_scores = []

# Scaled train/test split per dataset, built on first use. The split is
# seeded, so it is identical for every row of the same dataset and there is
# no need to re-read and re-split the CSV for each model.
split_cache = {}

for row in all_results:
    dataset = row['Dataset']
    model_name = row['Model']

    if dataset not in split_cache:
        df = pd.read_csv(datasets[dataset])
        target_col = 'problems' if dataset == 'KC2' else 'defects'
        if df[target_col].dtype == object:
            df[target_col] = df[target_col].str.lower().map({'yes': 1, 'no': 0})
        df[target_col] = df[target_col].astype(int)

        X = df.drop(columns=[target_col])
        y = df[target_col]
        X_train_raw, X_test_raw, y_train, y_test = train_test_split(
            X, y, stratify=y, test_size=0.2, random_state=42
        )
        # Fit the scaler on the training rows only to avoid leaking
        # test-set statistics into training.
        scaler = StandardScaler()
        split_cache[dataset] = (
            scaler.fit_transform(X_train_raw),
            scaler.transform(X_test_raw),
            y_train,
            y_test,
        )
    X_train, X_test, y_train, y_test = split_cache[dataset]

    # Only labels carrying the "(SMOTE)" tag are oversampled; a bare 'SMOTE'
    # substring test would also match the "(No SMOTE)" labels.
    if '(SMOTE)' in model_name:
        X_train, y_train = SMOTE(random_state=42).fit_resample(X_train, y_train)

    # Pick the estimator from the label; seeded so reruns are repeatable.
    if 'Logistic Regression' in model_name:
        model = LogisticRegression(max_iter=1000)
    elif 'Random Forest' in model_name:
        model = RandomForestClassifier(random_state=42)
    elif 'XGBoost' in model_name:
        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    elif 'ANN' in model_name:
        model = MLPClassifier(max_iter=500, random_state=42)
    else:
        # Unknown label: keep the list aligned with the result rows.
        auc_scores.append(None)
        continue

    model.fit(X_train, y_train)

    if hasattr(model, "predict_proba"):
        y_scores = model.predict_proba(X_test)[:, 1]
    else:
        y_scores = model.decision_function(X_test)

    auc_scores.append(roc_auc_score(y_test, y_scores))

df_all_results['AUC'] = auc_scores
display(df_all_results.round(3))


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
# Accumulates one metrics dict per (dataset, model); filled by run_pipeline.
all_results = []

# Dataset label -> CSV path, in the order the datasets are processed.
datasets = {
    "JM1": "../data/jm1.csv",
    "KC1": "../data/kc1.csv",
    "KC2": "../data/kc2.csv",
    "PC1": "../data/pc1.csv",
    "CM1": "../data/cm1.csv",
}
def run_pipeline(dataset_name, path):
    """Evaluate four classifiers on one dataset, without and with SMOTE.

    Reads the CSV at ``path``, encodes the target as 0/1, makes a stratified
    80/20 split, standardizes the features, then fits each model first on
    the plain training data and afterwards on SMOTE-resampled training data,
    scoring both fits on the same test split.

    Args:
        dataset_name: Label stored in the ``Dataset`` field of each result.
            ``'KC2'`` selects the ``problems`` target column; every other
            name selects ``defects``.
        path: CSV location handed to ``pd.read_csv``.

    Returns:
        None. Eight dicts (Dataset, Model, Accuracy, Precision, Recall,
        F1-score, AUC, SMOTE) are appended to the module-level
        ``all_results``: the four "(No SMOTE)" rows first, then the four
        "(SMOTE)" rows.
    """
    print(f"\n📁 Processing {dataset_name}")
    df = pd.read_csv(path)

    target_col = 'problems' if dataset_name == 'KC2' else 'defects'
    # Text targets are mapped yes/no -> 1/0; any other dtype is only cast to int.
    if df[target_col].dtype == object:
        df[target_col] = df[target_col].str.lower().map({'yes': 1, 'no': 0})
    df[target_col] = df[target_col].astype(int)

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Split BEFORE scaling: fitting the scaler on all rows would let the
    # test rows' mean/variance leak into the training features.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Fixed seed for the stochastic estimators and SMOTE so reruns are repeatable.
    seed = 42

    # Model types
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=seed),
        "ANN": MLPClassifier(max_iter=500, random_state=seed),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=seed)
    }

    # Oversample the training portion only; the test set is left untouched.
    X_train_sm, y_train_sm = SMOTE(random_state=seed).fit_resample(X_train, y_train)

    # (label suffix, SMOTE flag, training features, training labels)
    training_sets = (
        ("No SMOTE", False, X_train, y_train),
        ("SMOTE", True, X_train_sm, y_train_sm),
    )

    for tag, uses_smote, X_fit, y_fit in training_sets:
        for name, model in models.items():
            # The same estimator object is refitted for the second variant.
            model.fit(X_fit, y_fit)
            y_pred = model.predict(X_test)
            # AUC needs a ranking score; recorded as None when the model has none.
            if hasattr(model, "predict_proba"):
                auc_val = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
            else:
                auc_val = None
            all_results.append({
                'Dataset': dataset_name,
                'Model': f"{name} ({tag})",
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred),
                'Recall': recall_score(y_test, y_pred),
                'F1-score': f1_score(y_test, y_pred),
                'AUC': auc_val,
                'SMOTE': uses_smote
            })
# Evaluate every dataset; each call appends its rows to all_results.
for ds_name in datasets:
    ds_path = datasets[ds_name]
    run_pipeline(ds_name, ds_path)

# Collect the rows into a table, persist it as CSV, and show it rounded
# to three decimals.
df_results = pd.DataFrame(all_results)
df_results.to_csv(
    "all_model_results_with_and_without_smote.csv",
    index=False,
)
rounded_results = df_results.round(3)
display(rounded_results)


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, roc_curve, auc
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import seaborn as sns

# Accumulates one metrics dict per (dataset, model); filled by run_pipeline.
all_results = []

# Dataset label -> CSV path, in the order the datasets are processed.
datasets = {
    "JM1": "../data/jm1.csv",
    "KC1": "../data/kc1.csv",
    "KC2": "../data/kc2.csv",
    "PC1": "../data/pc1.csv",
    "CM1": "../data/cm1.csv",
}

# Seaborn theme applied to every figure drawn afterwards.
sns.set(style='whitegrid')
def run_pipeline(dataset_name, path):
    """Evaluate four classifiers on one dataset, without and with SMOTE.

    Reads the CSV at ``path``, encodes the target as 0/1, makes a stratified
    80/20 split, standardizes the features, then fits each model first on
    the plain training data and afterwards on SMOTE-resampled training data,
    scoring both fits on the same test split.

    Args:
        dataset_name: Label stored in the ``Dataset`` field of each result.
            ``'KC2'`` selects the ``problems`` target column; every other
            name selects ``defects``.
        path: CSV location handed to ``pd.read_csv``.

    Returns:
        None. Eight dicts (Dataset, Model, Accuracy, Precision, Recall,
        F1-score, AUC, SMOTE) are appended to the module-level
        ``all_results``: the four "(No SMOTE)" rows first, then the four
        "(SMOTE)" rows.
    """
    print(f"\n📁 Processing {dataset_name}")
    df = pd.read_csv(path)

    target_col = 'problems' if dataset_name == 'KC2' else 'defects'
    # Text targets are mapped yes/no -> 1/0; any other dtype is only cast to int.
    if df[target_col].dtype == object:
        df[target_col] = df[target_col].str.lower().map({'yes': 1, 'no': 0})
    df[target_col] = df[target_col].astype(int)

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Split BEFORE scaling: fitting the scaler on all rows would let the
    # test rows' mean/variance leak into the training features.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Fixed seed for the stochastic estimators and SMOTE so reruns are repeatable.
    seed = 42

    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(random_state=seed),
        "ANN": MLPClassifier(max_iter=500, random_state=seed),
        "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=seed)
    }

    # Oversample the training portion only; the test set is left untouched.
    X_train_sm, y_train_sm = SMOTE(random_state=seed).fit_resample(X_train, y_train)

    # (label suffix, SMOTE flag, training features, training labels)
    training_sets = (
        ("No SMOTE", False, X_train, y_train),
        ("SMOTE", True, X_train_sm, y_train_sm),
    )

    for tag, uses_smote, X_fit, y_fit in training_sets:
        for name, model in models.items():
            # The same estimator object is refitted for the second variant.
            model.fit(X_fit, y_fit)
            y_pred = model.predict(X_test)
            # AUC needs a ranking score; recorded as None when the model has none.
            if hasattr(model, "predict_proba"):
                auc_val = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
            else:
                auc_val = None
            all_results.append({
                'Dataset': dataset_name,
                'Model': f"{name} ({tag})",
                'Accuracy': accuracy_score(y_test, y_pred),
                'Precision': precision_score(y_test, y_pred),
                'Recall': recall_score(y_test, y_pred),
                'F1-score': f1_score(y_test, y_pred),
                'AUC': auc_val,
                'SMOTE': uses_smote
            })
# Evaluate every dataset; each call appends its rows to all_results.
for ds_name in datasets:
    ds_path = datasets[ds_name]
    run_pipeline(ds_name, ds_path)

# Collect the rows into a table, persist it as CSV, and show it rounded
# to three decimals.
df_results = pd.DataFrame(all_results)
df_results.to_csv(
    "all_model_results_with_and_without_smote.csv",
    index=False,
)
display(df_results.round(3))
metrics = ['Accuracy', 'Precision', 'Recall', 'F1-score', 'AUC']

# Rows trained without / with SMOTE; the same two subsets feed every figure.
df_no = df_results[~df_results['SMOTE']]
df_smote = df_results[df_results['SMOTE']]

# One figure per metric: left panel without SMOTE, right panel with SMOTE,
# sharing the y-axis so the bars are directly comparable.
for metric in metrics:
    fig, axes = plt.subplots(1, 2, figsize=(18, 6), sharey=True)
    panels = (
        (axes[0], df_no, f'{metric} – Before SMOTE'),
        (axes[1], df_smote, f'{metric} – After SMOTE'),
    )

    for ax, subset, title in panels:
        sns.barplot(data=subset, x='Dataset', y=metric, hue='Model', ax=ax)
        ax.set_title(title)
        ax.tick_params(axis='x', rotation=15)

    # Anchor both legends outside the plotting area.
    for ax in axes:
        ax.legend(title='Model', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()
# Draw one ROC figure per dataset for models trained on SMOTE-resampled data.
for dataset in df_results['Dataset'].unique():
    print(f"\n📊 ROC Curves – {dataset}")
    df = pd.read_csv(datasets[dataset])
    target_col = 'problems' if dataset == 'KC2' else 'defects'
    # Text targets are mapped yes/no -> 1/0; any other dtype is only cast to int.
    if df[target_col].dtype == object:
        df[target_col] = df[target_col].str.lower().map({'yes': 1, 'no': 0})
    df[target_col] = df[target_col].astype(int)

    X = df.drop(columns=[target_col])
    y = df[target_col]

    # Split first and fit the scaler on the training rows only, so no
    # test-set statistics leak into the features the models are trained on.
    X_train_raw, X_test_raw, y_train, y_test = train_test_split(
        X, y, stratify=y, test_size=0.2, random_state=42
    )
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train_raw)
    X_test = scaler.transform(X_test_raw)

    # Seeded resampling and estimators so the curves are repeatable between runs.
    X_train_sm, y_train_sm = SMOTE(random_state=42).fit_resample(X_train, y_train)

    models = {
        'Random Forest': RandomForestClassifier(random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
        'ANN': MLPClassifier(max_iter=500, random_state=42),
        'Logistic Regression': LogisticRegression(max_iter=1000)
    }

    plt.figure(figsize=(10, 7))
    for name, model in models.items():
        model.fit(X_train_sm, y_train_sm)
        # Use the class-1 probability as the ranking score when available.
        if hasattr(model, "predict_proba"):
            y_scores = model.predict_proba(X_test)[:, 1]
        else:
            y_scores = model.decision_function(X_test)
        fpr, tpr, _ = roc_curve(y_test, y_scores)
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, label=f'{name} (AUC = {roc_auc:.2f})')

    # Diagonal reference line for a classifier that guesses at random.
    plt.plot([0, 1], [0, 1], 'k--', label='Random Guessing')
    plt.title(f'ROC Curves – {dataset}')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate (Recall)')
    plt.legend(loc='lower right')
    plt.grid()
    plt.show()
