In [4]:
from sklearn.linear_model import LogisticRegression

# Train Logistic Regression
logreg_kc1 = LogisticRegression(max_iter=1000)
logreg_kc1.fit(X_train_smote, y_train_smote)

# Predict
y_pred_logreg_kc1 = logreg_kc1.predict(X_test)

# Evaluate
print("Logistic Regression (KC1)")
print(confusion_matrix(y_test, y_pred_logreg_kc1))
print("\nClassification Report:\n", classification_report(y_test, y_pred_logreg_kc1))


Logistic Regression (KC1)
[[265  92]
 [ 22  43]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.74      0.82       357
           1       0.32      0.66      0.43        65

    accuracy                           0.73       422
   macro avg       0.62      0.70      0.63       422
weighted avg       0.83      0.73      0.76       422



In [5]:
from xgboost import XGBClassifier

# Train XGBoost
xgb_model_kc1 = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_model_kc1.fit(X_train_smote, y_train_smote)

# Predict
y_pred_xgb_kc1 = xgb_model_kc1.predict(X_test)

# Evaluate
print("XGBoost (KC1)")
print(confusion_matrix(y_test, y_pred_xgb_kc1))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb_kc1))


XGBoost (KC1)
[[332  25]
 [ 29  36]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.93      0.92       357
           1       0.59      0.55      0.57        65

    accuracy                           0.87       422
   macro avg       0.75      0.74      0.75       422
weighted avg       0.87      0.87      0.87       422



Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Load KC1 dataset
df = pd.read_csv('../data/KC1.csv')
df['defects'] = df['defects'].astype(int)

# Separate features and target
X = df.drop('defects', axis=1)
y = df['defects']

# Normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

# Apply SMOTE
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Show class balance after SMOTE
print("SMOTE training class distribution:\n", y_train_smote.value_counts())


SMOTE training class distribution:
 defects
0    1426
1    1426
Name: count, dtype: int64


[WinError 2] Det g√•r inte att hitta filen
  File "C:\Users\josef\AppData\Roaming\Python\Python313\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "c:\Program Files\Python313\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Program Files\Python313\Lib\subprocess.py", line 1036, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^

In [2]:
# Train Random Forest
rf_model_kc1 = RandomForestClassifier(random_state=42)
rf_model_kc1.fit(X_train_smote, y_train_smote)

# Predict
y_pred_rf_kc1 = rf_model_kc1.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred_rf_kc1))
print("\nClassification Report:\n", classification_report(y_test, y_pred_rf_kc1))


[[322  35]
 [ 29  36]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.90      0.91       357
           1       0.51      0.55      0.53        65

    accuracy                           0.85       422
   macro avg       0.71      0.73      0.72       422
weighted avg       0.85      0.85      0.85       422



In [3]:
# Train ANN
ann_model_kc1 = MLPClassifier(hidden_layer_sizes=(100, 50), max_iter=500, random_state=42)
ann_model_kc1.fit(X_train_smote, y_train_smote)

# Predict
y_pred_ann_kc1 = ann_model_kc1.predict(X_test)

# Evaluate
print(confusion_matrix(y_test, y_pred_ann_kc1))
print("\nClassification Report:\n", classification_report(y_test, y_pred_ann_kc1))


[[297  60]
 [ 25  40]]

Classification Report:
               precision    recall  f1-score   support

           0       0.92      0.83      0.87       357
           1       0.40      0.62      0.48        65

    accuracy                           0.80       422
   macro avg       0.66      0.72      0.68       422
weighted avg       0.84      0.80      0.81       422

