In [1]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

from constants import numeric_features, categorical_features
from part2.shared import load_processed_data, load_train_with_validation_data

In [2]:

# Load the processed dataset and drop every row whose "Target encoded" value
# is missing, so the label vector below contains no NaNs.
df = load_processed_data().dropna(subset=["Target encoded"])

# Label vector: the encoded target column.
y = df["Target encoded"]
# Feature matrix: everything except the raw and the encoded target columns.
X = df.drop(columns=["Target", "Target encoded"])

In [3]:
# Restrict each configured feature list to the columns actually present in X,
# keeping the order in which the names appear in the configured lists.
num_features, cat_features = (
    [name for name in configured if name in X.columns]
    for configured in (numeric_features, categorical_features)
)

In [4]:
# Numeric preprocessing: fill missing values with the column mean, then
# standardize each column with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

In [5]:
# Categorical preprocessing: fill missing values with the most frequent
# category, then one-hot encode. handle_unknown="ignore" keeps transform()
# from raising on categories that were not seen during fit.
cat_pipeline = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

In [6]:
# Route numeric columns through num_pipeline and categorical columns through
# cat_pipeline. No `remainder` argument is given, so ColumnTransformer's
# default handling applies to any column in neither list.
full_pipeline = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_features),
        ("cat", cat_pipeline, cat_features),
    ]
)


In [7]:
# Split features and labels into train / validation / test partitions.
# NOTE(review): the split ratios, shuffling and seeding live inside
# load_train_with_validation_data and are not visible here - confirm there.
(
    X_train, X_val, X_test,
    y_train, y_val, y_test,
) = load_train_with_validation_data(X, y)

In [8]:
# Fixed seed for estimators that use randomness, so the reports below are
# identical on every Restart & Run All.
RANDOM_STATE = 42

# Baseline classifiers to compare, keyed by the display name used in the output.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    # The decision tree randomly permutes candidate features at each split;
    # without a seed its metrics can change from run to run.
    "Decision Tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "SVM": SVC()
}

for name, model in models.items():
    # Give each model its own unfitted copy of the preprocessor. Pipeline.fit
    # fits its steps in place, so reusing the single `full_pipeline` instance
    # would make every model's pipeline share (and overwrite) the same fitted
    # preprocessing state.
    pipeline = Pipeline([
        ("preprocessor", clone(full_pipeline)),
        ("classifier", model)
    ])
    # Preprocessing statistics and the classifier are learned from the
    # training split only; the other splits are only transformed/predicted.
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    y_val_pred = pipeline.predict(X_val)
    y_train_pred = pipeline.predict(X_train)

    print(f"\n{name}")
    print("Test Set Metrics:")
    print(classification_report(y_test, y_pred))
    print("Validation Set Metrics:")
    print(classification_report(y_val, y_val_pred))
    # Train metrics are reported for comparison against the held-out splits.
    print("Train Set Metrics:")
    print(classification_report(y_train, y_train_pred))


Logistic Regression
Test Set Metrics:
              precision    recall  f1-score   support

        -1.0       0.91      0.86      0.88       284
         1.0       0.91      0.95      0.93       443

    accuracy                           0.91       727
   macro avg       0.91      0.90      0.91       727
weighted avg       0.91      0.91      0.91       727

Validation Set Metrics:
              precision    recall  f1-score   support

        -1.0       0.95      0.83      0.88       147
         1.0       0.89      0.97      0.93       216

    accuracy                           0.91       363
   macro avg       0.92      0.90      0.91       363
weighted avg       0.91      0.91      0.91       363

Train Set Metrics:
              precision    recall  f1-score   support

        -1.0       0.94      0.85      0.89       990
         1.0       0.91      0.97      0.94      1550

    accuracy                           0.92      2540
   macro avg       0.93      0.91      0.91   