In [None]:
import pandas as pd
from preprocessing import get_preset
import numpy as np
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LogisticRegression
import random

from sklearn.model_selection import RepeatedStratifiedKFold, RandomizedSearchCV
from scipy.stats import loguniform, randint, uniform
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier


In [None]:
# Seed referenced by later cells (e.g. as `random_state`) so runs are repeatable.
SEED = 42

# Seed both global generators: the stdlib one and NumPy's legacy global RNG.
random.seed(SEED)
np.random.seed(SEED)

In [None]:
# Raw inputs: training features, training labels and test features.
train_values = pd.read_csv("../data/train_set_values.csv")
train_labels = pd.read_csv("../data/train_set_labels.csv")
test_values  = pd.read_csv("../data/test_set_values.csv")

# Attach the labels to the training rows only (left join on "id");
# the test frame has no labels to merge.
train_df = train_values.merge(train_labels, on="id", how="left")

# Build the preprocessing object from a named preset.
# NOTE(review): `get_preset` is project-local; presumably it returns an
# sklearn-style transformer with fit_transform/transform -- confirm.
preset_name = "log_transform+remove_correlated+feature_engineer"
pre = get_preset(preset_name, train_df.columns.tolist())

# Fit on the training frame, then apply the fitted transform to the test frame.
train_processed = pre.fit_transform(train_df)
test_processed  = pre.transform(test_values)

print("Train shape:", train_processed.shape)
print("Test  shape:", test_processed.shape)

# Total number of missing cells left in each processed frame.
print(int(train_processed.isna().sum().sum()))
print(int(test_processed.isna().sum().sum()))

In [None]:
# Preview the first rows of the processed training frame; the cells below
# expect it to contain the "status_group" target column.
train_processed.head(5)

In [None]:
# Baseline: a random forest on the preprocessed training frame.
# Split features (X) and target (y) from train_processed.
X = train_processed.drop(columns=["status_group"])
y = train_processed["status_group"]

model = RandomForestClassifier(
    n_estimators=400,
    max_depth=None,
    class_weight="balanced_subsample",
    n_jobs=-1,
    random_state=SEED,
)

# Out-of-sample estimate. Scoring on the rows the forest was trained on says
# little about generalisation, so report a stratified 5-fold cross-validated
# weighted F1 first. (A classification metric is required here; "r2" is a
# regression score.) This fits the forest five extra times.
baseline_cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=SEED)
scores = cross_val_score(model, X, y, cv=baseline_cv, scoring="f1_weighted")
print("Cross-validation weighted F1 scores:", scores)
print("Mean CV weighted F1:", np.mean(scores))

# Fit the model on the full training set after CV.
model.fit(X, y)

# In-sample predictions: these are made on the TRAINING rows, not a test set,
# so the score below is an optimistic upper bound kept only for comparison
# with the cross-validated number above.
y_pred = model.predict(X)

f1_scor = f1_score(y, y_pred, average="weighted")
print("Train (in-sample) weighted F1:", f1_scor)

In [None]:
# Target and predictors for the model search, plus an independent copy of the
# processed test features.
y = train_processed["status_group"]
X = train_processed.drop(columns="status_group")
X_test = test_processed.copy()

# Standardise the numeric columns only; every other column is passed through
# unchanged, which matters if the frame has mixed dtypes.
num_cols = list(X.select_dtypes(include=[np.number]).columns)
preprocess = ColumnTransformer(
    [("scale", StandardScaler(with_mean=True), num_cols)],
    remainder="passthrough",
)

# Candidate models, keyed by name. Each is a two-step pipeline: the shared
# `preprocess` transformer followed by a classifier registered as "clf".
# Entries that are commented out are alternatives that are currently disabled;
# uncomment the matching entry in `param_dists` when enabling one.
# Stochastic estimators take `random_state=SEED` so the whole notebook is
# driven by the single seed defined at the top.
pipelines = {
    # "logreg_multinomial": Pipeline([
    #     ("prep", preprocess),
    #     ("clf", LogisticRegression(
    #         multi_class="multinomial", solver="lbfgs", max_iter=2000,
    #         class_weight="balanced"
    #     ))
    # ]),
    # "linear_svm_calibrated": Pipeline([
    #     ("prep", preprocess),
    #     ("clf", CalibratedClassifierCV(
    #         estimator=LinearSVC(class_weight="balanced"),
    #         method="sigmoid", cv=3
    #     ))
    # ]),
    "random_forest": Pipeline([
        ("prep", preprocess),
        ("clf", RandomForestClassifier(
            class_weight="balanced_subsample", n_jobs=-1, random_state=SEED
        ))
    ]),
    # "hist_gbdt": Pipeline([
    #     ("prep", preprocess),
    #     ("clf", HistGradientBoostingClassifier(random_state=SEED))
    # ]),
    # "knn": Pipeline([
    #     ("prep", preprocess),
    #     ("clf", KNeighborsClassifier())
    # ]),
}

# Search space per model name. Keys carry the "clf__" prefix because the
# classifier is the pipeline step named "clf". Lists are sampled as categorical
# choices; scipy distributions are sampled directly.
param_dists = {}

# param_dists["logreg_multinomial"] = {
#     "clf__C": loguniform(1e-3, 1e2),   # ~[0.001, 100]
# }

# param_dists["linear_svm_calibrated"] = {
#     "clf__estimator__C": loguniform(1e-3, 1e2),
#     "clf__method": ["sigmoid", "isotonic"],  # categorical choices are fine
# }

param_dists["random_forest"] = {
    "clf__n_estimators": randint(150, 1001),   # upper bound is exclusive
    "clf__max_depth": [None, 10, 20, 40, 80],
    "clf__min_samples_leaf": randint(1, 6),    # upper bound is exclusive
    "clf__max_features": ["sqrt", None],
}

# param_dists["hist_gbdt"] = {
#     "clf__learning_rate": loguniform(1e-3, 3e-1),  # ~[0.001, 0.3]
#     "clf__max_depth": [None, 3, 6, 9],
#     "clf__max_leaf_nodes": randint(15, 256),
#     "clf__max_iter": randint(150, 601),
#     "clf__l2_regularization": loguniform(1e-8, 10.0),
# }

# param_dists["knn"] = {
#     "clf__n_neighbors": randint(3, 64),
#     "clf__weights": ["uniform", "distance"],
#     "clf__p": [1, 2],  # Manhattan vs Euclidean
# }


# Number of parameter settings sampled per model by the randomized search.
# NOTE(review): with 1, only a single candidate is drawn per model -- raise
# this for an actual search.
N_ITER = 1

# Filled in by the search loop, keyed by model name.
search_results, best_estimators = {}, {}

# Metrics recorded for every candidate; the dict keys become the
# "mean_test_<key>" columns of cv_results_.
scoring = {
    "accuracy": "accuracy",
    "f1_macro": "f1_macro",
    "balanced_acc": "balanced_accuracy",
    "roc_auc_ovr": "roc_auc_ovr_weighted",
}

# Stratified 5-fold cross-validation, repeated twice (10 fits per candidate).
cv = RepeatedStratifiedKFold(n_repeats=2, n_splits=5, random_state=SEED)


In [None]:
# Run one randomized hyper-parameter search per enabled pipeline and record
# the refit best estimator plus its cross-validated metrics.
for name, pipe in pipelines.items():
    dist = param_dists[name]
    search = RandomizedSearchCV(
        estimator=pipe,
        param_distributions=dist,
        n_iter=N_ITER,
        scoring=scoring,
        refit="f1_macro",      # best candidate is chosen by macro F1 and refit on all of (X, y)
        cv=cv,
        n_jobs=-1,
        verbose=1,
        random_state=SEED,     # use the notebook-wide seed rather than a second hard-coded 42
        return_train_score=False,
    )
    search.fit(X, y)

    # Row of cv_results_ that belongs to the selected candidate.
    best_idx = search.best_index_
    cv_results = search.cv_results_

    best_estimators[name] = search.best_estimator_
    search_results[name] = {
        "best_params": search.best_params_,
        "best_f1_macro": search.best_score_,
        "best_accuracy": cv_results["mean_test_accuracy"][best_idx],
        "best_balanced_acc": cv_results["mean_test_balanced_acc"][best_idx],
        "best_roc_auc_ovr": cv_results["mean_test_roc_auc_ovr"][best_idx],
    }

# Compare models: one row per model, best macro F1 first.
# orient="index" builds the frame row-wise, so the metric columns keep a
# numeric dtype instead of being coerced to object by a transpose of
# mixed dict/float columns.
summary_df = (
    pd.DataFrame.from_dict(search_results, orient="index")
    .sort_values("best_f1_macro", ascending=False)
)
print(summary_df)

# Pick the winner: the top row after sorting by cross-validated macro F1.
winner_name = summary_df.index[0]
winner = best_estimators[winner_name]
print(f"\nSelected model: {winner_name} with params: {search_results[winner_name]['best_params']}")

# Diagnostics on the training data.
# `winner` is a search's best_estimator_, which RandomizedSearchCV has already
# refit on all of (X, y) because the search was created with refit="f1_macro".
# Fitting it again here would only repeat that work, so it is used as-is.
# These are in-sample numbers; the cross-validated scores in the summary table
# are the generalisation estimate.
train_pred = winner.predict(X)
print("\nConfusion matrix (train fit):\n", confusion_matrix(y, train_pred))
print("\nClassification report (train fit):\n", classification_report(y, train_pred, digits=3))

# Predict the test set; class probabilities only if the winner exposes them.
test_pred = winner.predict(X_test)
test_pred_proba = winner.predict_proba(X_test) if hasattr(winner, "predict_proba") else None