In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Import our custom modules
from src.preprocessing.presets import get_preset 
from src.model.models import create_and_return_all_models
from src.hyperparameter_tuning import get_param_dists, run_hyperparameter_search
from src.train.evaluate import show_final_evaluation, create_submission_file


# ## 2. Load and Preprocess Data
print("--- Step 1: Loading and Preprocessing Data ---")
train_values = pd.read_csv("data/train_set_values.csv")
train_labels = pd.read_csv("data/train_set_labels.csv")
test_values = pd.read_csv("data/test_set_values.csv")

# Attach the label to every training row by "id". `validate` makes pandas
# raise a MergeError if "id" is duplicated on either side, instead of silently
# multiplying rows in the merged frame.
train_df = pd.merge(
    train_values,
    train_labels,
    on="id",
    how="left",
    validate="one_to_one",
)

# A left join keeps feature rows that have no matching label and fills the
# target with NaN. Fail fast here rather than let NaN reach the label encoder.
n_unlabelled = int(train_df["status_group"].isna().sum())
if n_unlabelled:
    raise ValueError(
        f"{n_unlabelled} training rows have no 'status_group' label "
        "after merging values and labels on 'id'."
    )

# Build the preprocessing pipeline registered under this preset name in
# src.preprocessing.presets; it receives the merged training column names.
preset_name = "log_transform+remove_correlated+feature_engineer"
pre = get_preset(preset_name, list(train_df.columns))

# Fit on the training frame only, then apply the fitted pipeline to the test set.
train_processed = pre.fit_transform(train_df)
test_processed = pre.transform(test_values)
print("Data preprocessing complete.")


# ## 3. Prepare Data for Modeling
# Pull the raw target column out of the processed training frame; everything
# that remains is the feature matrix.
y_raw = train_processed["status_group"]
X = train_processed.drop("status_group", axis="columns")

# Encode the target labels as integers. The fitted encoder is kept in `le`
# because the evaluation and submission steps need it later.
le = LabelEncoder()
y = le.fit(y_raw).transform(y_raw)


# ## 4. Run Cross-Validated Hyperparameter Search
# Seed handed to the model factory.
SEED = 42
# Value forwarded as `n_iter` to the search. 1 is only a quick smoke run;
# raise it (e.g. to 25) for a real search.
N_SEARCH_ITER = 1

# Candidate models come from models.py. X is passed along so the factory can
# inspect the feature columns (per the original note: to find the numeric
# columns for the scaler).
models_to_tune = create_and_return_all_models(X_train=X, seed=SEED)

# Parameter distributions to search over.
param_dists = get_param_dists()

summary_df, best_estimators = run_hyperparameter_search(
    models_to_tune,
    param_dists,
    X,
    y,
    n_iter=N_SEARCH_ITER,
)

print("\n--- Hyperparameter Search Summary ---", summary_df, sep="\n")


# ## 5. Evaluate and Submit the Winning Model
# Take the first row of the summary table as the winner.
# NOTE(review): this assumes run_hyperparameter_search returns summary_df
# sorted best-first by its f1_macro score -- confirm before relying on it.
winner_name = summary_df.index[0]
winner_model = best_estimators[winner_name]
winner_params = summary_df.loc[winner_name, 'best_params']

print(f"\n--- Winning Model: {winner_name} ---")
print(f"Best Parameters Found: {winner_params}")

# Print the detailed evaluation report for the winner via
# show_final_evaluation (imported from src.train.evaluate).
# NOTE(review): per the original comment, the search already refits the best
# estimator on the full data; if so, this report is computed on the same rows
# the model was trained on and will look optimistic.
show_final_evaluation(winner_model, X, y, le)

# Write the submission file from the winner's predictions on the processed
# test set. The file name is derived from the winner's name with spaces
# replaced by underscores.
X_test = test_processed.copy()
winner_slug = winner_name.replace(' ', '_')
submission_df = create_submission_file(
    model=winner_model,
    X_test=X_test,
    original_test_df=test_values,
    label_encoder=le,
    filename=f"submission_{winner_slug}.csv",
)

print(
    "\n--- Project Finished ---",
    "Final submission head:",
    submission_df.head(),
    sep="\n",
)


--- Step 1: Loading and Preprocessing Data ---
{'all_columns': ['id', 'amount_tsh', 'date_recorded', 'funder', 'gps_height', 'installer', 'longitude', 'latitude', 'wpt_name', 'num_private', 'basin', 'subvillage', 'region', 'region_code', 'district_code', 'lga', 'ward', 'population', 'public_meeting', 'recorded_by', 'scheme_management', 'scheme_name', 'permit', 'construction_year', 'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 'management_group', 'payment', 'payment_type', 'water_quality', 'quality_group', 'quantity', 'quantity_group', 'source', 'source_type', 'source_class', 'waterpoint_type', 'waterpoint_type_group', 'status_group'], 'remove_col_after_log': True, 'cat_col_cut_off': 10, 'cat_columns': ['installer', 'wpt_name', 'basin', 'public_meeting', 'scheme_management', 'permit', 'extraction_type', 'management', 'payment', 'water_quality', 'quantity', 'quantity_group', 'waterpoint_type', 'recorded_by'], 'log_transform_cols': ['amount_tsh', 'popu