# In [None]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Import our custom modules
# NOTE: Make sure 'get_preset' is in 'src/preprocessing/presets.py'
from src.preprocessing.presets import get_preset 
from src.model.models import create_and_return_all_models
from src.hyperparameter_tuning import get_param_dists, run_hyperparameter_search

# ## 2. Load and Preprocess Data
print("--- Step 1: Loading and Preprocessing Data ---")

# Read the three competition CSVs in a fixed order: training features,
# training labels, then the unlabeled test features.
train_values, train_labels, test_values = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train_set_values.csv",
        "data/train_set_labels.csv",
        "data/test_set_values.csv",
    )
)

# Attach the labels to the training features by "id". The left join keeps
# every feature row, even one without a matching label row.
train_df = train_values.merge(train_labels, on="id", how="left")

# Build the named preprocessing preset from the merged frame's column names.
preset_name = "log_transform+remove_correlated+feature_engineer"
pre = get_preset(preset_name, train_df.columns.tolist())

# Fit the preprocessing on the training frame only, then reuse the fitted
# object to transform the test features.
train_processed = pre.fit_transform(train_df)
test_processed = pre.transform(test_values)
print("Data preprocessing complete.")

# ## 3. Prepare Data for Modeling
# Separate the processed training frame into the raw target column and the
# remaining feature columns.
y_raw = train_processed["status_group"]
X = train_processed.drop("status_group", axis=1)

# Encode the target labels as integers. The fitted encoder `le` is kept so
# that predictions can be decoded back to the original labels.
le = LabelEncoder()
y = le.fit_transform(y_raw)

# ## 4. Run Cross-Validated Hyperparameter Search
# Untrained model pipelines to tune. X is passed in so the factory can pick
# out the numeric columns for scaling (per the original note; the factory
# itself lives in src/model/models.py).
models_to_tune = create_and_return_all_models(X_train=X, seed=42)

# Parameter distributions for the search.
param_dists = get_param_dists()

# Run the search. n_iter=10 keeps it quick; raise it (e.g. to 25) for a more
# thorough search.
summary_df, best_estimators = run_hyperparameter_search(
    models_to_tune,
    param_dists,
    X,
    y,
    n_iter=10,
)

# Header line followed by the summary table, one per line.
print("\n--- Hyperparameter Search Summary ---", summary_df, sep="\n")

# ## 5. Create Submission with the Winning Model
# The first row of the search summary is taken as the winner.
# NOTE(review): this presumes summary_df is ordered best-first — confirm in
# run_hyperparameter_search.
winner_name = summary_df.index[0]
winner_model = best_estimators[winner_name]

print(f"\n--- Winning Model: {winner_name} ---")
print(f"Best Parameters Found: {summary_df.loc[winner_name, 'best_params']}")

# Predict on a copy of the processed test features, then map the encoded
# predictions back to the original label values with the fitted encoder.
X_test = test_processed.copy()
test_pred_encoded = winner_model.predict(X_test)
test_pred_labels = le.inverse_transform(test_pred_encoded)

# Pair each test "id" with its predicted label and write the result to a CSV
# named after the winning model (spaces in the name become underscores).
submission_df = pd.DataFrame(
    {"id": test_values["id"], "status_group": test_pred_labels}
)
submission_filename = f"submission_{winner_name.replace(' ', '_')}.csv"
submission_df.to_csv(submission_filename, index=False)
print(f"\nSubmission file saved as '{submission_filename}'")

# Closing banner plus a preview of the first submission rows.
print(
    "\n--- Project Finished ---",
    "Final submission head:",
    submission_df.head(),
    sep="\n",
)