In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import xgboost as xgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

# Notebook-wide display defaults: show every DataFrame column and use
# seaborn's "whitegrid" style for all figures.
pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

# MLflow runs are tracked in a directory relative to the notebook's working dir.
tracking_uri = "../logs/mlruns"
# Create the tracking directory together with its ".trash" subfolder up front.
trash_dir = os.path.join(tracking_uri, ".trash")
os.makedirs(trash_dir, exist_ok=True)

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("house_price_prediction")


In [None]:
import sys
import os
from pathlib import Path
import yaml


# Make the project's `src` package importable. The project root is taken to be
# the parent of the current working directory (i.e. one level above notebooks/).
project_root = os.path.abspath(os.pardir)

already_importable = project_root in sys.path
if not already_importable:
    # Prepend so that project modules are found first during import resolution.
    sys.path.insert(0, project_root)

from src.data_loading.data_loading.data_loader import load_data_from_json
from src.data_loading.preprocessing.preprocessing import preprocess_df
from src.data_loading.preprocessing.imputation import impute_missing_values


# ROOT is the directory that contains the "house_price_prediction_project"
# folder: three levels above this file when run as a script, or two levels
# above the working directory when run inside a notebook kernel (where
# __file__ is not defined).
if "__file__" in globals():
    ROOT = Path(__file__).resolve().parents[2]
else:
    ROOT = Path.cwd().parents[1]

CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "preprocessing_config.yaml"
)

# Parse the preprocessing configuration once; later cells read CONFIG["preprocessing"].
with open(CONFIG_PATH) as config_file:
    CONFIG = yaml.safe_load(config_file)

# Load every parsed JSON listing and run the configured preprocessing and
# imputation steps.
df_raw = load_data_from_json("../data/parsed_json/*.json")

preprocessing_cfg = CONFIG["preprocessing"]
df_clean = preprocess_df(
    df_raw,
    drop_raw=preprocessing_cfg["drop_raw"],
    numeric_cols=preprocessing_cfg["numeric_cols"],
)
df_clean = impute_missing_values(df_clean, preprocessing_cfg["imputation"])

# Keep only rows that have a target value (price_num) and drop the
# "living_area" column. The drop is chained (no inplace=True) so it never
# mutates the boolean-filtered frame in place, which would trigger pandas'
# chained-assignment warning and make the cell harder to re-run safely.
df_clean = (
    df_clean[df_clean["price_num"].notna()]
    .drop(columns=["living_area"])
)

####
# Outlier removal
# import pandas as pd
# from sklearn.ensemble import IsolationForest

# # Copy to avoid mutating original df
# df_outlier_test = df_clean.copy()

# # 1️⃣ Select only numeric features for outlier detection
# numeric_cols = df_outlier_test.select_dtypes(include=['number']).columns
# X_numeric = df_outlier_test[numeric_cols]

# # 2️⃣ Fit IsolationForest on numeric subset
# iso = IsolationForest(
#     contamination=0.01,  # ~1% flagged as outliers (tune this!)
#     random_state=42
# )
# outlier_labels = iso.fit_predict(X_numeric)

# # 3️⃣ Filter outliers (label = -1 are outliers)
# df_no_outliers = df_outlier_test[outlier_labels == 1].reset_index(drop=True)

# print(f"Original shape: {df_outlier_test.shape}")
# print(f"After removing outliers: {df_no_outliers.shape}")

# # Rows removed (outliers)
# mask_outliers = outlier_labels == -1
# df_removed = df_outlier_test[mask_outliers].reset_index(drop=True)

# df_clean = df_no_outliers.copy()
#####

import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest

# --- Outlier handling: flag rows with IsolationForest, then cap upper tails ---
# Tunable knobs are named here so the comments and the code cannot drift apart.
OUTLIER_CONTAMINATION = 0.02  # share of rows IsolationForest labels as outliers
WINSOR_UPPER_PERCENTILE = 95  # cap = this percentile of each column among inliers

# Work on a copy so df_clean stays untouched until the reassignment at the end.
df_outlier_test = df_clean.copy()

# Only numeric columns take part in detection and capping.
# NOTE(review): if the target "price_num" is numeric it is part of this
# selection, so it is capped as well, and the capping happens before any
# train/test split -- confirm that both are intended.
numeric_cols = df_outlier_test.select_dtypes(include=["number"]).columns
X_numeric = df_outlier_test[numeric_cols]

# 1) Detect outliers; rows labelled -1 are treated as outliers below.
iso = IsolationForest(
    contamination=OUTLIER_CONTAMINATION,
    random_state=42,
)
outlier_labels = iso.fit_predict(X_numeric)

# 2) Split into outliers and inliers. The inliers are only used to derive the
#    per-column caps; no row is removed from the data.
mask_outliers = outlier_labels == -1
df_outliers = df_outlier_test[mask_outliers].reset_index(drop=True)
df_inliers = df_outlier_test[~mask_outliers].reset_index(drop=True)

print(f"Original shape: {df_outlier_test.shape}")
print(f"Outliers detected: {df_outliers.shape[0]}")

# 3) Winsorize the upper tail only: values above the inlier percentile are
#    replaced by that percentile. Lower tails are left as they are.
winsor_limits = {}
df_winsorized = df_outlier_test.copy()

for col in numeric_cols:
    upper_limit = np.percentile(df_inliers[col], WINSOR_UPPER_PERCENTILE)
    winsor_limits[col] = upper_limit
    # Rows whose value exceeds the cap for this column.
    capped_mask = df_winsorized[col] > upper_limit
    if capped_mask.any():
        # Series.mask returns a new, correctly-typed Series, so a fractional
        # cap on an integer column upcasts cleanly instead of relying on the
        # deprecated silent upcast of `.loc[mask, col] = float_value`.
        df_winsorized[col] = df_winsorized[col].mask(capped_mask, upper_limit)
        print(f"{capped_mask.sum()} rows in '{col}' capped at {upper_limit:.2f}")

print("Winsorizing applied: only upper extremes capped.")

# df_winsorized keeps every row of the input; only values above each
# column's cap were lowered. It becomes the frame used for modelling.
df_clean = df_winsorized.copy()

# df_clean = df_clean[:100]
df = df_clean.copy()

In [None]:
# Displays the boolean mask left over from the final iteration of the
# winsorizing loop above: True where the last numeric column exceeded its cap.
capped_mask

In [None]:
from src.features.data_prep_for_modelling.data_preparation import prepare_data

# YAML file handed to prepare_data as its configuration.
FEATURES_CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "model_config.yaml"
)

# Build the splits for the "linear_regression" entry of the config, using the
# base (non-extended) feature set and no cross-validation.
lr_split = prepare_data(
    df,
    config_path=FEATURES_CONFIG_PATH,
    model_name="linear_regression",
    use_extended_features=False,
    cv=False,
)
(
    X_train_scaled,
    X_test_scaled,
    y_train,
    y_test,
    X_val,
    y_val,
    scaler,
    _,
) = lr_split

In [None]:
from src.model.evaluate import ModelEvaluator
from src.model.mlflow_logger import MLFlowLogger

# Shared helpers reused by every model cell below.
evaluator = ModelEvaluator()
logger = MLFlowLogger()

# Baseline: plain linear regression on the scaled splits, with no extra model
# or fit parameters and without the xgb.train code path.
lr_model = LinearRegression()
lr_outputs = evaluator.evaluate(
    model=lr_model,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    model_params={},
    fit_params={},
    use_xgb_train=False,
)
trained_lr, y_train_pred, y_val_pred, y_test_pred, lr_results = lr_outputs

# Record the fitted model and its results in MLflow.
logger.log_model(trained_lr, "LinearRegression", lr_results)

In [None]:
from src.features.feature_engineering.encoding import encode_energy_label

# Build the splits for the "random_forest" entry of the config.
# The unpacking order matches every other prepare_data call in this notebook
# (..., X_val, y_val, scaler, _). The previous order (scaler, X_val, y_val)
# bound the three names to the wrong objects.
X_train, X_test, y_train, y_test, X_val, y_val, scaler, _ = prepare_data(
    df_clean,
    config_path=FEATURES_CONFIG_PATH,
    model_name="random_forest",
    use_extended_features=False,
    cv=False,
)

In [None]:
# Random forest with default hyperparameters. The seed is fixed (same value as
# the IsolationForest above) so the metrics logged to MLflow are reproducible
# across kernel restarts; an unseeded forest gives different results each run.
rf_model = RandomForestRegressor(random_state=42)

trained_rf, y_train_pred, y_val_pred, y_test_pred, rf_results = evaluator.evaluate(
    model=rf_model,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    model_params={},
    fit_params={},
    use_xgb_train=False,
)

# Record the fitted model and its results in MLflow.
logger.log_model(trained_rf, "RandomForestRegression", rf_results)

In [None]:
from src.model.utils import load_model_config_and_search_space

# Build the splits for the "xgboost" entry of the config.
xgb_split = prepare_data(
    df_clean,
    config_path=FEATURES_CONFIG_PATH,
    model_name="xgboost",
    use_extended_features=False,
    cv=False,
)
X_train, X_test, y_train, y_test, X_val, y_val, scaler, _ = xgb_split

# Model hyperparameters and fit parameters come from the same YAML file.
MODEL_CONFIG_PATH = ROOT.joinpath(
    "house_price_prediction_project", "config", "model_config.yaml"
)
model_params, fit_params, _ = load_model_config_and_search_space(
    MODEL_CONFIG_PATH, model_name="xgboost"
)

# n_estimators is a constructor argument, so pull it out of a copy of the fit
# parameters (default 100) and pass the remainder on to evaluate().
fit_params_safe = fit_params.copy()
n_estimators = fit_params_safe.pop("n_estimators", 100)

xgb_model = xgb.XGBRegressor(n_estimators=n_estimators, **model_params)

xgb_outputs = evaluator.evaluate(
    xgb_model,
    X_train,
    y_train,
    X_test=X_test,
    y_test=y_test,
    fit_params=fit_params_safe,
    use_xgb_train=False,
    X_val=X_val,
    y_val=y_val,
)
trained_xgb, y_train_pred, y_val_pred, y_test_pred, xgb_results = xgb_outputs

# Record the fitted model and its results in MLflow.
logger.log_model(trained_xgb, "XGBoostRegression", xgb_results)

In [None]:
# df_removed

In [None]:
# Build the splits for the "xgboost_early_stopping" entry of the config.
es_split = prepare_data(
    df_clean,
    config_path=FEATURES_CONFIG_PATH,
    model_name="xgboost_early_stopping",
    use_extended_features=False,
    cv=False,
)
X_train, X_test, y_train, y_test, X_val, y_val, scaler, _ = es_split

xgb_model_params, xgb_fit_params, _ = load_model_config_and_search_space(
    MODEL_CONFIG_PATH, "xgboost_early_stopping"
)

# NOTE(review): this estimator is constructed but not handed to evaluate()
# below, which receives None as the model and takes the parameters directly
# with use_xgb_train=True. Kept because later cells may read `xgb_model`.
xgb_model = xgb.XGBRegressor(**xgb_model_params)

es_outputs = evaluator.evaluate(
    None,
    X_train,
    y_train,
    X_test,
    y_test,
    X_val=X_val,
    y_val=y_val,
    fit_params=xgb_fit_params,
    model_params=xgb_model_params,
    use_xgb_train=True,
)
trained_xgb, y_train_pred, y_val_pred, y_test_pred, xgb_results = es_outputs

# Record the fitted model and its results in MLflow.
logger.log_model(
    trained_xgb, "xgb_with_early_stopping", xgb_results, use_xgb_train=True
)