In [1]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn.preprocessing import OrdinalEncoder

import xgboost as xgb
import optuna

import matplotlib.pyplot as plt
import seaborn as sns

import mlflow
import mlflow.sklearn

pd.set_option("display.max_columns", None)
sns.set_style("whitegrid")

tracking_uri = "../logs/mlruns"
os.makedirs(os.path.join(tracking_uri, ".trash"), exist_ok=True)

mlflow.set_tracking_uri(tracking_uri)
mlflow.set_experiment("house_price_prediction")


  from .autonotebook import tqdm as notebook_tqdm


<Experiment: artifact_location=('file:///c:/Users/LaurynasBaltrusaitis/OneDrive - Adaptfy '
 'BV/Desktop/Education/git_personal_repos/house_price_prediction_project/notebooks/../logs/mlruns/436920743824240539'), creation_time=1758608845292, experiment_id='436920743824240539', last_update_time=1758608845292, lifecycle_stage='active', name='house_price_prediction', tags={}>

In [2]:
import sys
from pathlib import Path
import yaml
import glob

# Set project root (house_price_prediction)
ROOT = Path.cwd().parents[0]  # notebooks/notebook_x -> ROOT = house_price_prediction

# Adjust the path to your project root folder
project_root = os.path.abspath(
    os.path.join("..")
)  # from notebooks/ up one level

if project_root not in sys.path:
    sys.path.insert(0, project_root)

from src.features.preprocessing_pipeline import PreprocessingPipeline

PREPROCESS_CONFIG_PATH = ROOT / "config" / "preprocessing_config.yaml"
MODEL_CONFIG_PATH = ROOT / "config" / "model_config.yaml"

RAW_JSON_PATTERN = ROOT / "data" / "parsed_json" / "*.json"  
CACHE_DIR = ROOT / "data" / "cache"                           

with open(PREPROCESS_CONFIG_PATH) as f:
    preprocessing_cfg = yaml.safe_load(f)

with open(MODEL_CONFIG_PATH) as f:
    model_cfg = yaml.safe_load(f)

config_paths = {
    "preprocessing": preprocessing_cfg.get("preprocessing", {}),
    "model": model_cfg.get("model", {}),
}

print("JSON files found:", glob.glob(str(RAW_JSON_PATTERN)))
print("Cache directory:", CACHE_DIR.resolve())

pipeline = PreprocessingPipeline(
    config_paths=config_paths,
    raw_json_pattern=str(RAW_JSON_PATTERN),
    load_cache=False,  # do not load existing cache
    save_cache=True,   # save newly computed steps
    cache_dir=str(CACHE_DIR.resolve()),
    model_config_path=MODEL_CONFIG_PATH,
    model_name="xgboost_early_stopping_optuna_feature_eng_geoloc_exp",
)

result = pipeline.run(smart_cache=True)

print("X_train shape:", result.X_train.shape)
print("X_val shape:", result.X_val.shape if result.X_val is not None else None)
print("X_test shape:", result.X_test.shape)
display(result.df_clean.head())


JSON files found: ['c:\\Users\\LaurynasBaltrusaitis\\OneDrive - Adaptfy BV\\Desktop\\Education\\git_personal_repos\\house_price_prediction_project\\data\\parsed_json\\0005dd28_20250914_004731.json', 'c:\\Users\\LaurynasBaltrusaitis\\OneDrive - Adaptfy BV\\Desktop\\Education\\git_personal_repos\\house_price_prediction_project\\data\\parsed_json\\00210944_20250913_180341.json', 'c:\\Users\\LaurynasBaltrusaitis\\OneDrive - Adaptfy BV\\Desktop\\Education\\git_personal_repos\\house_price_prediction_project\\data\\parsed_json\\0021eb84_20250914_002324.json', 'c:\\Users\\LaurynasBaltrusaitis\\OneDrive - Adaptfy BV\\Desktop\\Education\\git_personal_repos\\house_price_prediction_project\\data\\parsed_json\\0038884d_20250914_114737.json', 'c:\\Users\\LaurynasBaltrusaitis\\OneDrive - Adaptfy BV\\Desktop\\Education\\git_personal_repos\\house_price_prediction_project\\data\\parsed_json\\004ae00d_20250914_124103.json', 'c:\\Users\\LaurynasBaltrusaitis\\OneDrive - Adaptfy BV\\Desktop\\Education\\git_

FileNotFoundError: No cache found for: df_raw

In [None]:
import mlflow

model_name = "XGB_Optuna_LogTransformed_feature_eng_best"
runs = mlflow.search_runs(filter_string=f"tags.mlflow.runName = '{model_name}'")
print(runs)


                             run_id       experiment_id    status  \
0  c850bdfd458847b6a7ef59754aca1c68  436920743824240539  FINISHED   

                                        artifact_uri  \
0  file:///c:/Users/LaurynasBaltrusaitis/OneDrive...   

                        start_time                         end_time  \
0 2025-10-10 16:29:50.710000+00:00 2025-10-10 16:31:32.953000+00:00   

   metrics.train_huber  metrics.val_r2  metrics.test_mae  metrics.train_mae  \
0         54340.716911        0.918169      75441.862718       54341.216911   

   metrics.train_mape  metrics.test_huber  metrics.train_rmse  \
0            7.574298        75441.362718       112722.151887   

   metrics.val_huber  metrics.val_mae  metrics.test_rmse  metrics.val_mape  \
0       75440.926672     75441.426672      205448.592463         10.337877   

   metrics.val_rmse  metrics.test_mape  metrics.test_r2  metrics.train_r2  \
0     152651.155385            9.05448         0.817923          0.953449   

   