**HORIZON**

In [2]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from datetime import datetime, timedelta
from tsfresh import extract_features
from tsfresh.utilities.dataframe_functions import roll_time_series
from sklearn.neighbors import BallTree
from math import radians, sin, cos, sqrt, atan2

def load_and_prepare_data(calendar_path, listings_path, n_listings=500):
    """Load and prepare the data with basic cleaning and sampling."""
    # Load listings first to get the sample
    listings_df = pd.read_csv(listings_path)
    sampled_listings = listings_df['id'].sample(n=n_listings, random_state=42)
    
    # Clean and prepare listings data
    listings_cleaned = listings_df[listings_df['id'].isin(sampled_listings)][
        ['id', 'neighbourhood_cleansed', 'latitude', 'longitude']
    ]
    listings_cleaned = listings_cleaned.rename(columns={'id': 'listing_id'})
    
    # Load and filter calendar data
    calendar_df = pd.read_csv(calendar_path)
    calendar_df = calendar_df[calendar_df['listing_id'].isin(sampled_listings)]
    calendar_df['date'] = pd.to_datetime(calendar_df['date'])
    
    # Merge calendar with listings data
    df = pd.merge(calendar_df, listings_cleaned, on='listing_id', how='left')
    
    print(f"Total listings in sample: {len(df['listing_id'].unique())}")
    print(f"Total records in sample: {len(df)}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    
    return df

def extract_tsfresh_features(df):
    """Extract time series features using tsfresh."""
    # Create a proper time series dataframe for tsfresh
    df_tsfresh = df[['listing_id', 'date', 'price']].copy()
    #df_tsfresh['price'] = clean_price(df_tsfresh['price'])
    df_tsfresh = df_tsfresh.sort_values(['listing_id', 'date'])
   
    # Create rolling windows
    df_rolled = roll_time_series(
        df_tsfresh,
        column_id='listing_id',
        column_sort='date',
        max_timeshift=7,
        rolling_direction=1
    )
   
    # Define minimal but meaningful feature set
    fc_parameters = {
        "mean": None,
        "median": None,
        "standard_deviation": None,
        "variance": None,
        "maximum": None,
        "minimum": None,
        "mean_change": None,
        "mean_abs_change": None
    }
   
    try:
        features_filtered = extract_features(
            df_rolled,
            column_id='listing_id',
            column_sort='date',
            column_value='price',
            default_fc_parameters=fc_parameters,
            n_jobs=0
        )
       
        # Validate features
        features_filtered = features_filtered.replace([np.inf, -np.inf], np.nan)
        features_filtered = features_filtered.dropna(axis=1, how='all')
        non_constant_cols = features_filtered.columns[features_filtered.nunique() > 1]
        features_filtered = features_filtered[non_constant_cols]
       
        print(f"\nExtracted {len(non_constant_cols)} valid tsfresh features")
        return features_filtered
       
    except Exception as e:
        print(f"Error in tsfresh feature extraction: {str(e)}")
        return pd.DataFrame()

def create_history_features(df, history_window=365, forecast_horizon=60):
    """Create lagged features based on full year history and 2-month forecast horizon."""
    df['price_numeric'] = pd.to_numeric(df['price'].replace(r'[\$,]', '', regex=True), errors='coerce')
    df = df.sort_values(['listing_id', 'date'])
    
    processed_data = []
    
    for listing_id in df['listing_id'].unique():
        listing_data = df[df['listing_id'] == listing_id].copy()
        
        # Create lagged features at different intervals throughout the year
        # Monthly lags for first 6 months
        for i in range(1, 181, 30):
            listing_data[f'price_lag_{i}d'] = listing_data['price_numeric'].shift(i)
        
        # Quarterly lags for the rest of the year
        for i in range(181, 366, 90):
            listing_data[f'price_lag_{i}d'] = listing_data['price_numeric'].shift(i)
        
        # Create rolling statistics with different windows
        windows = [30, 90, 180, 365]  # Monthly, quarterly, half-yearly, yearly
        for window in windows:
            listing_data[f'rolling_mean_{window}d'] = listing_data['price_numeric'].rolling(window=window).mean()
            listing_data[f'rolling_std_{window}d'] = listing_data['price_numeric'].rolling(window=window).std()
            listing_data[f'rolling_max_{window}d'] = listing_data['price_numeric'].rolling(window=window).max()
            listing_data[f'rolling_min_{window}d'] = listing_data['price_numeric'].rolling(window=window).min()
        
        # Add year-over-year price change
        listing_data['yoy_price_change'] = listing_data['price_numeric'] / listing_data['price_numeric'].shift(365) - 1
        
        # Create future target (price after forecast_horizon days)
        listing_data['target_price'] = listing_data['price_numeric'].shift(-forecast_horizon)
        
        processed_data.append(listing_data)
    
    processed_df = pd.concat(processed_data)
    processed_df = processed_df.dropna()
    
    return processed_df


def prepare_features_with_history(df, tsfresh_features, history_window=365, forecast_horizon=60):
    """Enhanced feature preparation including spatial features."""
    # Create temporal features
    df['day_of_week'] = df['date'].dt.dayofweek
    df['month'] = df['date'].dt.month
    df['is_weekend'] = df['day_of_week'].isin([5, 6]).astype(int)
    df['season_sin'] = np.sin(2 * np.pi * df['date'].dt.dayofyear/365.25)
    df['season_cos'] = np.cos(2 * np.pi * df['date'].dt.dayofyear/365.25)
    
    # Add history-based features
    df_with_history = create_history_features(df, history_window, forecast_horizon)
    
    # Create spatial features
    spatial_features = create_spatial_features(df_with_history)
    df_with_history = pd.concat([df_with_history, spatial_features], axis=1)
    
    # Encode categorical variables
    le = LabelEncoder()
    df_with_history['neighbourhood_cleansed_encoded'] = le.fit_transform(
        df_with_history['neighbourhood_cleansed'].fillna('Unknown')
    )
    
    # Define feature sets
    basic_features = [
        'day_of_week', 'month', 'is_weekend', 'season_sin', 'season_cos',
        'neighbourhood_cleansed_encoded', 'latitude', 'longitude'
    ]
    
    lag_features = [col for col in df_with_history.columns if 'price_lag_' in col]
    rolling_features = [col for col in df_with_history.columns if 'rolling_' in col]
    spatial_feature_cols = spatial_features.columns.tolist()
    
    all_features = (
        basic_features + 
        lag_features + 
        rolling_features + 
        spatial_feature_cols + 
        ['yoy_price_change']
    )
    
    X = df_with_history[all_features]
    
    return X, df_with_history['target_price'], df_with_history['date']

def calculate_distance(lat1, lon1, lat2, lon2):
    """
    Calculate the distance between two points using the Haversine formula.
    Returns distance in kilometers.
    """
    R = 6371  # Earth's radius in kilometers

    lat1, lon1, lat2, lon2 = map(radians, [lat1, lon1, lat2, lon2])
    
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * atan2(sqrt(a), sqrt(1-a))
    distance = R * c
    
    return distance

def create_spatial_features(df, k_neighbors=5):
    """
    Create spatial features combining:
    - Distance to city center
    - North-South position
    - K-nearest neighbors price statistics
    
    Args:
        df: DataFrame containing latitude, longitude and price
        k_neighbors: Number of nearest neighbors for price statistics
    """
    # City centers (approximate coordinates)
    city_centers = {
        'amsterdam': (52.3676, 4.9041),
        'paris': (48.8566, 2.3522),
        'london': (51.5074, -0.1278),
        'berlin': (52.5200, 13.4050),
        'rome': (41.9028, 12.4964),
        'barcelona': (41.3851, 2.1734)
    }
    
    # Determine which city based on coordinates
    sample_lat = df['latitude'].median()
    sample_lon = df['longitude'].median()
    
    # Find closest city center
    closest_city = min(city_centers.items(), 
                      key=lambda x: calculate_distance(sample_lat, sample_lon, x[1][0], x[1][1]))
    
    city_center_lat, city_center_lon = city_centers[closest_city[0]]
    print(f"Using {closest_city[0].title()} as reference city")
    
    # Create spatial features DataFrame
    spatial_features = pd.DataFrame(index=df.index)
    
    # 1. Basic spatial features
    # Distance to city center
    spatial_features['distance_to_center'] = df.apply(
        lambda row: calculate_distance(
            row['latitude'], 
            row['longitude'], 
            city_center_lat, 
            city_center_lon
        ),
        axis=1
    )
    
    # North-South position relative to city center
    spatial_features['north_south'] = df['latitude'] - city_center_lat
    
    # Normalized latitude (0 = southernmost, 1 = northernmost)
    min_lat = df['latitude'].min()
    max_lat = df['latitude'].max()
    spatial_features['normalized_latitude'] = (df['latitude'] - min_lat) / (max_lat - min_lat)
    
    # 2. K-Nearest Neighbors Price Statistics
    # Convert price to numeric if needed
    if not pd.api.types.is_numeric_dtype(df['price']):
        price_numeric = pd.to_numeric(df['price'].str.replace('$', '').str.replace(',', ''), errors='coerce')
    else:
        price_numeric = df['price']
    
    # Convert coordinates to radians for BallTree
    coords = np.radians(df[['latitude', 'longitude']].values)
    
    # Create BallTree for efficient nearest neighbor search
    tree = BallTree(coords, metric='haversine')
    
    # Get indices and distances for k nearest neighbors
    distances, indices = tree.query(coords, k=k_neighbors + 1)  # +1 because first point is self
    distances = distances * 6371.0  # Convert to kilometers (Earth radius)
    
    # Calculate neighbor price statistics
    neighbor_prices = np.array([price_numeric.iloc[idx[1:]].values for idx in indices])
    
    spatial_features['knn_price_mean'] = np.nanmean(neighbor_prices, axis=1)
    spatial_features['knn_price_std'] = np.nanstd(neighbor_prices, axis=1)
    spatial_features['price_diff_from_neighbors'] = price_numeric.values - spatial_features['knn_price_mean']
    
    print("Created spatial features:")
    print("- distance_to_center: Distance in km from city center")
    print("- north_south: Positive values are north of city center")
    print("- normalized_latitude: 0-1 scale from south to north")
    print("- knn_price_mean: Average price of k nearest neighbors")
    print("- knn_price_std: Standard deviation of k nearest neighbor prices")
    print("- price_diff_from_neighbors: Price difference from neighbor average")
    
    return spatial_features

def evaluate_predictions(y_test, y_pred, test_dates):
    """
    Simple evaluation of model predictions.
    
    Args:
        y_test: Array of actual values
        y_pred: Array of predicted values
        test_dates: Array of dates corresponding to predictions
    """
    # Calculate basic metrics
    metrics = {
        'RMSE': np.sqrt(mean_squared_error(y_test, y_pred)),
        'MAE': mean_absolute_error(y_test, y_pred),
        'R2': r2_score(y_test, y_pred)
    }
    
    # Create results DataFrame
    results_df = pd.DataFrame({
        'date': test_dates,
        'actual': y_test,
        'predicted': y_pred,
        'abs_error': np.abs(y_test - y_pred),
        'pct_error': np.abs((y_test - y_pred) / y_test) * 100
    })
    
    # Simple monthly analysis
    monthly_analysis = results_df.set_index('date').resample('M').agg({
        'abs_error': ['mean', 'std'],
        'pct_error': ['mean', 'std']
    })
    
    return metrics, results_df, monthly_analysis

def train_model_with_history(X, y, dates):
    """Train XGBoost model using 1 year of data and predict 2 months ahead."""
    # Calculate cutoff dates
    latest_date = dates.max()
    training_start = latest_date - timedelta(days=365)  # 1 year
    validation_start = latest_date - timedelta(days=60)  # Last 2 months for validation
    
    # Split data based on dates
    train_mask = (dates >= training_start) & (dates < validation_start)
    test_mask = dates >= validation_start
    
    # Split features and target
    X_train = X[train_mask]
    X_test = X[test_mask]
    y_train = y[train_mask]
    y_test = y[test_mask]
    
    print("\nData Split Information:")
    print(f"Training period: {dates[train_mask].min()} to {dates[train_mask].max()}")
    print(f"Testing period: {dates[test_mask].min()} to {dates[test_mask].max()}")
    print(f"Training samples: {len(X_train)}")
    print(f"Testing samples: {len(X_test)}")
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    
    # Initialize XGBoost with parameters tuned for longer forecast horizon
    xgb_model = XGBRegressor(
        n_estimators=300,
        learning_rate=0.01,
        max_depth=6,
        min_child_weight=2,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        early_stopping=20
    )
    
    # Train model
    xgb_model.fit(
        X_train_scaled, 
        y_train,
        eval_set=[(X_test_scaled, y_test)],
        verbose=False
    )
    
    return xgb_model, X_train_scaled, X_test_scaled, y_train, y_test, dates[test_mask]

def main():
    # Load and prepare data
    calendar_path = r'C:\Users\mvk\Documents\DATA_school\thesis\data_new\paris\paris_merged_calendar.csv'
    listings_path = r'C:\Users\mvk\Documents\DATA_school\thesis\data_new\paris\2024-06-10\listings.csv'
    
    print("Loading and preparing data...")
    df = load_and_prepare_data(calendar_path, listings_path, n_listings=500)
    
    # Extract tsfresh features
    print("Extracting tsfresh features...")
    tsfresh_features = extract_tsfresh_features(df)

    
    # Prepare features with full year history and 2-month forecast
    print("Preparing features with history...")
    X, y, dates = prepare_features_with_history(
        df, 
        tsfresh_features,
        history_window=365,  # Full year of history
        forecast_horizon=60  # 2 months ahead
    )
    
    # Train model and make predictions
    print("\nTraining model...")
    model, X_train_scaled, X_test_scaled, y_train, y_test, test_dates = train_model_with_history(X, y, dates)
    
    print("\nMaking predictions...")
    y_pred = model.predict(X_test_scaled)
    
    # Evaluate predictions
    metrics, results_df, monthly_analysis = evaluate_predictions(y_test, y_pred, test_dates)
    
    # Print results
    print("\nModel Performance Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")
    
    print("\nMonthly Error Analysis:")
    print(monthly_analysis)
    
    # Feature importance analysis
    feature_importance = pd.DataFrame({
        'feature': X.columns,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print("\nTop 10 Most Important Features:")
    print(feature_importance.head(10))
    
    # Analysis by prediction month
    first_month_mask = results_df['date'] < (results_df['date'].min() + timedelta(days=30))
    second_month_mask = ~first_month_mask
    
    print("\nPrediction Error by Month:")
    print("First Month:")
    print(f"Mean Absolute Error: {results_df[first_month_mask]['abs_error'].mean():.2f}")
    print(f"Mean Percentage Error: {results_df[first_month_mask]['pct_error'].mean():.2f}%")
    print("\nSecond Month:")
    print(f"Mean Absolute Error: {results_df[second_month_mask]['abs_error'].mean():.2f}")
    print(f"Mean Percentage Error: {results_df[second_month_mask]['pct_error'].mean():.2f}%")

if __name__ == "__main__":
    main()

Loading and preparing data...
Total listings in sample: 500
Total records in sample: 339131
Date range: 2023-06-07 00:00:00 to 2025-09-12 00:00:00
Extracting tsfresh features...


Rolling: 100%|██████████| 40/40 [00:22<00:00,  1.80it/s]
Feature Extraction: 100%|██████████| 500/500 [00:00<00:00, 1048.87it/s]



Extracted 8 valid tsfresh features
Preparing features with history...
Using Paris as reference city
Created spatial features:
- distance_to_center: Distance in km from city center
- north_south: Positive values are north of city center
- normalized_latitude: 0-1 scale from south to north
- knn_price_mean: Average price of k nearest neighbors
- knn_price_std: Standard deviation of k nearest neighbor prices
- price_diff_from_neighbors: Price difference from neighbor average

Training model...

Data Split Information:
Training period: 2024-07-14 00:00:00 to 2025-05-14 00:00:00
Testing period: 2025-05-15 00:00:00 to 2025-07-14 00:00:00
Training samples: 94603
Testing samples: 23949


Parameters: { "early_stopping" } are not used.




Making predictions...

Model Performance Metrics:
RMSE: 23.8129
MAE: 9.2217
R2: 0.9976

Monthly Error Analysis:
           abs_error            pct_error           
                mean        std      mean        std
date                                                
2025-05-31  8.899027  21.848284  6.716404  11.587952
2025-06-30  9.187735  21.582365  6.316491  10.905755
2025-07-31  9.722606  23.023905  5.422482   9.067460

Top 10 Most Important Features:
             feature  importance
28  rolling_min_180d    0.269329
10     price_lag_61d    0.234078
8       price_lag_1d    0.196678
9      price_lag_31d    0.189669
20   rolling_min_30d    0.036352
21  rolling_mean_90d    0.023633
7          longitude    0.011679
19   rolling_max_30d    0.008715
14    price_lag_181d    0.005494
24   rolling_min_90d    0.004464

Prediction Error by Month:
First Month:
Mean Absolute Error: 8.94
Mean Percentage Error: 6.67%

Second Month:
Mean Absolute Error: 9.47
Mean Percentage Error: 5.86%


  monthly_analysis = results_df.set_index('date').resample('M').agg({
