## ARIMA ##

In [27]:
import pandas as pd
import numpy as np
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from datetime import timedelta
import warnings
warnings.filterwarnings('ignore')

def load_and_prepare_data(calendar_path, listings_path, n_listings=500):
    """Load the listings and calendar CSVs, sample listings, and merge them.

    Args:
        calendar_path: Path to the calendar CSV. It must provide the
            ``listing_id``, ``date`` and ``price`` columns.
        listings_path: Path to the listings CSV. It must provide the
            ``id`` and ``neighbourhood_cleansed`` columns.
        n_listings: Maximum number of listings to sample. When the listings
            file holds fewer rows than this, every listing is used instead
            of raising.

    Returns:
        A DataFrame with one row per calendar record of a sampled listing,
        carrying the parsed ``date``, a numeric ``price_numeric`` column and
        the listing's ``neighbourhood_cleansed``.
    """
    # Load listings first to get the sample
    listings_df = pd.read_csv(listings_path)
    # Series.sample raises when n exceeds the population, so cap the request.
    sample_size = min(n_listings, len(listings_df))
    sampled_listings = listings_df['id'].sample(n=sample_size, random_state=42)

    # Keep only the sampled listings and the columns needed downstream
    listings_cleaned = listings_df.loc[
        listings_df['id'].isin(sampled_listings),
        ['id', 'neighbourhood_cleansed']
    ].rename(columns={'id': 'listing_id'})

    # Load and filter calendar data; copy so the column assignments below
    # write to an independent frame rather than a slice of the raw file.
    calendar_df = pd.read_csv(calendar_path)
    calendar_df = calendar_df[
        calendar_df['listing_id'].isin(sampled_listings)
    ].copy()
    calendar_df['date'] = pd.to_datetime(calendar_df['date'])

    # Strip currency symbols and thousands separators; anything that still
    # fails to parse becomes NaN. The pattern is a raw string so that ``\$``
    # is not treated as an (invalid) string escape.
    calendar_df['price_numeric'] = pd.to_numeric(
        calendar_df['price'].replace(r'[\$,]', '', regex=True),
        errors='coerce'
    )

    # Merge calendar with listings data
    df = pd.merge(calendar_df, listings_cleaned, on='listing_id', how='left')

    print(f"Total listings in sample: {len(df['listing_id'].unique())}")
    print(f"Total records in sample: {len(df)}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")

    return df

def train_simple_arima(df, history_window=365, forecast_horizon=60):
    """
    Train a simple AR(1) model on the market average price.

    The per-date mean of ``price_numeric`` is the modelled series. The last
    ``forecast_horizon`` days (counted back from the latest date, boundary
    date included) are held out for evaluation; training uses the dates from
    ``history_window`` days before the latest date up to, but excluding, the
    start of the hold-out window.
    (Original author's note: the window is meant to match the XGBoost
    experiment for a fair comparison.)

    Args:
        df: DataFrame with ``date``, ``price_numeric`` and ``listing_id``
            columns.
        history_window: Days before the latest date at which training starts.
        forecast_horizon: Days before the latest date at which the hold-out
            window starts.

    Returns:
        ``(model, results_df)`` where ``model`` is the fitted SARIMAX results
        object and ``results_df`` has the columns ``date``, ``actual``,
        ``predicted``, ``sample_size``, ``abs_error`` and ``pct_error``.

    Raises:
        ValueError: If no dates fall inside the training window.
    """
    # Calculate market daily average price and sample size
    market_data = df.groupby('date').agg({
        'price_numeric': 'mean',
        'listing_id': 'count'
    }).rename(columns={'listing_id': 'sample_size'})

    # Calculate cutoff dates
    latest_date = market_data.index.max()
    train_start = latest_date - timedelta(days=history_window)
    validation_start = latest_date - timedelta(days=forecast_horizon)

    # Split data
    train_data = market_data[
        (market_data.index >= train_start) &
        (market_data.index < validation_start)
    ]

    test_data = market_data[market_data.index >= validation_start]

    if train_data.empty:
        raise ValueError(
            "No training data between "
            f"{train_start} and {validation_start}; "
            "check history_window and forecast_horizon."
        )

    # Fit simple ARIMA model
    try:
        model = SARIMAX(
            train_data['price_numeric'],
            order=(1, 0, 0),  # Simple AR(1) model given stable prices
            enforce_stationarity=False
        ).fit(disp=False)
    except Exception:
        # Catch Exception rather than using a bare except so that
        # KeyboardInterrupt/SystemExit still propagate. The fallback keeps
        # the same AR(1) order but enforces stationarity.
        print("Falling back to even simpler model")
        model = SARIMAX(
            train_data['price_numeric'],
            order=(1, 0, 0),
            enforce_stationarity=True
        ).fit(disp=False)

    # Generate predictions. Pair the k-th forecast step with the k-th
    # hold-out row by position: the forecast's own index may not carry the
    # hold-out dates (e.g. when no date frequency could be attached), and
    # label alignment would then fill the frame with NaN.
    predictions = np.asarray(model.forecast(len(test_data)))

    # Create results DataFrame
    results_df = pd.DataFrame({
        'date': test_data.index,
        'actual': test_data['price_numeric'],
        'predicted': predictions,
        'sample_size': test_data['sample_size']
    })

    # Calculate errors
    residual = results_df['actual'] - results_df['predicted']
    results_df['abs_error'] = np.abs(residual)
    results_df['pct_error'] = np.abs(residual / results_df['actual']) * 100

    return model, results_df

def evaluate_predictions(y_true, y_pred, dates):
    """Score predictions and summarise the errors per calendar month.

    Args:
        y_true: Observed values.
        y_pred: Predicted values, same length as ``y_true``.
        dates: Date of each observation; used as the resampling index.

    Returns:
        ``(metrics, results_df, monthly_analysis)``: a dict with ``RMSE``,
        ``MAE`` and ``R2``; a per-observation frame with absolute and
        percentage errors; and the mean/std of each column per month-end
        (``'ME'``) bucket.
    """
    # Headline scores
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    metrics = {'RMSE': rmse, 'MAE': mae, 'R2': r2}

    # Per-observation errors
    residual = y_true - y_pred
    results_df = pd.DataFrame({
        'date': dates,
        'actual': y_true,
        'predicted': y_pred,
        'abs_error': np.abs(residual),
        'pct_error': np.abs(residual / y_true) * 100
    })

    # Mean and spread of every column within each month
    summary_spec = {
        column: ['mean', 'std']
        for column in ('actual', 'predicted', 'abs_error', 'pct_error')
    }
    by_date = results_df.set_index('date')
    monthly_analysis = by_date.resample('ME').agg(summary_spec)

    return metrics, results_df, monthly_analysis

def analyze_price_stability(df):
    """Summarise how much each listing's price varies.

    Args:
        df: DataFrame with ``listing_id`` and ``price_numeric`` columns.

    Returns:
        A dict with the number of listings that have exactly one distinct
        price, the mean and maximum count of distinct prices per listing,
        the mean per-listing price standard deviation, and the spread
        between the highest and lowest price seen across all listings.
    """
    prices_by_listing = df.groupby('listing_id')['price_numeric']
    per_listing = prices_by_listing.agg(
        unique_prices='nunique',
        mean_price='mean',
        std_price='std',
        min_price='min',
        max_price='max',
    )

    distinct_counts = per_listing['unique_prices']
    highest_price = per_listing['max_price'].max()
    lowest_price = per_listing['min_price'].min()

    summary = {}
    summary['constant_price_listings'] = (distinct_counts == 1).sum()
    summary['avg_unique_prices'] = distinct_counts.mean()
    summary['max_unique_prices'] = distinct_counts.max()
    summary['avg_price_std'] = per_listing['std_price'].mean()
    summary['price_range'] = highest_price - lowest_price
    return summary

def main(calendar_path=None, listings_path=None):
    """Run the ARIMA baseline end to end and print its evaluation report.

    Loads and samples the data, prints price-stability statistics, fits the
    AR(1) model on the market average price, and prints accuracy metrics,
    monthly error summaries, error percentiles and daily sample sizes.

    Args:
        calendar_path: Path to the merged calendar CSV. Defaults to the
            original hard-coded location when omitted.
        listings_path: Path to the listings CSV. Defaults to the original
            hard-coded location when omitted.
    """
    if calendar_path is None:
        calendar_path = r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Merged_Data\paris_merged_calendar.csv"
    if listings_path is None:
        listings_path = r"C:\Users\matth\OneDrive\Documents\KU Leuven\Thesis\Data_Mor\paris\2024-09-06\listings.csv"

    print("Loading and preparing data...")
    df = load_and_prepare_data(calendar_path, listings_path, n_listings=500)

    print("\nAnalyzing price stability...")
    stability_metrics = analyze_price_stability(df)
    print("\nPrice Stability Analysis:")
    for metric, value in stability_metrics.items():
        print(f"{metric}: {value:.2f}")

    print("\nTraining ARIMA model...")
    model, forecast_df = train_simple_arima(
        df,
        history_window=365,
        forecast_horizon=60
    )

    # Evaluate predictions
    metrics, results_df, monthly_analysis = evaluate_predictions(
        forecast_df['actual'],
        forecast_df['predicted'],
        forecast_df['date']
    )
    # evaluate_predictions rebuilds the frame without sample_size; carry it
    # over (row order is unchanged) so the sample-size report below can run.
    if 'sample_size' in forecast_df.columns:
        results_df['sample_size'] = forecast_df['sample_size'].to_numpy()

    # Print results
    print("\nModel Performance Metrics:")
    for metric, value in metrics.items():
        print(f"{metric}: {value:.4f}")

    print("\nMonthly Error Analysis:")
    print(monthly_analysis)

    # Analysis by prediction month: the first 30 days of the hold-out window
    # versus everything after them.
    first_month_mask = results_df['date'] < (results_df['date'].min() + timedelta(days=30))
    second_month_mask = ~first_month_mask

    print("\nPrediction Error by Month:")
    print("First Month:")
    print(f"Mean Absolute Error: {results_df[first_month_mask]['abs_error'].mean():.2f}")
    print(f"Mean Percentage Error: {results_df[first_month_mask]['pct_error'].mean():.2f}%")
    print("\nSecond Month:")
    print(f"Mean Absolute Error: {results_df[second_month_mask]['abs_error'].mean():.2f}")
    print(f"Mean Percentage Error: {results_df[second_month_mask]['pct_error'].mean():.2f}%")

    # Additional analysis
    print("\nPrediction Statistics:")
    print(f"Mean predicted price: ${results_df['predicted'].mean():.2f}")
    print(f"Actual price range: ${results_df['actual'].min():.2f} - ${results_df['actual'].max():.2f}")
    print(f"Predicted price range: ${results_df['predicted'].min():.2f} - ${results_df['predicted'].max():.2f}")

    # Error distribution
    percentiles = [25, 50, 75, 90]
    print("\nError Distribution:")
    for p in percentiles:
        print(f"{p}th percentile of absolute error: ${np.percentile(results_df['abs_error'], p):.2f}")

    # Sample size analysis
    if 'sample_size' in results_df.columns:
        print("\nSample Size Analysis:")
        print(f"Average daily listings: {results_df['sample_size'].mean():.1f}")
        print(f"Min daily listings: {results_df['sample_size'].min():.0f}")
        print(f"Max daily listings: {results_df['sample_size'].max():.0f}")

# Run the full pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main()

Loading and preparing data...
Total listings in sample: 500
Total records in sample: 332673
Date range: 2023-06-07 00:00:00 to 2025-09-12 00:00:00

Analyzing price stability...

Price Stability Analysis:
constant_price_listings: 262.00
avg_unique_prices: 4.80
max_unique_prices: 105.00
avg_price_std: 28.86
price_range: 9989.00

Training ARIMA model...

Model Performance Metrics:
RMSE: 21.3673
MAE: 7.5328
R2: -0.0266

Monthly Error Analysis:
                actual              predicted            abs_error             \
                  mean        std        mean       std       mean        std   
date                                                                            
2025-07-31  279.066000   0.000000  278.891978  0.097770   0.174022   0.097770   
2025-08-31  279.066000   0.000000  278.443677  0.166246   0.622323   0.166246   
2025-09-30  291.909911  48.183657  278.050794  0.065833  36.423028  32.914569   

            pct_error             
                 mean        std  