In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_squared_error, mean_absolute_error
import warnings
warnings.filterwarnings('ignore')

# Set plotting style
sns.set_style("whitegrid")

print("="*80)
print("NEIGHBORHOOD-LEVEL SARIMA MODELING")
print("="*80)

# Load data
df = pd.read_csv('../data/NSI.csv')
print(f"\nOriginal dataset shape: {df.shape}")

# Create datetime column
df['date'] = pd.to_datetime({'year': df['REPORT_YEAR'], 
                              'month': df['REPORT_MONTH'], 
                              'day': 1})

print(f"Date range: {df['REPORT_YEAR'].min()}-{df['REPORT_MONTH'].min()} to {df['REPORT_YEAR'].max()}-{df['REPORT_MONTH'].max()}")
print(f"Number of neighborhoods: {df['NEIGHBOURHOOD_158'].nunique()}")

# Get unique neighborhoods
neighborhoods = sorted(df['NEIGHBOURHOOD_158'].unique())
print(f"\nProcessing {len(neighborhoods)} neighborhoods...")

# Storage for results
all_results = []
neighborhood_summaries = []
failed_models = []

# SARIMA order (using the best performing simple model from city-wide analysis)
order = (0, 1, 1)
seasonal_order = (0, 1, 1, 12)

print(f"\nUsing SARIMA{order}×{seasonal_order} for all neighborhoods")
print("="*80)

NEIGHBORHOOD-LEVEL SARIMA MODELING

Original dataset shape: (21910, 10)
Date range: 2014-1 to 2025-12
Number of neighborhoods: 159

Processing 159 neighborhoods...

Using SARIMA(0, 1, 1)×(0, 1, 1, 12) for all neighborhoods


In [None]:
# Process each neighborhood
for idx, neighborhood_id in enumerate(neighborhoods):
    try:
        # Filter data for this neighborhood
        neighborhood_data = df[df['NEIGHBOURHOOD_158'] == neighborhood_id].copy()
        neighborhood_data = neighborhood_data.sort_values('date')
        neighborhood_data.set_index('date', inplace=True)
        
        # Get NSI time series
        nsi_series = neighborhood_data['NSI']
        
        # Check if we have enough data
        if len(nsi_series) < 50:  # Need at least 50 observations
            failed_models.append({
                'Neighborhood': neighborhood_id,
                'Reason': f'Insufficient data ({len(nsi_series)} observations)'
            })
            continue
        
        # Train-test split (80-20)
        train_size = int(len(nsi_series) * 0.8)
        train_data = nsi_series[:train_size]
        test_data = nsi_series[train_size:]
        
        # Skip if test set is too small
        if len(test_data) < 5:
            failed_models.append({
                'Neighborhood': neighborhood_id,
                'Reason': f'Test set too small ({len(test_data)} observations)'
            })
            continue
        
        # Fit SARIMA model
        model = SARIMAX(train_data, 
                       order=order, 
                       seasonal_order=seasonal_order,
                       enforce_stationarity=False,
                       enforce_invertibility=False)
        
        fitted_model = model.fit(disp=False, maxiter=200)
        
        # Make predictions
        forecast_steps = len(test_data)
        forecast = fitted_model.forecast(steps=forecast_steps)
        
        # Calculate metrics
        mse = mean_squared_error(test_data, forecast)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(test_data, forecast)
        mape = np.mean(np.abs((test_data - forecast) / test_data)) * 100
        bias = (test_data - forecast).mean()
        
        # Store summary for this neighborhood
        neighborhood_summaries.append({
            'Neighborhood': neighborhood_id,
            'Total_Observations': len(nsi_series),
            'Train_Size': len(train_data),
            'Test_Size': len(test_data),
            'Mean_NSI': nsi_series.mean(),
            'Std_NSI': nsi_series.std(),
            'Min_NSI': nsi_series.min(),
            'Max_NSI': nsi_series.max(),
            'RMSE': rmse,
            'MAE': mae,
            'MAPE': mape,
            'Bias': bias,
            'AIC': fitted_model.aic,
            'BIC': fitted_model.bic
        })
        
        # Store detailed predictions
        for date, actual, pred in zip(test_data.index, test_data.values, forecast.values):
            all_results.append({
                'Neighborhood': neighborhood_id,
                'Date': date,
                'Actual': actual,
                'Predicted': pred,
                'Error': actual - pred,
                'Abs_Error': abs(actual - pred),
                'Pct_Error': abs((actual - pred) / actual) * 100
            })
        
        # Progress update
        if (idx + 1) % 20 == 0:
            print(f"Processed {idx + 1}/{len(neighborhoods)} neighborhoods...")
            
    except Exception as e:
        failed_models.append({
            'Neighborhood': neighborhood_id,
            'Reason': f'Model error: {str(e)[:50]}'
        })
        continue

print(f"\nSuccessfully modeled {len(neighborhood_summaries)} neighborhoods")
print(f"Failed to model {len(failed_models)} neighborhoods")


Processed 20/159 neighborhoods...
Processed 40/159 neighborhoods...
Processed 60/159 neighborhoods...
Processed 80/159 neighborhoods...
Processed 100/159 neighborhoods...
Processed 120/159 neighborhoods...
Processed 140/159 neighborhoods...

✓ Successfully modeled 159 neighborhoods
✗ Failed to model 0 neighborhoods


In [3]:
# Convert to DataFrames
results_df = pd.DataFrame(all_results)
summary_df = pd.DataFrame(neighborhood_summaries)
failed_df = pd.DataFrame(failed_models) if failed_models else pd.DataFrame()

# Save results
results_df.to_csv('../arima_2/neighborhood_predictions.csv', index=False)
summary_df.to_csv('../arima_2/neighborhood_summary.csv', index=False)
if len(failed_df) > 0:
    failed_df.to_csv('../arima_2/failed_neighborhoods.csv', index=False)

print("\n" + "="*80)
print("OVERALL PERFORMANCE SUMMARY")
print("="*80)

# Aggregate statistics
print(f"\nSuccessfully Modeled Neighborhoods: {len(summary_df)}")
print(f"Failed Neighborhoods: {len(failed_df)}")
print(f"\nTotal Forecasts Generated: {len(results_df)}")

print("\nPerformance Metrics Across All Neighborhoods:")
print("-"*80)
print(f"Mean MAPE:   {summary_df['MAPE'].mean():.2f}% (Std: {summary_df['MAPE'].std():.2f}%)")
print(f"Median MAPE: {summary_df['MAPE'].median():.2f}%")
print(f"Best MAPE:   {summary_df['MAPE'].min():.2f}% (Neighborhood {summary_df.loc[summary_df['MAPE'].idxmin(), 'Neighborhood']})")
print(f"Worst MAPE:  {summary_df['MAPE'].max():.2f}% (Neighborhood {summary_df.loc[summary_df['MAPE'].idxmax(), 'Neighborhood']})")

print(f"\nMean RMSE:   {summary_df['RMSE'].mean():.4f}")
print(f"Mean MAE:    {summary_df['MAE'].mean():.4f}")
print(f"Mean Bias:   {summary_df['Bias'].mean():.4f}")

print("\nNSI Statistics Across Neighborhoods:")
print("-"*80)
print(f"Mean NSI:    {summary_df['Mean_NSI'].mean():.4f} (Range: {summary_df['Mean_NSI'].min():.4f} - {summary_df['Mean_NSI'].max():.4f})")
print(f"Mean Std:    {summary_df['Std_NSI'].mean():.4f}")

# Performance categories
excellent = len(summary_df[summary_df['MAPE'] < 5])
good = len(summary_df[(summary_df['MAPE'] >= 5) & (summary_df['MAPE'] < 10)])
fair = len(summary_df[(summary_df['MAPE'] >= 10) & (summary_df['MAPE'] < 20)])
poor = len(summary_df[summary_df['MAPE'] >= 20])

print("\nPerformance Distribution:")
print("-"*80)
print(f"Excellent (MAPE < 5%):     {excellent} neighborhoods ({excellent/len(summary_df)*100:.1f}%)")
print(f"Good (MAPE 5-10%):         {good} neighborhoods ({good/len(summary_df)*100:.1f}%)")
print(f"Fair (MAPE 10-20%):        {fair} neighborhoods ({fair/len(summary_df)*100:.1f}%)")
print(f"Poor (MAPE > 20%):         {poor} neighborhoods ({poor/len(summary_df)*100:.1f}%)")



OVERALL PERFORMANCE SUMMARY

Successfully Modeled Neighborhoods: 159
Failed Neighborhoods: 0

Total Forecasts Generated: 4450

Performance Metrics Across All Neighborhoods:
--------------------------------------------------------------------------------
Mean MAPE:   10.15% (Std: 44.00%)
Median MAPE: 4.85%
Best MAPE:   1.88% (Neighborhood 24)
Worst MAPE:  520.58% (Neighborhood 90)

Mean RMSE:   0.0568
Mean MAE:    0.0459
Mean Bias:   0.0009

NSI Statistics Across Neighborhoods:
--------------------------------------------------------------------------------
Mean NSI:    0.8547 (Range: 0.4342 - 0.9655)
Mean Std:    0.0503

Performance Distribution:
--------------------------------------------------------------------------------
Excellent (MAPE < 5%):     74 neighborhoods (46.5%)
Good (MAPE 5-10%):         43 neighborhoods (27.0%)
Fair (MAPE 10-20%):        17 neighborhoods (10.7%)
Poor (MAPE > 20%):         4 neighborhoods (2.5%)


In [4]:
# ============================================================================
# VISUALIZATIONS - STREAMLINED
# ============================================================================

print("\n" + "="*80)
print("GENERATING VISUALIZATIONS")
print("="*80)

# Figure 1: Performance distribution
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# MAPE distribution
axes[0, 0].hist(summary_df['MAPE'], bins=30, color='steelblue', alpha=0.7, edgecolor='black')
axes[0, 0].axvline(summary_df['MAPE'].mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {summary_df["MAPE"].mean():.2f}%')
axes[0, 0].axvline(summary_df['MAPE'].median(), color='orange', linestyle='--', linewidth=2, label=f'Median: {summary_df["MAPE"].median():.2f}%')
axes[0, 0].set_xlabel('MAPE (%)', fontsize=11, fontweight='bold')
axes[0, 0].set_ylabel('Number of Neighborhoods', fontsize=11, fontweight='bold')
axes[0, 0].set_title('Distribution of Forecast Accuracy (MAPE)', fontsize=12, fontweight='bold')
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# RMSE distribution
axes[0, 1].hist(summary_df['RMSE'], bins=30, color='darkgreen', alpha=0.7, edgecolor='black')
axes[0, 1].axvline(summary_df['RMSE'].mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {summary_df["RMSE"].mean():.4f}')
axes[0, 1].set_xlabel('RMSE', fontsize=11, fontweight='bold')
axes[0, 1].set_ylabel('Number of Neighborhoods', fontsize=11, fontweight='bold')
axes[0, 1].set_title('Distribution of RMSE', fontsize=12, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(alpha=0.3)

# NSI variability vs MAPE
axes[1, 0].scatter(summary_df['Std_NSI'], summary_df['MAPE'], alpha=0.6, s=50, color='purple', edgecolor='black')
axes[1, 0].set_xlabel('NSI Standard Deviation', fontsize=11, fontweight='bold')
axes[1, 0].set_ylabel('MAPE (%)', fontsize=11, fontweight='bold')
axes[1, 0].set_title('NSI Volatility vs Forecast Accuracy', fontsize=12, fontweight='bold')
axes[1, 0].grid(alpha=0.3)

# Performance categories
categories = ['Excellent\n(<5%)', 'Good\n(5-10%)', 'Fair\n(10-20%)', 'Poor\n(>20%)']
counts = [excellent, good, fair, poor]
colors_cat = ['green', 'yellowgreen', 'orange', 'red']
axes[1, 1].bar(categories, counts, color=colors_cat, alpha=0.7, edgecolor='black')
axes[1, 1].set_ylabel('Number of Neighborhoods', fontsize=11, fontweight='bold')
axes[1, 1].set_title('Performance Category Distribution', fontsize=12, fontweight='bold')
axes[1, 1].grid(axis='y', alpha=0.3)

for i, (cat, count) in enumerate(zip(categories, counts)):
    pct = count/len(summary_df)*100
    axes[1, 1].text(i, count, f'{count}\n({pct:.1f}%)', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.savefig('../arima_2/01_performance_summary.png', dpi=300, bbox_inches='tight')
print("✓ Saved: 01_performance_summary.png")
plt.close()

# Figure 2: Top 10 and Bottom 10 neighborhoods
fig, axes = plt.subplots(1, 2, figsize=(16, 6))

# Top 10 best performing
top_10 = summary_df.nsmallest(10, 'MAPE')
axes[0].barh(range(len(top_10)), top_10['MAPE'], color='green', alpha=0.7, edgecolor='black')
axes[0].set_yticks(range(len(top_10)))
axes[0].set_yticklabels([f"NH {int(n)}" for n in top_10['Neighborhood']])
axes[0].set_xlabel('MAPE (%)', fontsize=11, fontweight='bold')
axes[0].set_title('Top 10 Best Performing Neighborhoods', fontsize=12, fontweight='bold')
axes[0].invert_yaxis()
axes[0].grid(axis='x', alpha=0.3)

for i, (idx, row) in enumerate(top_10.iterrows()):
    axes[0].text(row['MAPE'], i, f"  {row['MAPE']:.2f}%", va='center', fontweight='bold')

# Bottom 10 worst performing
bottom_10 = summary_df.nlargest(10, 'MAPE')
axes[1].barh(range(len(bottom_10)), bottom_10['MAPE'], color='red', alpha=0.7, edgecolor='black')
axes[1].set_yticks(range(len(bottom_10)))
axes[1].set_yticklabels([f"NH {int(n)}" for n in bottom_10['Neighborhood']])
axes[1].set_xlabel('MAPE (%)', fontsize=11, fontweight='bold')
axes[1].set_title('Top 10 Worst Performing Neighborhoods', fontsize=12, fontweight='bold')
axes[1].invert_yaxis()
axes[1].grid(axis='x', alpha=0.3)

for i, (idx, row) in enumerate(bottom_10.iterrows()):
    axes[1].text(row['MAPE'], i, f"  {row['MAPE']:.2f}%", va='center', fontweight='bold')

plt.tight_layout()
plt.savefig('../arima_2/02_top_bottom_neighborhoods.png', dpi=300, bbox_inches='tight')
print("✓ Saved: 02_top_bottom_neighborhoods.png")
plt.close()

# Figure 3: Overall error analysis
fig, axes = plt.subplots(2, 2, figsize=(14, 10))

# Overall error distribution
axes[0, 0].hist(results_df['Error'], bins=50, color='darkblue', alpha=0.7, edgecolor='black')
axes[0, 0].axvline(0, color='red', linestyle='--', linewidth=2)
axes[0, 0].axvline(results_df['Error'].mean(), color='orange', linestyle='--', linewidth=2, label=f'Mean: {results_df["Error"].mean():.4f}')
axes[0, 0].set_xlabel('Forecast Error', fontsize=11, fontweight='bold')
axes[0, 0].set_ylabel('Frequency', fontsize=11, fontweight='bold')
axes[0, 0].set_title('Overall Error Distribution (All Neighborhoods)', fontsize=12, fontweight='bold')
axes[0, 0].legend()
axes[0, 0].grid(alpha=0.3)

# Actual vs Predicted (sample)
sample = results_df.sample(min(1000, len(results_df)))
axes[0, 1].scatter(sample['Actual'], sample['Predicted'], alpha=0.4, s=20, color='purple')
min_val = min(sample['Actual'].min(), sample['Predicted'].min())
max_val = max(sample['Actual'].max(), sample['Predicted'].max())
axes[0, 1].plot([min_val, max_val], [min_val, max_val], 'r--', linewidth=2, label='Perfect Prediction')
axes[0, 1].set_xlabel('Actual NSI', fontsize=11, fontweight='bold')
axes[0, 1].set_ylabel('Predicted NSI', fontsize=11, fontweight='bold')
axes[0, 1].set_title('Actual vs Predicted (Sample)', fontsize=12, fontweight='bold')
axes[0, 1].legend()
axes[0, 1].grid(alpha=0.3)

# MAPE over time (by date)
mape_by_date = results_df.groupby('Date')['Pct_Error'].mean().reset_index()
mape_by_date['Date'] = pd.to_datetime(mape_by_date['Date'])
axes[1, 0].plot(mape_by_date['Date'], mape_by_date['Pct_Error'], linewidth=2, color='darkorange', marker='o', markersize=4)
axes[1, 0].axhline(results_df['Pct_Error'].mean(), color='red', linestyle='--', linewidth=2, label=f'Overall Mean: {results_df["Pct_Error"].mean():.2f}%')
axes[1, 0].set_xlabel('Date', fontsize=11, fontweight='bold')
axes[1, 0].set_ylabel('Mean MAPE (%)', fontsize=11, fontweight='bold')
axes[1, 0].set_title('Forecast Accuracy Over Time', fontsize=12, fontweight='bold')
axes[1, 0].legend()
axes[1, 0].grid(alpha=0.3)
plt.setp(axes[1, 0].xaxis.get_majorticklabels(), rotation=45, ha='right')

# Box plot of errors by performance category
summary_df['Category'] = pd.cut(summary_df['MAPE'], bins=[0, 5, 10, 20, 100], labels=['Excellent', 'Good', 'Fair', 'Poor'])
category_data = []
for cat in ['Excellent', 'Good', 'Fair', 'Poor']:
    neighborhoods_in_cat = summary_df[summary_df['Category'] == cat]['Neighborhood'].values
    errors_in_cat = results_df[results_df['Neighborhood'].isin(neighborhoods_in_cat)]['Error'].values
    category_data.append(errors_in_cat)

bp = axes[1, 1].boxplot(category_data, labels=['Excellent', 'Good', 'Fair', 'Poor'], patch_artist=True)
for patch, color in zip(bp['boxes'], ['green', 'yellowgreen', 'orange', 'red']):
    patch.set_facecolor(color)
    patch.set_alpha(0.7)
axes[1, 1].axhline(0, color='black', linestyle='--', linewidth=1)
axes[1, 1].set_ylabel('Forecast Error', fontsize=11, fontweight='bold')
axes[1, 1].set_xlabel('Performance Category', fontsize=11, fontweight='bold')
axes[1, 1].set_title('Error Distribution by Performance Category', fontsize=12, fontweight='bold')
axes[1, 1].grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.savefig('../arima_2/03_error_analysis.png', dpi=300, bbox_inches='tight')
print("✓ Saved: 03_error_analysis.png")
plt.close()

# ============================================================================
# SUMMARY REPORT
# ============================================================================

with open('../arima_2/neighborhood_analysis_summary.txt', 'w') as f:
    f.write("="*80 + "\n")
    f.write("NEIGHBORHOOD-LEVEL SARIMA ANALYSIS SUMMARY\n")
    f.write("="*80 + "\n\n")
    
    f.write("MODEL CONFIGURATION\n")
    f.write("-"*80 + "\n")
    f.write(f"Model: SARIMA{order}×{seasonal_order}\n")
    f.write(f"Total Neighborhoods: {len(neighborhoods)}\n")
    f.write(f"Successfully Modeled: {len(summary_df)}\n")
    f.write(f"Failed to Model: {len(failed_df)}\n")
    f.write(f"Total Forecasts: {len(results_df)}\n\n")
    
    f.write("AGGREGATE PERFORMANCE METRICS\n")
    f.write("-"*80 + "\n")
    f.write(f"Mean MAPE:   {summary_df['MAPE'].mean():.2f}% (Std: {summary_df['MAPE'].std():.2f}%)\n")
    f.write(f"Median MAPE: {summary_df['MAPE'].median():.2f}%\n")
    f.write(f"Min MAPE:    {summary_df['MAPE'].min():.2f}%\n")
    f.write(f"Max MAPE:    {summary_df['MAPE'].max():.2f}%\n")
    f.write(f"Mean RMSE:   {summary_df['RMSE'].mean():.4f}\n")
    f.write(f"Mean MAE:    {summary_df['MAE'].mean():.4f}\n")
    f.write(f"Mean Bias:   {summary_df['Bias'].mean():.4f}\n\n")
    
    f.write("PERFORMANCE DISTRIBUTION\n")
    f.write("-"*80 + "\n")
    f.write(f"Excellent (MAPE < 5%):     {excellent} ({excellent/len(summary_df)*100:.1f}%)\n")
    f.write(f"Good (MAPE 5-10%):         {good} ({good/len(summary_df)*100:.1f}%)\n")
    f.write(f"Fair (MAPE 10-20%):        {fair} ({fair/len(summary_df)*100:.1f}%)\n")
    f.write(f"Poor (MAPE > 20%):         {poor} ({poor/len(summary_df)*100:.1f}%)\n\n")
    
    f.write("TOP 10 BEST PERFORMING NEIGHBORHOODS\n")
    f.write("-"*80 + "\n")
    for idx, row in top_10.iterrows():
        f.write(f"Neighborhood {int(row['Neighborhood']):3d}: MAPE = {row['MAPE']:6.2f}%, ")
        f.write(f"MAE = {row['MAE']:.4f}, Mean NSI = {row['Mean_NSI']:.4f}\n")
    
    f.write("\nTOP 10 WORST PERFORMING NEIGHBORHOODS\n")
    f.write("-"*80 + "\n")
    for idx, row in bottom_10.iterrows():
        f.write(f"Neighborhood {int(row['Neighborhood']):3d}: MAPE = {row['MAPE']:6.2f}%, ")
        f.write(f"MAE = {row['MAE']:.4f}, Mean NSI = {row['Mean_NSI']:.4f}\n")
    
    if len(failed_df) > 0:
        f.write("\nFAILED NEIGHBORHOODS\n")
        f.write("-"*80 + "\n")
        for idx, row in failed_df.iterrows():
            f.write(f"Neighborhood {int(row['Neighborhood']):3d}: {row['Reason']}\n")

print("✓ Saved: neighborhood_analysis_summary.txt")

print("\n" + "="*80)
print("ANALYSIS COMPLETE!")
print("="*80)
print(f"\nGenerated files:")
print("  - neighborhood_predictions.csv (all predictions)")
print("  - neighborhood_summary.csv (performance by neighborhood)")
if len(failed_df) > 0:
    print("  - failed_neighborhoods.csv (neighborhoods that couldn't be modeled)")
print("  - 01_performance_summary.png")
print("  - 02_top_bottom_neighborhoods.png")
print("  - 03_error_analysis.png")
print("  - neighborhood_analysis_summary.txt")


GENERATING VISUALIZATIONS
✓ Saved: 01_performance_summary.png
✓ Saved: 02_top_bottom_neighborhoods.png
✓ Saved: 03_error_analysis.png
✓ Saved: neighborhood_analysis_summary.txt

ANALYSIS COMPLETE!

Generated files:
  - neighborhood_predictions.csv (all predictions)
  - neighborhood_summary.csv (performance by neighborhood)
  - 01_performance_summary.png
  - 02_top_bottom_neighborhoods.png
  - 03_error_analysis.png
  - neighborhood_analysis_summary.txt
