In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Configuration: every input and output lives under a single data root.
data_root = Path('../data/full-data')
raw_data_path = data_root / 'raw'
processed_data_path = data_root / 'processed'
output_path = data_root / 'temporal_analysis'
# Create the output directory up front so later writes cannot fail on it.
output_path.mkdir(parents=True, exist_ok=True)

# The mapping file supplies the masked hotel ids that drive the analysis.
mapping_df = pd.read_csv(data_root / 'hotel_mapping.csv')
hotel_list = mapping_df['masked_id'].tolist()

# Header banner, emitted as one newline-separated print.
print(
    "=" * 100,
    "TEMPORAL ANALYSIS: WHEN DO FLAT PERIODS OCCUR?",
    "=" * 100,
    "Goal: Identify if flat periods are in past or future dates",
    "This will help us decide on train/validation/test split!",
    "-" * 100,
    sep="\n",
)

def find_flat_period_dates(series, dates, min_length=30):
    """
    Locate runs of identical consecutive values and report their date span.

    Args:
        series: pandas Series of values, scanned in positional order.
        dates: pandas Index/Series of dates aligned position-by-position
            with ``series`` (anything exposing ``.tolist()``).
        min_length: minimum number of consecutive equal observations for a
            run to be reported.

    Returns:
        A list of dicts, one per qualifying run, with keys ``start_date``,
        ``end_date``, ``length_days`` and ``value``. ``length_days`` is the
        number of consecutive observations in the run (not a calendar
        difference). A missing value never extends a run; a run that starts
        on a missing value reports ``value`` as None.
    """
    n_obs = len(series)
    if n_obs < min_length:
        return []

    values = series.values
    date_seq = dates.tolist()

    def _value_or_none(v):
        # Missing observations are reported as None rather than NaN.
        return v if pd.notna(v) else None

    def _as_record(first, last, count, run_value):
        return {
            'start_date': date_seq[first],
            'end_date': date_seq[last],
            'length_days': count,
            'value': run_value,
        }

    found = []
    run_start = 0
    run_count = 1
    run_value = _value_or_none(values[0])

    for pos in range(1, n_obs):
        current, previous = values[pos], values[pos - 1]
        if pd.notna(current) and pd.notna(previous) and current == previous:
            run_count += 1
            continue

        # The run ended at pos - 1: keep it if long enough, then start anew.
        if run_count >= min_length:
            found.append(_as_record(run_start, pos - 1, run_count, run_value))
        run_start, run_count, run_value = pos, 1, _value_or_none(current)

    # The final run is never closed inside the loop, so check it here.
    if run_count >= min_length:
        found.append(_as_record(run_start, n_obs - 1, run_count, run_value))

    return found

def _categorize_flat_period(start, end, cutoff_train, cutoff_val):
    """Return the split ('train', 'validation', 'test' or 'spanning') for a period.

    Both cutoffs are treated as inclusive last days: train covers everything
    up to and including ``cutoff_train``, validation covers the days after
    ``cutoff_train`` up to and including ``cutoff_val``, and test covers
    everything strictly after ``cutoff_val``. A period touching more than one
    split is 'spanning'.
    """
    if end <= cutoff_train:
        return 'train'
    if start > cutoff_val:
        return 'test'
    if start > cutoff_train and end <= cutoff_val:
        return 'validation'
    return 'spanning'


def analyze_hotel_timeline(hotel_id, min_length=30):
    """Analyze when flat periods occur for one hotel's competitors.

    Reads ``<hotel_id>_competitors.csv`` from ``raw_data_path``, pivots the
    prices by competitor, finds runs of at least ``min_length`` identical
    consecutive prices per competitor, and sorts each run into the
    train / validation / test / spanning buckets.

    Args:
        hotel_id: identifier used to build the competitor file name.
        min_length: minimum run length (in observations) to count as flat.

    Returns:
        A JSON-serializable dict with ``hotel_id`` and ``status``, where
        status is one of:
          - 'missing_file': the competitor CSV does not exist
          - 'no_data': the CSV has no usable stay dates
          - 'error': an exception occurred (message stored under 'error')
          - 'complete': analysis succeeded; the dict then also holds
            'date_range', 'flat_periods', 'n_flat_periods' and, when at
            least one flat period exists, 'period_categorization' plus the
            four '*_period_details' lists.
    """

    result = {
        'hotel_id': hotel_id,
        'status': 'analyzing'
    }

    try:
        # Load raw competitor data
        comp_file = raw_data_path / f'{hotel_id}_competitors.csv'
        if not comp_file.exists():
            result['status'] = 'missing_file'
            return result

        df_comp = pd.read_csv(comp_file)
        df_comp['stay_date'] = pd.to_datetime(df_comp['stay_date'])
        df_comp = df_comp.sort_values('stay_date').reset_index(drop=True)

        # Get date range
        min_date = df_comp['stay_date'].min()
        max_date = df_comp['stay_date'].max()

        # An empty (or all-missing) file yields NaT here; report it explicitly
        # instead of emitting a 'complete' result with NaT date ranges.
        if pd.isna(min_date) or pd.isna(max_date):
            result['status'] = 'no_data'
            return result

        result['date_range'] = {
            'min_date': str(min_date.date()),
            'max_date': str(max_date.date()),
            'total_days': (max_date - min_date).days
        }

        # Pivot so each competitor becomes one price column indexed by date
        df_pivot = df_comp.pivot_table(
            index='stay_date',
            columns='hotel_id',
            values='price',
            aggfunc='first'
        )

        # Find flat periods for each competitor
        all_flat_periods = []

        for comp_id in df_pivot.columns:
            comp_series = df_pivot[comp_id].dropna()
            # A series of exactly min_length observations can still be one
            # full flat period, so the threshold is inclusive.
            if len(comp_series) < min_length:
                continue

            flat_periods = find_flat_period_dates(
                comp_series,
                comp_series.index,
                min_length=min_length
            )

            for fp in flat_periods:
                all_flat_periods.append({
                    'competitor': str(comp_id),
                    'start_date': str(fp['start_date'].date()),
                    'end_date': str(fp['end_date'].date()),
                    'length_days': fp['length_days'],
                    'price': float(fp['value']) if pd.notna(fp['value']) else None
                })

        result['flat_periods'] = all_flat_periods
        result['n_flat_periods'] = len(all_flat_periods)

        if all_flat_periods:
            # Inclusive last days of the proposed train and validation splits
            cutoff_train = pd.Timestamp('2025-05-31')
            cutoff_val = pd.Timestamp('2025-08-31')

            buckets = {'train': [], 'validation': [], 'test': [], 'spanning': []}
            for fp in all_flat_periods:
                bucket = _categorize_flat_period(
                    pd.Timestamp(fp['start_date']),
                    pd.Timestamp(fp['end_date']),
                    cutoff_train,
                    cutoff_val,
                )
                buckets[bucket].append(fp)

            result['period_categorization'] = {
                'in_train_period': len(buckets['train']),
                'in_validation_period': len(buckets['validation']),
                'in_test_period': len(buckets['test']),
                'spanning_multiple': len(buckets['spanning'])
            }

            result['train_period_details'] = buckets['train']
            result['validation_period_details'] = buckets['validation']
            result['test_period_details'] = buckets['test']
            result['spanning_period_details'] = buckets['spanning']

        result['status'] = 'complete'
        return result

    except Exception as e:
        # Per-hotel boundary: one bad file must not abort the whole batch,
        # so the failure is recorded in the result instead of propagating.
        result['status'] = 'error'
        result['error'] = str(e)
        return result

# ============================================================
# ANALYZE ALL HOTELS
# ============================================================
print(
    "\nAnalyzing temporal distribution of flat periods...",
    "Proposed split:",
    "  TRAIN:      Oct 2023 - May 2025",
    "  VALIDATION: Jun 2025 - Aug 2025",
    "  TEST:       Sep 2025",
    "-" * 100,
    sep="\n",
)

# Run the per-hotel analysis, keyed by hotel id, with an in-place progress line.
all_results = {}
n_hotels = len(hotel_list)
for position, hotel_id in enumerate(hotel_list, start=1):
    # '\r' keeps the progress counter on a single console line.
    print(f"[{position}/{n_hotels}] {hotel_id}...", end='\r')
    all_results[hotel_id] = analyze_hotel_timeline(hotel_id)

print(f"\n\nCompleted temporal analysis for {len(all_results)} hotels")

# Save results
results_file = output_path / 'temporal_flat_period_analysis.json'
results_file.write_text(json.dumps(all_results, indent=2))

# ============================================================
# GENERATE REPORT
# ============================================================
heavy_rule = "=" * 100
light_rule = "-" * 100

print("\n" + heavy_rule)
print("TEMPORAL ANALYSIS REPORT")
print(heavy_rule)

# Only hotels whose analysis finished are included in the report.
successful = [hid for hid, res in all_results.items() if res['status'] == 'complete']
print(f"\nSuccessfully analyzed: {len(successful)}/{len(all_results)} hotels")

if successful:
    # Overall date ranges
    print("\n" + heavy_rule)
    print("1. DATA DATE RANGES")
    print(heavy_rule)

    all_min_dates = [pd.Timestamp(all_results[hid]['date_range']['min_date']) for hid in successful]
    all_max_dates = [pd.Timestamp(all_results[hid]['date_range']['max_date']) for hid in successful]
    earliest = min(all_min_dates)
    latest = max(all_max_dates)

    print(f"\nEarliest data across all hotels: {earliest.date()}")
    print(f"Latest data across all hotels:   {latest.date()}")
    print(f"Data span: {(latest - earliest).days} days")

    # Hotels with flat periods
    hotels_with_flat = [hid for hid in successful if all_results[hid]['n_flat_periods'] > 0]

    print("\n" + heavy_rule)
    print("2. HOTELS WITH FLAT PERIODS (30+ days)")
    print(heavy_rule)
    print(f"\nHotels with flat periods: {len(hotels_with_flat)}/{len(successful)}")

    if hotels_with_flat:
        # Temporal distribution
        print("\n" + heavy_rule)
        print("3. WHEN DO FLAT PERIODS OCCUR?")
        print(heavy_rule)
        print("\nProposed split:")
        print("  TRAIN:      up to May 31, 2025")
        print("  VALIDATION: Jun 1 - Aug 31, 2025")
        print("  TEST:       Sep 1, 2025 onwards")
        print(light_rule)

        # Report column -> key inside each hotel's 'period_categorization'.
        count_columns = (
            ('in_train', 'in_train_period'),
            ('in_validation', 'in_validation_period'),
            ('in_test', 'in_test_period'),
            ('spanning_multiple', 'spanning_multiple'),
        )

        temporal_summary = []
        for hotel_id in hotels_with_flat:
            r = all_results[hotel_id]
            cat = r.get('period_categorization', {})
            row = {
                'hotel_id': hotel_id,
                'total_flat_periods': r['n_flat_periods'],
            }
            row.update((column, cat.get(source, 0)) for column, source in count_columns)
            temporal_summary.append(row)

        temporal_df = pd.DataFrame(temporal_summary).sort_values(
            'total_flat_periods', ascending=False
        )

        print("\nFlat Period Distribution by Time Period:")
        print(temporal_df.to_string(index=False))
        temporal_df.to_csv(output_path / 'flat_period_temporal_distribution.csv', index=False)

        # Summary statistics: each bucket as a count and a share of the total
        grand_total = temporal_df['total_flat_periods'].sum()
        print("\n" + light_rule)
        print("SUMMARY STATISTICS:")
        print(f"  Total flat periods across all hotels: {grand_total}")
        for prefix, column in (
            ("  Flat periods in TRAIN period:         ", 'in_train'),
            ("  Flat periods in VALIDATION period:    ", 'in_validation'),
            ("  Flat periods in TEST period:          ", 'in_test'),
            ("  Flat periods spanning multiple:       ", 'spanning_multiple'),
        ):
            bucket_total = temporal_df[column].sum()
            print(f"{prefix}{bucket_total} ({bucket_total/grand_total*100:.1f}%)")

        # Detailed examples
        print("\n" + heavy_rule)
        print("4. DETAILED EXAMPLES - Hotels with Most Flat Periods")
        print(heavy_rule)

        # Result key -> heading printed above that bucket's sample periods.
        detail_sections = (
            ('train_period_details', "IN TRAIN PERIOD"),
            ('validation_period_details', "IN VALIDATION PERIOD"),
            ('test_period_details', "IN TEST PERIOD"),
            ('spanning_period_details', "SPANNING MULTIPLE PERIODS"),
        )

        for hotel_id in temporal_df.head(5)['hotel_id']:
            r = all_results[hotel_id]
            print(f"\n{hotel_id}:")
            print(f"  Date range: {r['date_range']['min_date']} to {r['date_range']['max_date']}")
            print(f"  Total flat periods: {r['n_flat_periods']}")

            for details_key, heading in detail_sections:
                periods = r.get(details_key)
                if not periods:
                    continue
                print(f"  \n  {heading} ({len(periods)} periods):")
                for fp in periods[:2]:  # Show first 2
                    print(f"    - {fp['competitor'][:30]}: {fp['start_date']} to {fp['end_date']} ({fp['length_days']} days)")

# Closing summary: where the outputs were written and how to act on them.
# The emoji and arrow glyphs were previously stored as mojibake (UTF-8 bytes
# decoded with the wrong codec); they are restored to the intended characters.
print("\n" + "="*100)
print("KEY FINDINGS & RECOMMENDATION")
print("="*100)
print(f"\n📁 Files saved to: {output_path}/")
print("   - temporal_flat_period_analysis.json")
print("   - flat_period_temporal_distribution.csv")
print("\n🎯 NEXT STEPS:")
print("   1. Review the temporal distribution above")
print("   2. If most flat periods are in TEST period → Your strategy will work! ✅")
print("   3. If flat periods span TRAIN period → We need different approach ⚠️")
print("   4. Create train/val/test split based on these findings")
print("\n" + "="*100)

TEMPORAL ANALYSIS: WHEN DO FLAT PERIODS OCCUR?
Goal: Identify if flat periods are in past or future dates
This will help us decide on train/validation/test split!
----------------------------------------------------------------------------------------------------

Analyzing temporal distribution of flat periods...
Proposed split:
  TRAIN:      Oct 2023 - May 2025
  VALIDATION: Jun 2025 - Aug 2025
  TEST:       Sep 2025
----------------------------------------------------------------------------------------------------
[44/44] Hotel_44...

Completed temporal analysis for 44 hotels

TEMPORAL ANALYSIS REPORT

Successfully analyzed: 39/44 hotels

1. DATA DATE RANGES

Earliest data across all hotels: 2024-07-29
Latest data across all hotels:   2026-10-06
Data span: 799 days

2. HOTELS WITH FLAT PERIODS (30+ days)

Hotels with flat periods: 27/39

3. WHEN DO FLAT PERIODS OCCUR?

Proposed split:
  TRAIN:      up to May 31, 2025
  VALIDATION: Jun 1 - Aug 31, 2025
  TEST:       Sep 1, 2025 onwa