# In [None]:
import requests
import pandas as pd
import datetime
from datetime import timedelta
import time
import os
import json
import calendar

class TiingoDataDownloader:
    def __init__(self, api_key):
        self.api_key = api_key
        self.base_url = "https://api.tiingo.com/iex"
        self.headers = {
            'Content-Type': 'application/json',
            'Authorization': f'Token {api_key}'
        }
        self.checkpoint_file = "download_checkpoint.json"
    
    def save_checkpoint(self, checkpoint_data):
        """Save download progress to checkpoint file"""
        with open(self.checkpoint_file, 'w') as f:
            json.dump(checkpoint_data, f, indent=2)
        print(f"Checkpoint saved: {checkpoint_data['completed_months']}/{checkpoint_data.get('total_months', '?')} months completed")
    
    def load_checkpoint(self):
        """Load download progress from checkpoint file"""
        if os.path.exists(self.checkpoint_file):
            try:
                with open(self.checkpoint_file, 'r') as f:
                    checkpoint = json.load(f)
                print(f"Checkpoint found: {checkpoint.get('completed_months', 0)} months already completed")
                return checkpoint
            except Exception as e:
                print(f"Error loading checkpoint: {e}")
                return None
        return None
    
    def clear_checkpoint(self):
        """Remove checkpoint file after successful completion"""
        if os.path.exists(self.checkpoint_file):
            os.remove(self.checkpoint_file)
            print("Checkpoint file cleared")
    
    def get_month_range(self, year, month):
        """Get start and end date for a specific month"""
        start_date = datetime.date(year, month, 1)
        last_day = calendar.monthrange(year, month)[1]
        end_date = datetime.date(year, month, last_day)
        return start_date, end_date
    
    def generate_month_list(self, start_date_str, end_date_str):
        """Generate list of months to download between start and end dates"""
        start_dt = datetime.datetime.strptime(start_date_str, "%Y-%m-%d").date()
        end_dt = datetime.datetime.strptime(end_date_str, "%Y-%m-%d").date()
        
        months_to_download = []
        current_date = start_dt.replace(day=1)  # Start from beginning of month
        
        while current_date <= end_dt:
            month_start, month_end = self.get_month_range(current_date.year, current_date.month)
            
            # Adjust date range for first and last months
            if current_date.year == start_dt.year and current_date.month == start_dt.month:
                month_start = start_dt
            if current_date.year == end_dt.year and current_date.month == end_dt.month:
                month_end = end_dt
                
            months_to_download.append((month_start, month_end))
            
            # Move to next month
            if current_date.month == 12:
                current_date = current_date.replace(year=current_date.year+1, month=1)
            else:
                current_date = current_date.replace(month=current_date.month+1)
        
        return months_to_download
    
    def download_monthly_data(self, symbol, start_date, end_date, save_path="spx_monthly_data"):
        """
        Download data month by month with checkpoint/resume functionality
        
        Parameters:
        symbol: Stock symbol (e.g., SPY for SPX proxy)
        start_date: Start date in YYYY-MM-DD format
        end_date: End date in YYYY-MM-DD format
        save_path: Directory to save monthly files
        """
        
        # Create save directory
        if not os.path.exists(save_path):
            os.makedirs(save_path)
            print(f"Created directory: {save_path}")
        
        # Generate list of months to download
        months_to_download = self.generate_month_list(start_date, end_date)
        total_months = len(months_to_download)
        
        print(f"Total months to download: {total_months}")
        print(f"Date range: {start_date} to {end_date}")
        print(f"Save path: {save_path}")
        
        # Load checkpoint to resume from where we left off
        checkpoint = self.load_checkpoint()
        completed_months = 0
        failed_downloads = []
        
        if checkpoint and checkpoint.get('symbol') == symbol:
            completed_months = checkpoint.get('completed_months', 0)
            failed_downloads = checkpoint.get('failed_downloads', [])
            print(f"Resuming from checkpoint: {completed_months}/{total_months} months completed")
        else:
            print("Starting fresh download")
        
        # Download remaining months
        successful_downloads = 0
        
        for i in range(completed_months, total_months):
            month_start, month_end = months_to_download[i]
            month_str = f"{month_start.year}-{month_start.month:02d}"
            filename = f"{save_path}/{symbol}_{month_str}.csv"
            
            # Check if file already exists and is valid
            if self.is_file_valid(filename):
                print(f"Skipping existing file: {filename}")
                successful_downloads += 1
                continue
            
            print(f"\nDownloading month {i+1}/{total_months}: {month_start} to {month_end}")
            
            success = self.download_single_month(
                symbol, 
                month_start.strftime("%Y-%m-%d"), 
                month_end.strftime("%Y-%m-%d"),
                filename
            )
            
            if success:
                successful_downloads += 1
                print(f"✅ Successfully saved: {filename}")
                # Remove from failed list if it was there
                if month_str in failed_downloads:
                    failed_downloads.remove(month_str)
            else:
                if month_str not in failed_downloads:
                    failed_downloads.append(month_str)
                print(f"❌ Failed to download: {month_str}")
            
            # Save checkpoint after each month
            checkpoint_data = {
                'symbol': symbol,
                'start_date': start_date,
                'end_date': end_date,
                'total_months': total_months,
                'completed_months': i + 1,
                'successful_downloads': successful_downloads,
                'failed_downloads': failed_downloads,
                'last_updated': datetime.datetime.now().isoformat()
            }
            self.save_checkpoint(checkpoint_data)
            
            # Wait between downloads to respect API limits
            if i < total_months - 1:  # Don't wait after the last download
                print("Waiting 3 seconds before next download...")
                time.sleep(3)
        
        # Final statistics
        print(f"\n=== Download Complete ===")
        print(f"Successfully downloaded: {successful_downloads}/{total_months} months")
        print(f"Failed downloads: {len(failed_downloads)} months")
        if failed_downloads:
            print(f"Failed months: {', '.join(failed_downloads)}")
        
        # Clear checkpoint file on successful completion
        if len(failed_downloads) == 0:
            self.clear_checkpoint()
            print("All downloads completed successfully!")
        else:
            print("Some downloads failed. You can retry them using retry_failed_months()")
            
        return successful_downloads, failed_downloads
    
    def is_file_valid(self, filename):
        """Check if file exists and contains valid data"""
        if not os.path.exists(filename):
            return False
        
        try:
            df = pd.read_csv(filename)
            # Consider file valid if it has data or is intentionally empty (non-trading days)
            return True  # File exists and can be read
        except Exception as e:
            print(f"Invalid file detected: {filename} - {e}")
            return False
    
    def download_single_month(self, symbol, start_date, end_date, filename, max_retries=3):
        """
        Download data for a single month with retry logic

        Rate limiting (429), server errors, timeouts and unexpected exceptions
        are retried up to max_retries attempts. Other 4xx client errors (bad
        token, unknown symbol, ...) are not retried, because repeating the same
        request cannot succeed. The CSV is written to a temporary file and then
        moved into place, so an interrupted write never leaves a partial file
        under the final name.

        Parameters:
        symbol: Stock symbol to request
        start_date: Start date in YYYY-MM-DD format
        end_date: End date in YYYY-MM-DD format
        filename: Path of the CSV file to write
        max_retries: Maximum number of request attempts

        Returns:
        True when a CSV was written (header-only when the API returned no
        rows), False when the download failed
        """
        # Build API request URL
        url = f"{self.base_url}/{symbol}/prices"
        params = {
            'startDate': start_date,
            'endDate': end_date,
            'resampleFreq': '1min',
            'format': 'json'
        }

        for attempt in range(max_retries):
            can_retry = attempt < max_retries - 1
            try:
                print(f"  Attempt {attempt + 1}/{max_retries}: Requesting data...")

                # Send request
                response = requests.get(url, headers=self.headers, params=params, timeout=60)
                status = response.status_code

                if status == 200:
                    data = response.json()
                    if data:
                        output_df = self.process_dataframe(pd.DataFrame(data))
                        summary = f"  ✅ Retrieved {len(output_df):,} records"
                    else:
                        print(f"  ℹ️  No data returned (likely non-trading days)")
                        # Create empty file to mark as processed
                        output_df = pd.DataFrame(columns=['timestamp', 'open', 'high', 'low', 'close', 'volume'])
                        summary = None

                    # Write next to the target, then move into place in one step
                    temp_filename = f"{filename}.tmp"
                    output_df.to_csv(temp_filename, index=False)
                    os.replace(temp_filename, filename)

                    if summary:
                        print(summary)
                    return True

                if status == 429:
                    wait_time = min(300 * (attempt + 1), 1800)  # Max 30 minutes
                    print(f"  ⚠️  API rate limit exceeded (attempt {attempt + 1}/{max_retries})")
                    if not can_retry:
                        print(f"  ❌ Max retries exceeded due to rate limiting")
                        return False
                    print(f"  ⏳ Waiting {wait_time} seconds before retry...")
                    time.sleep(wait_time)
                    continue

                print(f"  ❌ Request failed: {status} - {response.text}")
                # 408 (request timeout) is the one remaining 4xx worth retrying
                if 400 <= status < 500 and status != 408:
                    print(f"  ❌ Client error {status} is not retryable, giving up")
                    return False
                if not can_retry:
                    return False
                print(f"  ⏳ Waiting 30 seconds before retry...")
                time.sleep(30)

            except requests.exceptions.Timeout:
                print(f"  ⚠️  Request timeout (attempt {attempt + 1}/{max_retries})")
                if not can_retry:
                    return False
                time.sleep(30)

            except Exception as e:
                print(f"  ❌ Error during download (attempt {attempt + 1}/{max_retries}): {str(e)}")
                if not can_retry:
                    return False
                time.sleep(30)

        return False
    
    def process_dataframe(self, df):
        """Process and clean DataFrame"""
        # Convert timestamp
        df['timestamp'] = pd.to_datetime(df['date'])
        
        # Rename columns to standard format
        column_mapping = {
            'date': 'date_raw',
            'open': 'open',
            'high': 'high', 
            'low': 'low',
            'close': 'close',
            'volume': 'volume'
        }
        
        # Only rename existing columns
        for old_col, new_col in column_mapping.items():
            if old_col in df.columns and old_col != new_col:
                df = df.rename(columns={old_col: new_col})
        
        # Select required columns
        columns_to_keep = ['timestamp', 'open', 'high', 'low', 'close', 'volume']
        available_columns = [col for col in columns_to_keep if col in df.columns]
        df = df[available_columns]
        
        # Sort by timestamp and remove duplicates
        df = df.sort_values('timestamp').drop_duplicates(subset=['timestamp'], keep='last')
        
        return df
    
    def check_downloaded_files(self, save_path="spx_monthly_data", symbol=None):
        """Check and display information about downloaded files"""
        if not os.path.exists(save_path):
            print(f"Directory does not exist: {save_path}")
            return
        
        files = [f for f in os.listdir(save_path) if f.endswith('.csv')]
        if symbol:
            files = [f for f in files if f.startswith(f"{symbol}_")]
        files.sort()
        
        print(f"\n=== Downloaded Files Check ===")
        print(f"Directory: {save_path}")
        print(f"Total files: {len(files)}")
        
        total_records = 0
        valid_files = 0
        empty_files = 0
        
        for file in files:
            filepath = os.path.join(save_path, file)
            try:
                df = pd.read_csv(filepath)
                records = len(df)
                total_records += records
                if records > 0:
                    valid_files += 1
                    print(f"✅ {file}: {records:,} records")
                else:
                    empty_files += 1
                    print(f"📄 {file}: Empty (non-trading days)")
            except Exception as e:
                print(f"❌ {file}: Read error - {e}")
        
        print(f"\nSummary:")
        print(f"  Valid files with data: {valid_files}")
        print(f"  Empty files: {empty_files}")
        print(f"  Total records: {total_records:,}")
    
    def retry_failed_months(self, symbol, failed_months=None, save_path="spx_monthly_data"):
        """Retry downloading failed months"""
        if failed_months is None:
            # Load from checkpoint
            checkpoint = self.load_checkpoint()
            if checkpoint:
                failed_months = checkpoint.get('failed_downloads', [])
            else:
                print("No checkpoint found. Cannot determine failed months.")
                return []
        
        if not failed_months:
            print("No failed months to retry.")
            return []
        
        print(f"\n=== Retrying Failed Months ===")
        print(f"Failed months to retry: {len(failed_months)}")
        
        successful_retries = 0
        still_failed = []
        
        for i, month_str in enumerate(failed_months):
            print(f"\nRetrying {i+1}/{len(failed_months)}: {month_str}")
            
            try:
                year, month = map(int, month_str.split('-'))
                month_start, month_end = self.get_month_range(year, month)
                filename = f"{save_path}/{symbol}_{month_str}.csv"
                
                success = self.download_single_month(
                    symbol,
                    month_start.strftime("%Y-%m-%d"),
                    month_end.strftime("%Y-%m-%d"),
                    filename,
                    max_retries=5  # More retries for failed months
                )
                
                if success:
                    successful_retries += 1
                    print(f"✅ Retry successful: {month_str}")
                else:
                    still_failed.append(month_str)
                    print(f"❌ Retry failed: {month_str}")
                
                # Longer wait between retries
                if i < len(failed_months) - 1:
                    print("Waiting 10 seconds before next retry...")
                    time.sleep(10)
                    
            except Exception as e:
                print(f"❌ Error processing {month_str}: {e}")
                still_failed.append(month_str)
        
        print(f"\nRetry Results:")
        print(f"  Successful: {successful_retries}/{len(failed_months)}")
        print(f"  Still failed: {len(still_failed)}")
        
        # Update checkpoint with new failed list
        if os.path.exists(self.checkpoint_file):
            checkpoint = self.load_checkpoint()
            if checkpoint:
                checkpoint['failed_downloads'] = still_failed
                checkpoint['last_updated'] = datetime.datetime.now().isoformat()
                self.save_checkpoint(checkpoint)
        
        return still_failed
    
    def resume_download(self):
        """Resume download from checkpoint"""
        checkpoint = self.load_checkpoint()
        if not checkpoint:
            print("No checkpoint found. Cannot resume.")
            return False
        
        symbol = checkpoint.get('symbol')
        start_date = checkpoint.get('start_date')
        end_date = checkpoint.get('end_date')
        
        if not all([symbol, start_date, end_date]):
            print("Invalid checkpoint data. Cannot resume.")
            return False
        
        print(f"Resuming download for {symbol} from {start_date} to {end_date}")
        
        # Continue download
        successful, failed = self.download_monthly_data(
            symbol=symbol,
            start_date=start_date,
            end_date=end_date
        )
        
        return len(failed) == 0

# Main execution function
def main():
    """Interactive entry point: download SPY 1-minute data month by month.

    The Tiingo API token is read from the TIINGO_API_KEY environment variable,
    or prompted for when the variable is not set; it is deliberately not
    stored in this file. If a checkpoint exists the user is asked whether to
    resume it, and failed months can be retried at the end.
    """
    from getpass import getpass

    # Credentials do not belong in source code: take the token from the environment
    api_key = os.environ.get("TIINGO_API_KEY", "").strip()
    if not api_key:
        api_key = getpass("Enter your Tiingo API key: ").strip()
    if not api_key:
        print("No Tiingo API key provided. Set the TIINGO_API_KEY environment variable and run again.")
        return

    # Create downloader instance
    downloader = TiingoDataDownloader(api_key)

    # Configuration
    symbol = "SPY"  # Use SPY as proxy for SPX
    start_date = "2018-08-31"
    end_date = "2023-08-31"
    save_path = "spx_monthly_data"

    print("=== Tiingo SPX Monthly Data Downloader ===")
    print(f"Symbol: {symbol}")
    print(f"Date range: {start_date} to {end_date}")
    print(f"Save path: {save_path}")

    # Check for existing checkpoint
    checkpoint = downloader.load_checkpoint()
    if checkpoint:
        print(f"\n📁 Checkpoint found! Previous download was interrupted.")
        resume = input("Do you want to resume from checkpoint? (y/n): ").lower().strip()
        if resume == 'y':
            success = downloader.resume_download()
            if success:
                print("✅ All downloads completed successfully!")
                return
            # On failure fall through: the run below reports the failed months
            # and offers to retry them
        else:
            print("Starting fresh download...")
            downloader.clear_checkpoint()

    # Check existing files
    downloader.check_downloaded_files(save_path, symbol)

    # Start download
    successful, failed = downloader.download_monthly_data(
        symbol=symbol,
        start_date=start_date,
        end_date=end_date,
        save_path=save_path
    )

    # Handle failed downloads
    if failed:
        print(f"\n⚠️  Found {len(failed)} failed months")
        retry = input("Do you want to retry failed months? (y/n): ").lower().strip()
        if retry == 'y':
            still_failed = downloader.retry_failed_months(symbol, failed, save_path)
            if still_failed:
                print(f"❌ {len(still_failed)} months still failed: {', '.join(still_failed)}")
            else:
                print("✅ All failed months successfully retried!")

    # Final check
    print(f"\n=== Final Summary ===")
    downloader.check_downloaded_files(save_path, symbol)

    if os.path.exists(downloader.checkpoint_file):
        print(f"\n💾 Checkpoint file still exists. You can resume later if needed.")
    else:
        print(f"\n🎉 All downloads completed successfully!")

# Start the interactive download only when this file is run directly, not on import
if __name__ == "__main__":
    main()