# In [None]:
import os
import pandas as pd
import s3fs
from dotenv import load_dotenv

def explore_spotify_data(
    *,
    bucket_name='inbound',
    prefix='raw/spotify/api/daily/',
    env_path='../.env',
) -> "pd.DataFrame | None":
    """
    Load the most recent daily Parquet file from S3 into a Pandas DataFrame.

    Connects to the S3-compatible endpoint named by the ``AWS_ENDPOINT_URL``
    environment variable, lists the Parquet files under
    ``s3://<bucket_name>/<prefix>``, loads the last one in sorted order and
    prints a short summary (``info()``, first rows, row count).

    Args:
        bucket_name: Bucket to list. Defaults to ``'inbound'``.
        prefix: Key prefix (directory) inside the bucket that holds the
            daily Parquet files.
        env_path: Path of the ``.env`` file to load environment variables
            from. The default is relative to the current working directory.

    Returns:
        The loaded DataFrame, or ``None`` when the endpoint is not
        configured, the listing fails, no Parquet file exists, or the file
        cannot be read. Each failure is reported on stdout rather than
        raised.
    """
    # --- 1. Setup and S3 Connection ---
    print("--- Setting up S3 connection ---")

    # Populate os.environ (endpoint URL and any credentials) from the .env file.
    load_dotenv(env_path)

    s3_endpoint_url = os.getenv('AWS_ENDPOINT_URL')
    if not s3_endpoint_url:
        print("Error: AWS_ENDPOINT_URL not found in .env file.")
        return None

    try:
        s3 = s3fs.S3FileSystem(
            client_kwargs={
                'endpoint_url': s3_endpoint_url
            }
        )
    except Exception as e:
        print(f"Error creating S3 filesystem object: {e}")
        return None
    print("S3 filesystem object created successfully.")

    # --- 2. List Available Data Files ---
    print("\n--- Listing available data files ---")
    full_path = f'{bucket_name}/{prefix}'
    print(f'Listing files in: s3://{full_path}')
    try:
        # refresh=True bypasses the cached directory listing so that files
        # added since a previous call in the same session are visible.
        available_files = s3.ls(full_path, refresh=True)
    except Exception as e:
        # Stop here: continuing would only report a misleading
        # "no files" message for what is really a listing failure.
        print(f"Could not list files: {e}")
        return None

    parquet_files = sorted(
        path for path in available_files if path.endswith('.parquet')
    )
    if not parquet_files:
        print("\nNo Parquet files found in the specified directory.")
        return None

    print("\nAvailable Parquet files:")
    for path in parquet_files:
        print(f'- s3://{path}')

    # --- 3. Load and Analyze the Most Recent File ---
    print("\n--- Loading and analyzing the most recent file ---")
    # Last entry in lexicographic order; this is the most recent file only
    # if the file names sort chronologically (e.g. date-stamped) — confirm.
    file_to_load = parquet_files[-1]

    print(f"Loading data from: s3://{file_to_load}...")
    try:
        # Use the s3fs object to open the file
        with s3.open(file_to_load, 'rb') as f:
            df = pd.read_parquet(f)
    except Exception as e:
        print(f"Error loading Parquet file: {e}")
        return None

    print("\nSuccessfully loaded DataFrame!")

    print("\nDataFrame Info:")
    df.info()

    print("\nFirst 5 Rows:")
    print(df.head())

    print(f"\nTotal tracks on this day: {len(df)}")

    # Hand the data back so the caller can keep exploring it.
    return df

# Script entry point: run the exploration only when this file is executed
# directly, not when it is imported as a module.
if __name__ == '__main__':
    explore_spotify_data()
