# Load Kaggle Competition Data

> Download the Instacart Market Basket Analysis competition data from Kaggle and load the CSV files into Snowflake.

In [None]:
import os
import zipfile
from snowflake.snowpark import Session
import pandas as pd


import os
from kaggle.api.kaggle_api_extended import KaggleApi
from typing import List


In [None]:

class InstacartDataLoader:
    """Helpers for getting Instacart CSV data into Snowflake.

    All Snowflake work goes through the session object handed to the
    constructor: ``create_dataframe`` is used for CSV loads and ``sql`` for
    DDL statements.
    """

    # The annotation is quoted so it is not evaluated when the class is defined.
    def __init__(self, session: "Session"):
        """Keep a reference to the session used by the other methods.

        Args:
            session: Object providing ``create_dataframe`` and ``sql``.
        """
        self.session = session

    def unzip_data_files(self, path: str):
        """Extract every ``.zip`` archive located directly in ``path``.

        Archives are extracted into ``path`` itself. The directory is listed
        once up front, so subdirectories are not searched and archives that
        appear as a result of an extraction are not processed.

        Args:
            path: Directory containing the ``.zip`` files.
        """
        print("Unzipping data files...")
        for file in os.listdir(path):
            if not file.endswith('.zip'):
                continue
            with zipfile.ZipFile(os.path.join(path, file), 'r') as zip_ref:
                zip_ref.extractall(path)
            print(f"Unzipped: {file}")

    def load_csv_to_snowflake(self, file_path: str, table_name: str,
                              schema: str = 'INSTACART_RAW',
                              chunk_size: int = 100000):
        """Append the contents of a CSV file to ``schema.table_name``.

        The file is read with pandas in pieces of ``chunk_size`` rows so the
        whole CSV never has to be held in memory at once; each piece is
        turned into a session dataframe and written with ``mode="append"``.

        Args:
            file_path: Path of the CSV file to read.
            table_name: Name of the destination table.
            schema: Schema that qualifies ``table_name``.
            chunk_size: Number of CSV rows read and written per batch.
        """
        print(f"Loading {file_path} to {schema}.{table_name}")
        target = f"{schema}.{table_name}"
        for chunk in pd.read_csv(file_path, chunksize=chunk_size):
            df = self.session.create_dataframe(chunk)
            df.write.save_as_table(target, mode="append")

    def setup_incremental_loading(self):
        """Create the staging table used for incremental loading.

        Runs a ``CREATE OR REPLACE TABLE`` statement for
        ``INSTACART_RAW.ORDER_PRODUCTS_STAGE`` through the session, so an
        existing table of that name is replaced.
        """
        self.session.sql("""
            CREATE OR REPLACE TABLE INSTACART_RAW.ORDER_PRODUCTS_STAGE (
                order_id INTEGER,
                product_id INTEGER,
                add_to_cart_order INTEGER,
                reordered INTEGER,
                file_name VARCHAR,
                loaded_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP()
            )
        """).collect()


In [None]:


def download_instacart_data(path: str = "./data") -> List[str]:
    """
    Download Instacart Market Basket Analysis data

    If the competition archive is already present in ``path`` it is reused
    and the Kaggle API is not contacted. The archive is extracted only when
    ``path`` contains no ``.csv`` files yet.

    Args:
        path: Directory to save the data

    Returns:
        Sorted list of paths to the ``.csv`` files found in ``path``

    Raises:
        Exception: If competition rules haven't been accepted or other API errors
    """
    competition = 'instacart-market-basket-analysis'
    try:
        # Create data directory if it doesn't exist
        os.makedirs(path, exist_ok=True)

        # Check if we've already downloaded the data
        zip_path = os.path.join(path, f"{competition}.zip")
        if os.path.exists(zip_path):
            print("Data already downloaded, using existing files...")
        else:
            # Authenticate only when a download is really needed, so a run
            # against already-downloaded data does not require credentials.
            api = KaggleApi()
            api.authenticate()

            print("Downloading competition data...")
            try:
                api.competition_download_files(competition, path=path)
            except Exception as e:
                # Turn the API error about unaccepted rules into an
                # actionable message; anything else propagates unchanged.
                if "rules" in str(e):
                    raise Exception(
                        "Please accept the competition rules first at "
                        "https://www.kaggle.com/competitions/instacart-market-basket-analysis"
                    ) from e
                raise

        # Unzip if needed
        csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]
        if not csv_files:
            print("Extracting files...")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                zip_ref.extractall(path)
            csv_files = [f for f in os.listdir(path) if f.endswith('.csv')]

        # os.listdir order is arbitrary; sort for a reproducible result
        csv_files.sort()

        # List available files
        print("\nAvailable files:")
        for file in csv_files:
            print(f"- {file}")

        return [os.path.join(path, f) for f in csv_files]

    except Exception as e:
        # Report the failure, then let the caller decide how to handle it
        print(f"Error downloading data: {str(e)}")
        raise
