# Module 2 - EDA

## Import libraries

In [17]:
import os
import boto3
from pathlib import Path
from dotenv import load_dotenv

## Download datasets

In [18]:
# Load environment variables (load_dotenv pulls them from a local .env file,
# if one is found); os.getenv yields None for any variable that is not set.
load_dotenv()
AWW_API_KEY = os.getenv("AWW_API_KEY")
AWW_SECRET = os.getenv("AWW_SECRET")

# Initialize S3 client
s3 = boto3.client(
    's3',
    aws_access_key_id=AWW_API_KEY,
    aws_secret_access_key=AWW_SECRET
)
bucket_name = 'zrive-ds-data'

# Prefixes to fetch
prefixes = [
    "groceries/sampled-datasets/",
    "groceries/box_builder_dataset/"
]

# Local folder: "data" two directory levels above the current working
# directory, so the location depends on where the kernel was started.
PROJECT_ROOT = Path().resolve().parent.parent
DATA_DIR = PROJECT_ROOT / "data"
DATA_DIR.mkdir(exist_ok=True)

print("Data folder:", DATA_DIR)

# Download only if file does not exist locally.
# A single list_objects_v2 call returns at most 1,000 keys, so walk every
# page with a paginator instead of silently dropping the remainder.
paginator = s3.get_paginator('list_objects_v2')
for prefix in prefixes:
    for response in paginator.paginate(Bucket=bucket_name, Prefix=prefix):
        for obj in response.get('Contents', []):
            file_key = obj['Key']
            file_name = os.path.basename(file_key)

            # Keys ending in "/" have an empty basename (folder placeholders);
            # there is nothing to download for them.
            if not file_name:
                continue

            # Files are flattened into DATA_DIR by basename only.
            local_path = DATA_DIR / file_name

            if local_path.exists():
                print(f"Already exists: {local_path}, skipping download.")
            else:
                # Pass the destination as str: older boto3 releases reject
                # pathlib.Path objects for the Filename argument.
                s3.download_file(bucket_name, file_key, str(local_path))
                print(f"Downloaded {file_name} from {prefix}")


Data folder: /Users/maria/Desktop/Zrive DS/zrive-ds/data
Already exists: /Users/maria/Desktop/Zrive DS/zrive-ds/data/abandoned_carts.parquet, skipping download.
Already exists: /Users/maria/Desktop/Zrive DS/zrive-ds/data/inventory.parquet, skipping download.
Already exists: /Users/maria/Desktop/Zrive DS/zrive-ds/data/orders.parquet, skipping download.
Already exists: /Users/maria/Desktop/Zrive DS/zrive-ds/data/regulars.parquet, skipping download.
Already exists: /Users/maria/Desktop/Zrive DS/zrive-ds/data/users.parquet, skipping download.
Already exists: /Users/maria/Desktop/Zrive DS/zrive-ds/data/feature_frame.csv, skipping download.
