# Download the zip archive `archive.zip` from Kaggle and place it in `../data/raw/`

In [33]:
import os
import zipfile
import sqlite3
import pandas as pd

# Define paths
zip_path = '../data/raw/archive.zip'
extract_folder = '../data/interim/instacart_data'
db_path = '../data/processed/instacart.db'

# Number of CSV rows read and written per batch. Streaming in chunks keeps
# peak memory bounded instead of materialising each whole file as a DataFrame.
chunk_rows = 500_000

# Step 1: Unzip the archive
os.makedirs(extract_folder, exist_ok=True)
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_folder)
print(f"Extracted files to {extract_folder}")

# Step 2: Connect to SQLite database (it will be created if it doesn't exist).
# sqlite3 creates the database file but not its parent directory, so make
# sure the directory is there first.
os.makedirs(os.path.dirname(db_path), exist_ok=True)
conn = sqlite3.connect(db_path)

try:
    # Step 3: Load CSV files into SQLite, one table per CSV file
    for csv_file in os.listdir(extract_folder):
        if not csv_file.endswith('.csv'):
            continue

        # Table name is the file name without its extension
        table_name = os.path.splitext(csv_file)[0]
        file_path = os.path.join(extract_folder, csv_file)

        # The first chunk (re)creates the table; later chunks append to it,
        # so re-running the cell still replaces any previous contents.
        with pd.read_csv(file_path, chunksize=chunk_rows) as reader:
            for chunk_no, chunk in enumerate(reader):
                chunk.to_sql(
                    table_name,
                    conn,
                    if_exists='replace' if chunk_no == 0 else 'append',
                    index=False,
                )
        print(f"Loaded {csv_file} into table {table_name}")

    # Step 4: Commit changes
    conn.commit()
finally:
    # Always release the connection, even if a load fails part-way through
    conn.close()

print(f"Database '{db_path}' created successfully.")


Extracted files to ../data/interim/instacart_data
Loaded aisles.csv into table aisles
Loaded departments.csv into table departments
Loaded orders.csv into table orders
Loaded order_products__prior.csv into table order_products__prior
Loaded order_products__train.csv into table order_products__train
Loaded products.csv into table products
Database '../data/processed/instacart.db' created successfully.


# Check that the expected tables exist

In [35]:
import sqlite3

db_path = '../data/processed/instacart.db'

# Connect to the SQLite database
conn = sqlite3.connect(db_path)

# List of table names to check
tables = ['orders', 'order_products__train', 'order_products__prior', 'products', 'aisles', 'departments']

try:
    cursor = conn.cursor()
    for table in tables:
        # Look the table up in SQLite's schema catalogue. The name is bound as
        # a parameter rather than formatted into the SQL text.
        cursor.execute(
            "SELECT name FROM sqlite_master WHERE type='table' AND name=?",
            (table,),
        )
        if cursor.fetchone():
            print(f"Table '{table}' exists.")
        else:
            print(f"Table '{table}' does not exist.")
finally:
    # Close the connection even if one of the lookups raises
    conn.close()


Table 'orders' exists.
Table 'order_products__train' exists.
Table 'order_products__prior' exists.
Table 'products' exists.
Table 'aisles' exists.
Table 'departments' exists.


In [None]:
import sqlite3

db_path = '../data/processed/instacart.db'

# Connect to the SQLite database
conn = sqlite3.connect(db_path)

try:
    cursor = conn.cursor()

    # Sample JOIN query: product names for training-set order lines whose
    # order_id is at most 10.
    # NOTE(review): there is no ORDER BY, so which rows LIMIT keeps is up to
    # SQLite and may differ between runs - add one if stable output matters.
    cursor.execute("""
    SELECT p.product_name, o.order_id
    FROM order_products__train opt
    JOIN orders o ON opt.order_id = o.order_id
    JOIN products p ON opt.product_id = p.product_id
    WHERE o.order_id <= 10
    LIMIT 10;
""")

    rows = cursor.fetchall()
    for row in rows:
        print(row)
finally:
    # Close the connection even if the query fails (e.g. a table is missing)
    conn.close()


('Organic Celery Hearts', 1)
('Organic 4% Milk Fat Whole Milk Cottage Cheese', 1)
('Bag of Organic Bananas', 1)
('Organic Whole String Cheese', 1)
('Lightly Smoked Sardines in Olive Oil', 1)
('Organic Hass Avocado', 1)
('Bulgarian Yogurt', 1)
('Cucumber Kirby', 1)
