# Cohort 700 SKU Data Google Sheet Updater

This notebook:
1. Fetches SKU data for cohort 700 from Snowflake
2. Updates a Google Sheet with 3 tabs:
   - **Tab 1: Raw Data** - All SKUs with details (product_id, product_name, packing_unit_id, etc.)
   - **Tab 2: SKU Aggregation** - SKU level mapping percentage (mapped SKUs / total SKUs per product)
   - **Tab 3: Cat-Brand Aggregation** - Category-Brand level mapping percentage
3. Clears existing data before updating
4. Designed to run weekly

**Note:** Before running, set the `workbook_name` variable in the final "Update Google Sheet" cell to the name of your Google Sheet.


In [None]:
# =============================================================================
# IMPORTS & SETUP
# =============================================================================
import pandas as pd
import os
import sys
from pathlib import Path

# Put the current working directory (Path.cwd(), not its parent) at the front
# of sys.path so the project-local `common_functions` module can be imported.
# NOTE(review): assumes the notebook is launched from the directory that
# contains common_functions — confirm.
sys.path.insert(0, str(Path.cwd()))

# Project helpers used by the cells below: query runner, sheet writer, env setup.
from common_functions import snowflake_query, google_sheets, initialize_env

print("✓ Imports loaded successfully")


In [None]:
# =============================================================================
# INITIALIZE ENVIRONMENT
# =============================================================================
# Run the project's environment bootstrap before the Snowflake and Google
# Sheets helpers are called.
# NOTE(review): presumably loads the credentials/config those helpers need —
# confirm in common_functions.
print("Initializing environment...")
initialize_env()
print("✓ Environment initialized")


In [None]:
# =============================================================================
# READ SQL QUERY
# =============================================================================
# The SQL text lives in queries/cohort_700_sku_data.sql, resolved relative to
# the current working directory; it is loaded into `query` as UTF-8 text.
query_file = Path('queries', 'cohort_700_sku_data.sql')
query = query_file.read_text(encoding='utf-8')

print(
    f"✓ Query loaded from {query_file}",
    f"  Query length: {len(query)} characters",
    sep="\n",
)


In [None]:
# =============================================================================
# EXECUTE QUERY ON SNOWFLAKE
# =============================================================================
print("Executing query on Snowflake...", "(This may take a few moments...)", sep="\n")

# Hand the loaded SQL to the project helper; the result is kept as df_raw and
# feeds every later cell.
# NOTE(review): 'Egypt' is presumably the target connection/country — confirm.
df_raw = snowflake_query('Egypt', query)
row_count = len(df_raw)

print("✓ Query executed successfully")
print(f"✓ Retrieved {row_count} rows")
print(f"\nColumns: {list(df_raw.columns)}")

# Preview the result, or warn when the query came back empty
if row_count == 0:
    print("\n⚠ Warning: No data returned from query!")
else:
    print("\nFirst 5 rows:")
    display(df_raw.head())


In [None]:
# =============================================================================
# CREATE SKU LEVEL AGGREGATION
# =============================================================================
# Collapse df_raw to one row per product_id (SKU level, not packing unit level)
# and compute the share of its rows that are mapped.
print("Creating SKU level aggregation...")

# Named aggregation labels every output column explicitly, so nothing depends
# on the positional order of a flattened MultiIndex.
sku_agg = (
    df_raw
    .groupby('product_id')
    .agg(
        product_name=('product_name', 'first'),
        cat=('cat', 'first'),
        brand=('brand', 'first'),
        # Sum of the ben_soliman flag; null values are skipped by sum
        mapped_skus=('ben_soliman', 'sum'),
        # 'size' counts every row of the product. 'count' would skip rows whose
        # ben_soliman is null, shrinking the denominator (and producing 0/0 = NaN
        # when all rows are null); 'size' also matches the row count used by the
        # Cat-Brand aggregation.
        total_skus=('ben_soliman', 'size'),
    )
    .reset_index()
)

# Calculate mapping percentage (total_skus is always >= 1 for an existing group)
sku_agg['mapping_percentage'] = (
    sku_agg['mapped_skus'] / sku_agg['total_skus'] * 100
).round(2)

# Select and order columns for the sheet
df_sku_agg = sku_agg[['product_id', 'product_name', 'cat', 'brand',
                      'total_skus', 'mapped_skus', 'mapping_percentage']]

print(f"✓ Created aggregation for {len(df_sku_agg)} SKUs")
print("\nFirst 5 rows of SKU aggregation:")
display(df_sku_agg.head())


In [None]:
# =============================================================================
# CREATE CAT-BRAND LEVEL AGGREGATION
# =============================================================================
# Summarise df_raw per (cat, brand) pair: distinct products, mapped rows,
# total rows and the resulting mapping percentage.
print("Creating Cat-Brand level aggregation...")

# One grouped pass with named aggregation; this replaces a second groupby plus
# merge and avoids renaming columns by position.
# Note: groupby's default dropna=True leaves out rows whose cat or brand is null.
cat_brand_agg = (
    df_raw
    .groupby(['cat', 'brand'])
    .agg(
        unique_products=('product_id', 'nunique'),  # Count unique products
        mapped_skus=('ben_soliman', 'sum'),         # Sum of mapped SKUs
        total_skus=('ben_soliman', 'size'),         # Count of all rows, nulls included
    )
    .reset_index()
)

# Calculate mapping percentage
cat_brand_agg['mapping_percentage'] = (
    cat_brand_agg['mapped_skus'] / cat_brand_agg['total_skus'] * 100
).round(2)

# Select and order columns, then sort by cat and brand
df_cat_brand_agg = cat_brand_agg[['cat', 'brand', 'unique_products',
                                  'total_skus', 'mapped_skus', 'mapping_percentage']]
df_cat_brand_agg = df_cat_brand_agg.sort_values(['cat', 'brand'])

print(f"✓ Created aggregation for {len(df_cat_brand_agg)} cat-brand combinations")
print("\nFirst 5 rows of Cat-Brand aggregation:")
display(df_cat_brand_agg.head())


In [None]:
# =============================================================================
# UPDATE GOOGLE SHEET
# =============================================================================
# TODO: Replace with actual Google Sheet name when provided
workbook_name = "Cohort 700 SKU Mapping Data"  # Update this with the actual sheet name
banner = "=" * 70

print(f"Updating Google Sheet: {workbook_name}")
print(banner)

# (tab title, DataFrame) pairs, listed in the order the tabs are written
tab_frames = [
    ('Raw Data', df_raw),
    ('SKU Aggregation', df_sku_agg),
    ('Cat-Brand Aggregation', df_cat_brand_agg),
]

# Each tab is written through the project helper in 'overwrite' mode
for tab_no, (tab_title, tab_df) in enumerate(tab_frames, start=1):
    print(f"\n{tab_no}. Clearing and updating Tab {tab_no}: {tab_title}...")
    google_sheets(workbook_name, tab_title, 'overwrite', df=tab_df)
    print(f"   ✓ Tab {tab_no} updated")

print("\n" + banner)
print(f"✓ Successfully updated all tabs in {workbook_name}")
print(banner)
