In [None]:
# --- 1. SETUP AND AUTHENTICATION ---
import pandas as pd
import os
from google.colab import auth
from google.cloud import bigquery

# Authenticate your user account and set the GCP project
# `auth` is the google.colab helper imported at the top of this cell, so this
# step is expected to run inside a Colab runtime.
print("Authenticating user...")
auth.authenticate_user()
# Project id is hard-coded here; change it in this one place to target
# another GCP project.
project_id = 'mgmt599-whitese-labs'
# IPython shell escape: `{project_id}` is interpolated from the Python
# variable above before the command is handed to the shell.
!gcloud config set project {project_id}
print(f"✅ Authenticated and project set to {project_id}")

# --- 2. DOWNLOAD DATA FROM KAGGLE ---
print("\nDownloading dataset from Kaggle...")
# Set up Kaggle API credentials
# `kaggle.json` is copied from the current working directory, so it has to be
# present there before this cell runs.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the credentials file to owner read/write.
!chmod 600 ~/.kaggle/kaggle.json

# Download and unzip the dataset
!kaggle datasets download -d anirudhchauhan/retail-store-inventory-forecasting-dataset
# -o: overwrite already-extracted files without prompting, so re-running the
# cell does not block on an interactive question.
!unzip -o retail-store-inventory-forecasting-dataset.zip

# NOTE(review): this message is printed unconditionally; `!` shell escapes
# presumably do not raise on a non-zero exit status, so it may appear even if
# a command above failed — confirm, and check the command output above.
print("✅ Dataset downloaded and unzipped.")

# --- 3. LOAD DATA ---
# Reads the extracted CSV into `df`, the DataFrame every later cell works on.
print("\nLoading dataset...")
file_name = 'retail_store_inventory.csv'

try:
    df = pd.read_csv(file_name)
except FileNotFoundError:
    print(f"\n❌ ERROR: The file '{file_name}' was not found after download.")
    # Re-raise instead of swallowing: without the file `df` is never defined,
    # and every later cell would otherwise fail with an unrelated NameError.
    raise
print(f"✅ Successfully loaded {file_name}")

# --- 4. CLEAN DATA ---
# Kept outside the try block so that only the file read is guarded; cleaning
# errors (e.g. a missing column) surface with their own traceback.
print("\nCleaning data...")
# Clean column names for BigQuery compatibility: spaces and slashes become
# underscores, everything is lower-cased. regex=False makes the replacements
# literal regardless of the installed pandas version's default.
df.columns = (
    df.columns
    .str.replace(' ', '_', regex=False)
    .str.replace('/', '_', regex=False)
    .str.lower()
)
# Convert date column to the correct datetime format
df['date'] = pd.to_datetime(df['date'])
# Fix impossible negative values in the demand forecast by flooring them at 0
df.loc[df['demand_forecast'] < 0, 'demand_forecast'] = 0
print("✅ Data cleaning complete.")

Authenticating user...
Updated property [core/project].
✅ Authenticated and project set to mgmt599-whitese-labs

Downloading dataset from Kaggle...
Dataset URL: https://www.kaggle.com/datasets/anirudhchauhan/retail-store-inventory-forecasting-dataset
License(s): CC0-1.0
Downloading retail-store-inventory-forecasting-dataset.zip to /content
  0% 0.00/1.51M [00:00<?, ?B/s]
100% 1.51M/1.51M [00:00<00:00, 677MB/s]
Archive:  retail-store-inventory-forecasting-dataset.zip
  inflating: retail_store_inventory.csv  
✅ Dataset downloaded and unzipped.

Loading dataset...
✅ Successfully loaded retail_store_inventory.csv

Cleaning data...
✅ Data cleaning complete.


In [None]:
# --- DISCOVER: Competitive Pricing Analysis ---
# Works on the cleaned DataFrame `df` produced by the setup cell.

# One row per category: mean own price, mean competitor price, and the gap
# between the two (own minus competitor), ordered from the largest gap down.
pricing_df = (
    df.groupby('category')[['price', 'competitor_pricing']]
    .mean()
    .rename(columns={
        'price': 'avg_our_price',
        'competitor_pricing': 'avg_competitor_price',
    })
    .reset_index()
    .assign(
        avg_price_difference=lambda means: (
            means['avg_our_price'] - means['avg_competitor_price']
        )
    )
    .sort_values(by='avg_price_difference', ascending=False)
)

print("--- DISCOVER: Competitive Pricing Analysis ---")
display(pricing_df)

--- DISCOVER: Competitive Pricing Analysis ---


Unnamed: 0,category,avg_our_price,avg_competitor_price,avg_price_difference
4,Toys,55.03241,55.005455,0.026955
1,Electronics,55.310763,55.301609,0.009154
2,Furniture,55.175861,55.197196,-0.021335
0,Clothing,54.886602,54.920652,-0.03405
3,Groceries,55.271222,55.306665,-0.035443


### DIVE Stage 1: Discover - Competitive Landscape

**Prompt to Gemini:**

As a Risk & Strategy Analyst reviewing the following competitive pricing data, what are the top 3 strategic threats or opportunities that jump out? For each, state the potential business impact.

**Data Snapshot:**
--- DISCOVER: Competitive Pricing Analysis ---

	category 	avg_our_price 	avg_competitor_price 	avg_price_difference
4 	Toys 	55.032410 	55.005455 	0.026955
1 	Electronics 	55.310763 	55.301609 	0.009154
2 	Furniture 	55.175861 	55.197196 	-0.021335
0 	Clothing 	54.886602 	54.920652 	-0.034050
3 	Groceries 	55.271222 	55.306665 	-0.035443

In [None]:
import pandas as pd

# This code assumes your cleaned DataFrame 'df' is loaded from the previous steps

print("--- INVESTIGATE: Calculating Community Signatures ---")

# Unit totals at every grain the index needs, from coarsest to finest.
total_sales = df['units_sold'].sum()
total_category_sales = df.groupby('category')['units_sold'].sum()
total_store_sales = df.groupby('store_id')['units_sold'].sum()
store_category_sales = df.groupby(['store_id', 'category'])['units_sold'].sum()

# Localization index = (category's share of the store's units)
#                    / (category's share of all units sold).
# The (store_id, category) series is divided by series indexed on a single
# one of those levels; pandas aligns them on the shared index level name.
share_within_store = store_category_sales / total_store_sales
share_company_wide = total_category_sales / total_sales
localization_df = (share_within_store / share_company_wide).reset_index(
    name='localization_index'
)

# Strongest signatures first; keep the top 20 rows.
top_signatures = localization_df.sort_values(
    by='localization_index', ascending=False
).head(20)

print("\n--- Top 20 Community Signatures (Localization Index) ---")
display(top_signatures)

--- INVESTIGATE: Calculating Community Signatures ---

--- Top 20 Community Signatures (Localization Index) ---


Unnamed: 0,store_id,category,localization_index
15,S004,Clothing,1.043518
14,S003,Toys,1.022448
18,S004,Groceries,1.021741
22,S005,Furniture,1.018526
4,S001,Toys,1.015464
24,S005,Toys,1.0118
6,S002,Electronics,1.0099
11,S003,Electronics,1.007345
3,S001,Groceries,1.00709
7,S002,Furniture,1.00628


### DIVE Stage 2: Investigate - Strategic Risks

**Prompt to Gemini:**

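Our analysis computes a "Localization Index" for each store and category: the category's share of that store's unit sales divided by the category's share of company-wide unit sales. An index above 1.0 means the store sells proportionally more of that category than the company as a whole. The top examples are: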

--- INVESTIGATE: Calculating Community Signatures ---

--- Top 20 Community Signatures (Localization Index) ---

	store_id 	category 	localization_index
15 	S004 	Clothing 	1.043518
14 	S003 	Toys 	1.022448
18 	S004 	Groceries 	1.021741
22 	S005 	Furniture 	1.018526
4 	S001 	Toys 	1.015464

As a strategist, what is the primary business risk of ignoring these "Community Signatures" and implementing a uniform, company-wide product and marketing strategy?

In [None]:
import pandas as pd

# This code assumes your cleaned DataFrame 'df' is loaded from the previous steps

print("--- VALIDATE: Analyzing Store Sales Volatility ---")

# Total units sold per store per day.
daily_store_sales = (
    df.groupby(['store_id', 'date'])['units_sold']
    .sum()
    .reset_index()
)

# Both per-store metrics are computed from the same grouped daily totals.
daily_units_by_store = daily_store_sales.groupby('store_id')['units_sold']

# Volatility = standard deviation of a store's daily totals.
store_volatility = (
    daily_units_by_store.std().rename('sales_volatility').reset_index()
)

# Mean daily total, shown next to the volatility for context.
avg_store_sales = (
    daily_units_by_store.mean().rename('avg_daily_sales').reset_index()
)

# Join the two metrics per store and list the most volatile stores first.
validation_df = (
    avg_store_sales
    .merge(store_volatility, on='store_id')
    .sort_values(by='sales_volatility', ascending=False)
)

print("\n--- Store Performance: Average Sales vs. Volatility ---")
display(validation_df)

--- VALIDATE: Analyzing Store Sales Volatility ---

--- Store Performance: Average Sales vs. Volatility ---


Unnamed: 0,store_id,avg_daily_sales,sales_volatility
4,S005,2749.898769,494.670457
2,S003,2767.025992,485.655082
0,S001,2702.804378,480.354725
1,S002,2719.172367,467.078666
3,S004,2707.585499,462.29124


### DIVE Stage 3: Validate - Testing Strategic Assumptions

**Prompt to Gemini:**

Our analysis so far suggests our stores are very similar. The working assumption is that a uniform operational strategy is the most efficient approach. However, the data below shows the sales volatility (unpredictability) for each store.

--- VALIDATE: Analyzing Store Sales Volatility ---

--- Store Performance: Average Sales vs. Volatility ---

	store_id 	avg_daily_sales 	sales_volatility
4 	S005 	2749.898769 	494.670457
2 	S003 	2767.025992 	485.655082
0 	S001 	2702.804378 	480.354725
1 	S002 	2719.172367 	467.078666
3 	S004 	2707.585499 	462.291240

As a Devil's Advocate, how does this volatility data invalidate our "stores are similar" assumption? What hidden risks does this expose for inventory and staffing, even if the product mix is the same across stores?

In [None]:
import pandas as pd

# This code assumes your cleaned DataFrame 'df' is loaded from the previous steps

print("--- EXTEND: Analyzing Forecast Bias ---")

# Signed error per row: actual units sold minus forecast demand.
# This adds (or overwrites) a `forecast_error` column on the shared `df`.
df['forecast_error'] = df['units_sold'].sub(df['demand_forecast'])

# Mean forecast error for every category (rows) x weather condition (columns).
forecast_bias_df = pd.pivot_table(
    df,
    values='forecast_error',
    index='category',
    columns='weather_condition',
    aggfunc='mean',
)

print(
    "\n--- Forecast Bias Analysis (Average Forecast Error by Weather) ---",
    "Positive values = Under-forecasted (missed sales opportunities)",
    "Negative values = Over-forecasted (wasted inventory)",
    sep="\n",
)

# Colour each row independently (axis=1) so the gradient compares weather
# conditions within a category.
bias_heatmap = forecast_bias_df.style.background_gradient(cmap='RdYlGn', axis=1)
display(bias_heatmap)

--- EXTEND: Analyzing Forecast Bias ---

--- Forecast Bias Analysis (Average Forecast Error by Weather) ---
Positive values = Under-forecasted (missed sales opportunities)
Negative values = Over-forecasted (wasted inventory)


weather_condition,Cloudy,Rainy,Snowy,Sunny
category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Clothing,-5.043003,-5.307165,-5.085514,-4.948669
Electronics,-4.971938,-4.993774,-5.083094,-5.068075
Furniture,-5.063531,-5.014751,-5.142836,-5.145829
Groceries,-4.877811,-5.132261,-4.986688,-4.945563
Toys,-5.212092,-4.957728,-5.03168,-5.261425


### DIVE Stage 4: Extend - Strategic Recommendations

**Prompt to Gemini:**

Based on our validated findings:
1.  The company cannot compete on price.
2.  Its stores are operationally similar but have different risk profiles due to sales volatility.
3.  The company's internal demand forecast is systematically flawed, especially in relation to predictable events like the weather, as shown in the data below.

--- EXTEND: Analyzing Forecast Bias ---

--- Forecast Bias Analysis (Average Forecast Error by Weather) ---
Positive values = Under-forecasted (missed sales opportunities)
Negative values = Over-forecasted (wasted inventory)

weather_condition 	Cloudy 	Rainy 	Snowy 	Sunny
category
Clothing 	-5.043003 	-5.307165 	-5.085514 	-4.948669
Electronics 	-4.971938 	-4.993774 	-5.083094 	-5.068075
Furniture 	-5.063531 	-5.014751 	-5.142836 	-5.145829
Groceries 	-4.877811 	-5.132261 	-4.986688 	-4.945563
Toys 	-5.212092 	-4.957728 	-5.031680 	-5.261425

I want you to act as a Chief Executive and Strategy Officer. Based on this critical internal failure, develop three strategic initiatives with specific KPIs to transform the company's forecasting and inventory management capabilities. Propose a 90-day implementation timeline and outline the expected ROI.