# OLAP: Execute Queries from the Retail Data Warehouse (Section 1 Task 3)

This notebook runs the roll-up, drill-down, and slice queries against the data warehouse (`outputs/db/retail_dw.db`), saves the query outputs as CSV files, produces visualizations as PNG images, and writes an analysis report in Markdown.

## 3.1 imports & paths

In [1]:
import sqlite3
import pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt
import textwrap

# Project layout: every path is resolved relative to the parent of the
# current working directory.
PROJECT_ROOT = Path.cwd().parent

# Inputs: the SQLite data warehouse and the SQL file holding the OLAP queries.
DB_PATH = PROJECT_ROOT.joinpath('outputs', 'db', 'retail_dw.db')
SQL_PATH = PROJECT_ROOT.joinpath('sql', 'olap_task3.sql')

# Outputs: charts go to IMAGES_DIR, CSV files to REPORTS_DIR.
IMAGES_DIR = PROJECT_ROOT.joinpath('outputs', 'images')
REPORTS_DIR = PROJECT_ROOT.joinpath('outputs', 'reports')

# Create both output folders (and any missing parents) up front.
for _out_dir in (IMAGES_DIR, REPORTS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

print("DB_PATH =", DB_PATH)
print("SQL_PATH =", SQL_PATH)

DB_PATH = c:\Users\HP\Documents\DSA2040_Practical_Exam_Geoffrey_Mwangi_566\outputs\db\retail_dw.db
SQL_PATH = c:\Users\HP\Documents\DSA2040_Practical_Exam_Geoffrey_Mwangi_566\sql\olap_task3.sql


## 3.2 Load the SQL file text and split it into individual queries — fall back to inline SQL if the file is not found

In [2]:
# Load the OLAP queries (roll-up, drill-down, slice) as one block of SQL text.
# Prefer the project's SQL file; if it is missing, use the inline copy below.
if SQL_PATH.exists():
    sql_text = SQL_PATH.read_text()
else:
    # Inline fallback copy of the three queries
    sql_text = '''
    -- Roll-up
    SELECT c.country, t.year, t.quarter, SUM(s.total_sales) AS total_sales, COUNT(DISTINCT s.invoice_no) AS num_invoices
    FROM SalesFact s
    JOIN CustomerDim c ON s.customer_id = c.customer_id
    JOIN TimeDim t     ON s.time_id = t.time_id
    GROUP BY c.country, t.year, t.quarter
    ORDER BY t.year, t.quarter, total_sales DESC;

    -- Drill-down (UK example)
    SELECT t.year, t.month, SUM(s.total_sales) AS total_sales, COUNT(DISTINCT s.invoice_no) AS num_invoices, COUNT(*) AS rows
    FROM SalesFact s
    JOIN CustomerDim c ON s.customer_id = c.customer_id
    JOIN TimeDim t     ON s.time_id = t.time_id
    WHERE c.country = 'United Kingdom'
    GROUP BY t.year, t.month
    ORDER BY t.year, t.month;

    -- Slice (Electronics)
    SELECT p.category, SUM(s.total_sales) AS total_sales, COUNT(DISTINCT s.invoice_no) AS num_invoices
    FROM SalesFact s
    JOIN ProductDim p ON s.product_id = p.product_id
    WHERE p.category = 'Electronics'
    GROUP BY p.category;
    '''


def _has_sql_statement(fragment):
    """Return True if `fragment` has at least one line that is neither blank nor a `--` comment."""
    return any(
        line.strip() and not line.strip().startswith('--')
        for line in fragment.splitlines()
    )


# Split the text into statements on ';'. Fragments that are empty or hold only
# `--` comments (e.g. a trailing note after the last ';') are dropped so they
# are not counted as queries.
# NOTE: this is a plain text split; a ';' inside a string literal or comment,
# or a /* ... */ block comment, is not recognised.
queries = [q.strip() for q in sql_text.split(';') if _has_sql_statement(q)]
print(f"Found {len(queries)} SQL blocks (split by ';').")

Found 4 SQL blocks (split by ';').


## 3.3 execute roll-up query and save CSV + create visualization

In [3]:
# Fail fast when the warehouse file is missing: sqlite3.connect() would
# otherwise create a new, empty database at DB_PATH and the query below would
# fail later with a confusing "no such table" error.
if not DB_PATH.exists():
    raise FileNotFoundError(f"Data warehouse not found: {DB_PATH}")

# The connection is left open on purpose: the drill-down and slice cells
# reuse `conn`, and the slice cell closes it.
conn = sqlite3.connect(DB_PATH)

# Roll-up: total sales by country & quarter
rollup_q = """
SELECT c.country, t.year, t.quarter, SUM(s.total_sales) AS total_sales, COUNT(DISTINCT s.invoice_no) AS num_invoices
FROM SalesFact s
JOIN CustomerDim c ON s.customer_id = c.customer_id
JOIN TimeDim t     ON s.time_id = t.time_id
GROUP BY c.country, t.year, t.quarter
ORDER BY t.year, t.quarter, total_sales DESC
"""
df_rollup = pd.read_sql_query(rollup_q, conn)

# Build each output path once so the saved file and the log line cannot drift apart.
rollup_csv = REPORTS_DIR / 'rollup_country_quarter.csv'
df_rollup.to_csv(rollup_csv, index=False)
print("Saved roll-up CSV to", rollup_csv)

# Visualization 1: total sales by country (aggregated over all quarters),
# sorted so the largest markets appear first.
agg_by_country = (
    df_rollup.groupby('country', as_index=False)['total_sales']
    .sum()
    .sort_values('total_sales', ascending=False)
)

country_chart = IMAGES_DIR / 'sales_by_country.png'
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(agg_by_country['country'].astype(str), agg_by_country['total_sales'])
ax.set_ylabel('Total Sales')
ax.set_title('Total Sales by Country (all quarters)')
# Slant the country labels so long names do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
fig.tight_layout()
fig.savefig(country_chart)
plt.close(fig)  # the chart is written to disk only; release the figure
print("Saved chart:", country_chart)

Saved roll-up CSV to c:\Users\HP\Documents\DSA2040_Practical_Exam_Geoffrey_Mwangi_566\outputs\reports\rollup_country_quarter.csv
Saved chart: c:\Users\HP\Documents\DSA2040_Practical_Exam_Geoffrey_Mwangi_566\outputs\images\sales_by_country.png


## 3.4 execute drill-down query — parameterize country variable — save CSV + visualization

In [4]:
# Drill-down: sales by month for a chosen country
country_of_interest = 'United Kingdom'
# Filename-friendly form of the country name, shared by the CSV and PNG outputs.
country_slug = country_of_interest.replace(" ", "_")

# Plain (non-f) string on purpose: the country is bound through the `?`
# placeholder, never interpolated into the SQL text.
drill_q = """
SELECT t.year, t.month, SUM(s.total_sales) AS total_sales, COUNT(DISTINCT s.invoice_no) AS num_invoices
FROM SalesFact s
JOIN CustomerDim c ON s.customer_id = c.customer_id
JOIN TimeDim t     ON s.time_id = t.time_id
WHERE c.country = ?
GROUP BY t.year, t.month
ORDER BY t.year, t.month
"""
df_drill = pd.read_sql_query(drill_q, conn, params=(country_of_interest,))

drill_csv = REPORTS_DIR / f'drilldown_{country_slug}_by_month.csv'
df_drill.to_csv(drill_csv, index=False)
print("Saved drill-down CSV to", drill_csv)

# Visualization 2: monthly sales for the selected country
if not df_drill.empty:
    # Assemble a first-of-month timestamp from the numeric year/month columns.
    # This avoids building and re-parsing 'YYYY-M-01' strings, which depends on
    # date-format inference. The CSV above was saved before this column is added.
    df_drill['date'] = pd.to_datetime(df_drill[['year', 'month']].assign(day=1))

    drill_chart = IMAGES_DIR / f'drilldown_{country_slug}_monthly.png'
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(df_drill['date'], df_drill['total_sales'], marker='o')
    ax.set_title(f'Monthly Total Sales - {country_of_interest}')
    ax.set_ylabel('Total Sales')
    ax.set_xlabel('Month')
    ax.tick_params(axis='x', labelrotation=45)
    fig.tight_layout()
    fig.savefig(drill_chart)
    plt.close(fig)  # the chart is written to disk only; release the figure
    print("Saved chart:", drill_chart)
else:
    print("No drill-down rows for", country_of_interest)


Saved drill-down CSV to c:\Users\HP\Documents\DSA2040_Practical_Exam_Geoffrey_Mwangi_566\outputs\reports\drilldown_United_Kingdom_by_month.csv
Saved chart: c:\Users\HP\Documents\DSA2040_Practical_Exam_Geoffrey_Mwangi_566\outputs\images\drilldown_United_Kingdom_monthly.png


## 3.5 execute slice query for Electronics and save CSV

In [5]:
# Slice: totals for a single product category.
category_of_interest = 'Electronics'

# The category is bound through the `?` placeholder, so changing the slice
# only requires editing `category_of_interest`.
slice_q = """
SELECT p.category, SUM(s.total_sales) AS total_sales, COUNT(DISTINCT s.invoice_no) AS num_invoices
FROM SalesFact s
JOIN ProductDim p ON s.product_id = p.product_id
WHERE p.category = ?
GROUP BY p.category
"""

# This is the last query in this cell, so close the connection even if the query raises.
try:
    df_slice = pd.read_sql_query(slice_q, conn, params=(category_of_interest,))
finally:
    conn.close()

# For the default category this resolves to 'slice_electronics_total_sales.csv'.
category_slug = category_of_interest.lower().replace(" ", "_")
slice_csv = REPORTS_DIR / f'slice_{category_slug}_total_sales.csv'
df_slice.to_csv(slice_csv, index=False)
print("Saved slice CSV to", slice_csv)

# Preview the first rows of all three result sets as the cell output.
df_rollup.head(), df_drill.head(), df_slice.head()

Saved slice CSV to c:\Users\HP\Documents\DSA2040_Practical_Exam_Geoffrey_Mwangi_566\outputs\reports\slice_electronics_total_sales.csv


(          country  year  quarter  total_sales  num_invoices
 0  United Kingdom  2010        4    498661.85          1291
 1         Germany  2010        4     15241.14            30
 2          France  2010        4      9616.31            21
 3            EIRE  2010        4      8813.88            15
 4     Netherlands  2010        4      8784.48             3,
    year  month  total_sales  num_invoices       date
 0  2010     12   498661.850          1291 2010-12-01
 1  2011      1   442190.060           874 2011-01-01
 2  2011      2   355655.630           896 2011-02-01
 3  2011      3   467198.590          1177 2011-03-01
 4  2011      4   409559.141          1058 2011-04-01,
       category  total_sales  num_invoices
 0  Electronics      3752.74           264)