# Ingest and validate daily data

In [4]:
import sys
from pathlib import Path  # stdlib
# NOTE(review): absolute, machine-specific location — this notebook only runs where this folder exists.
ROOT = Path(r"C:\Users\quantbase\Desktop\ecom_forecast")
# Put the project root at the front of sys.path (once) before the `src.*` imports below run.
if str(ROOT) not in sys.path:
    sys.path.insert(0, str(ROOT))

from src.config import ProjectPaths  # project-local
from src.io import (  # project-local loaders, one per input table
    load_billing_location_sales,
    load_gross_sales,
    load_marketing,
    load_sessions,
    load_variant_sales,
)
from src.validate import qc_daily, require_columns  # project-local validation helpers

In [5]:
# Project configuration: resolve the project paths and read the assumptions file.
from __future__ import annotations  # must remain the first statement of this cell

import json  # stdlib
from pathlib import Path  # stdlib

import pandas as pd
import yaml

# Import through the `src` package, the same way the import cell does.
# Importing the same files as top-level `config` / `validate` would either fail
# (only the project root is on sys.path) or load a second copy of each module,
# rebinding `ProjectPaths` to a different class object than `src.config.ProjectPaths`.
from src.config import ProjectPaths
from src.validate import qc_daily, require_columns

# NOTE(review): absolute, machine-specific location — consider deriving it from the notebook's directory.
ROOT = Path(r"C:\Users\quantbase\Desktop\ecom_forecast")
P = ProjectPaths.from_root(ROOT)

# Decode the YAML explicitly as UTF-8: without `encoding`, read_text() uses the
# locale's preferred encoding, which on Windows is usually a legacy code page.
A = yaml.safe_load(P.assumptions_path.read_text(encoding="utf-8"))

# Analysis window as declared under the `project` key of the assumptions file.
DATE_START = A["project"]["date_start"]
DATE_END = A["project"]["date_end"]


In [6]:
# Build the path bundle from the explicit project root, matching the
# `ProjectPaths.from_root(ROOT)` call used for `P`, so that both objects point at
# the same tree regardless of the kernel's working directory.
# NOTE(review): assumes the no-argument default of from_root() was meant to be ROOT — confirm.
paths = ProjectPaths.from_root(ROOT)
paths.ensure_directories()

In [7]:
# Tables keyed by 'Day': load, order by day, and renumber the index from 0.
gross_sales = (
    load_gross_sales(paths)
    .sort_values('Day')
    .reset_index(drop=True)
)
sessions = (
    load_sessions(paths)
    .sort_values('Day')
    .reset_index(drop=True)
)
marketing = (
    load_marketing(paths)
    .sort_values('Day')
    .reset_index(drop=True)
)

# These two tables are not sorted here; they keep whatever order the loaders return.
billing_location_sales = load_billing_location_sales(paths)
variant_sales = load_variant_sales(paths)


In [8]:
# Columns each loaded table must expose, keyed by dataset name.
required_columns = {
    'gross_sales': [
        'Day',
        'Gross sales', 'Net sales', 'Discounts', 'Returns',
        'Taxes', 'Shipping charges', 'Total sales',
    ],
    'sessions': [
        'Day',
        'Online store visitors', 'Sessions', 'Bounce rate',
        'Average session duration', 'Conversion rate', 'Pageviews',
    ],
    'marketing': [
        'Day',
        'Meta_Spend', 'Meta_Impressions', 'Meta_Clicks', 'Meta_Reported_Sales',
        'Google_Spend', 'Google_Impressions', 'Google_Clicks', 'Google_Reported_Sales',
        'TikTok_Spend', 'TikTok_Impressions', 'TikTok_Clicks', 'TikTok_Reported_Sales',
        'Email_SMS_Cost',
    ],
    'billing_location_sales': [
        'Billing country', 'Shipping region', 'Orders',
        'Gross sales', 'Discounts', 'Returns', 'Net sales',
        'Shipping charges', 'Taxes', 'Total sales', 'Cost of goods sold',
    ],
    # Note the capitalisation here ('Gross Sales', 'Net Sales', 'Total Sales')
    # differs from the other tables.
    'variant_sales': [
        'Product title', 'Product variant title', 'Product variant SKU',
        'Net items sold', 'Gross Sales', 'Discounts', 'Returns', 'Net Sales',
        'Taxes', 'Total Sales', 'Cost of goods sold',
    ],
}

# The loaded frames under the same keys as the schema above.
datasets = {
    'gross_sales': gross_sales,
    'sessions': sessions,
    'marketing': marketing,
    'billing_location_sales': billing_location_sales,
    'variant_sales': variant_sales,
}

# Check every frame against its own column list.
for name, frame in datasets.items():
    require_columns(frame, required_columns[name])


In [9]:
# Run the daily QC helper on each 'Day'-keyed table and stack the results.
qc_rows = [
    qc_daily(table, label)
    for label, table in (
        ('gross_sales', gross_sales),
        ('sessions', sessions),
        ('marketing', marketing),
    )
]
qc_daily_summary = pd.concat(qc_rows, ignore_index=True)

# Hard stops: no gaps, no duplicate days, and every table spans the same fixed window.
assert qc_daily_summary['n_missing_days'].eq(0).all()
assert qc_daily_summary['n_duplicate_days'].eq(0).all()
assert (qc_daily_summary['min_day'] == '2025-09-17').all()
assert (qc_daily_summary['max_day'] == '2025-12-16').all()
qc_daily_summary


Unnamed: 0,dataset,min_day,max_day,missing_days,n_missing_days,duplicate_days,n_duplicate_days
0,gross_sales,2025-09-17,2025-12-16,[],0,[],0
1,sessions,2025-09-17,2025-12-16,[],0,[],0
2,marketing,2025-09-17,2025-12-16,[],0,[],0


In [10]:
# Write the QC summary to the QC directory in two formats: CSV and JSON records.
qc_csv = paths.qc_dir.joinpath('qc_daily.csv')
qc_json = paths.qc_dir.joinpath('qc_daily.json')

qc_daily_summary.to_csv(path_or_buf=qc_csv, index=False)
qc_daily_summary.to_json(path_or_buf=qc_json, orient='records', indent=2)

# Echo the summary as the cell output.
qc_daily_summary


Unnamed: 0,dataset,min_day,max_day,missing_days,n_missing_days,duplicate_days,n_duplicate_days
0,gross_sales,2025-09-17,2025-12-16,[],0,[],0
1,sessions,2025-09-17,2025-12-16,[],0,[],0
2,marketing,2025-09-17,2025-12-16,[],0,[],0


In [11]:
# Frames to persist, keyed by the file stem used for each pickle.
save_map = {
    'gross_sales': gross_sales,
    'sessions': sessions,
    'marketing': marketing,
    'billing_location_sales': billing_location_sales,
    'variant_sales': variant_sales,
}

# One pickle per frame, written into the clean-data directory.
for name, frame in save_map.items():
    target = f'{name}.pkl'
    frame.to_pickle(paths.clean_dir / target)

# Display the saved dataset names in alphabetical order.
sorted(save_map)


['billing_location_sales',
 'gross_sales',
 'marketing',
 'sessions',
 'variant_sales']

In [13]:
# Preview the first five rows of the sorted gross-sales table.
# NOTE(review): the saved output shows '(previous_year)' columns twice (second copy suffixed '.1') — confirm both are intended.
gross_sales.head(n=5)

Unnamed: 0,Day,Gross sales,Net sales,Discounts,Returns,Taxes,Shipping charges,Total sales,Day (previous_year),Gross sales (previous_year),...,Taxes (previous_year),Shipping charges (previous_year),Total sales (previous_year),Gross sales (previous_year).1,Net sales (previous_year),Discounts (previous_year),Returns (previous_year),Taxes (previous_year).1,Shipping charges (previous_year).1,Total sales (previous_year).1
0,2025-09-17,97074.6,84236.87,-215.5,-12622.23,9188.74,320.0,93745.61,2024-09-17,47833.31,...,2011.03,137.62,28377.63,102.943,221.159,21.091,40.827,356.917,132.524,230.35
1,2025-09-18,44746.99,31587.32,-973.19,-12186.48,3363.61,120.0,35070.93,2024-09-18,51766.22,...,3392.6,161.6,40488.37,-13.559,-14.476,59.2,2.09,-0.854,-25.742,-13.38
2,2025-09-19,45629.9,37746.52,-6.0,-7877.38,4546.7,75.0,42368.22,2024-09-19,47667.23,...,3520.99,122.59,40845.98,-4.274,1.462,91.695,24.201,29.131,-38.82,3.726
3,2025-09-20,57708.4,53603.15,-58.25,-4047.0,5856.59,75.0,59534.74,2024-09-20,48901.01,...,4193.57,120.0,43248.81,18.01,37.672,85.068,57.736,39.656,-37.5,37.656
4,2025-09-21,36737.7,34462.5,-48.8,-2226.4,3830.38,235.0,38527.88,2024-09-21,49365.69,...,4888.81,176.28,51636.99,-25.58,-26.001,48.029,17.537,-21.65,33.31,-25.387


In [14]:
# Preview the first five rows of the sorted sessions table.
sessions.head(n=5)

Unnamed: 0,Day,Online store visitors,Sessions,Bounce rate,Average session duration,Checkout conversion rate,Completed checkout rate,Conversion rate,Pageviews,Reached checkout rate,...,Reached checkout rate (previous_period),Online store visitors (previous_period),Sessions (previous_period),Bounce rate (previous_period),Average session duration (previous_period),Checkout conversion rate (previous_period),Completed checkout rate (previous_period),Conversion rate (previous_period),Pageviews (previous_period),Reached checkout rate (previous_period).1
0,2025-09-17,5525,7418,0.389997,305.992855,0.814286,0.818792,0.03842,49746,0.047183,...,0.036845,9.972134,13.407736,2.407562,27.763319,18.218589,12.045214,51.38892,76.975346,28.058474
1,2025-09-18,5356,7027,0.426498,244.719795,0.65625,0.745223,0.017931,41616,0.027323,...,0.025199,1.960784,3.551429,9.879044,3.992293,-1.5625,8.589627,6.735674,39.655693,8.429891
2,2025-09-19,8957,10654,0.3318,199.553219,0.732955,0.737179,0.012108,51030,0.01652,...,0.028221,68.42798,55.783009,-9.809937,-20.163069,6.361073,2.455454,-37.738729,68.160548,-41.462352
3,2025-09-20,4999,6720,0.35878,301.502232,0.770642,0.782383,0.025,44484,0.03244,...,0.024386,1.092012,6.413302,-8.935121,38.277058,13.027523,12.628822,50.357143,74.996066,33.027018
4,2025-09-21,4890,6317,0.392433,249.085484,0.702381,0.744828,0.01868,36791,0.026595,...,0.023294,-39.844999,-34.889713,-29.623177,36.02265,15.027605,14.588859,31.326786,6.270942,14.169799


In [15]:
# Preview the first five rows of the sorted marketing table.
marketing.head(n=5)

Unnamed: 0,Day,Meta_Spend,Meta_Impressions,Meta_Clicks,Meta_Reported_Sales,Google_Spend,Google_Impressions,Google_Clicks,Google_Reported_Sales,TikTok_Spend,TikTok_Impressions,TikTok_Clicks,TikTok_Reported_Sales,Email_SMS_Cost
0,2025-09-17,3500,140000,2100,10500,1200,40000,800,4800,500,100000,1000,750,150
1,2025-09-18,3200,128000,1920,8960,1100,36666,733,4180,400,80000,800,520,120
2,2025-09-19,4100,164000,2460,13120,1500,50000,1000,6300,800,160000,1600,1120,180
3,2025-09-20,3800,152000,2280,11400,1400,46666,933,5600,600,120000,1200,780,160
4,2025-09-21,3600,144000,2160,10080,1300,43333,866,4940,500,100000,1000,600,140
