In [1]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

FILE2 = "data/rep_s_00014_SMRY.csv"   # <-- adjust if your path differs
OUT_DIR = Path("../outputs")
OUT_DIR.mkdir(exist_ok=True)

def clean_num(x):
    """Turn '1,234.50' / '0.00' / NaN into float (or np.nan)."""
    if pd.isna(x):
        return np.nan
    s = str(x).strip()
    if s == "" or s.lower() == "nan":
        return np.nan
    s = s.replace(",", "")
    try:
        return float(s)
    except:
        return np.nan

def extract_year(df_raw):
    """Find 'Years:2025' anywhere and return 2025 as int."""
    for v in df_raw.values.ravel():
        if isinstance(v, str) and "Years:" in v:
            m = re.search(r"Years:(\d{4})", v)
            if m:
                return int(m.group(1))
    return np.nan

In [2]:
import pandas as pd
import numpy as np
import re

def clean_number(x):
    if pd.isna(x):
        return np.nan
    s = str(x).strip()
    if s == "" or s.lower() == "nan":
        return np.nan
    s = s.replace(",", "")
    s = re.sub(r"[%]", "", s)  # remove %
    try:
        return float(s)
    except:
        return np.nan

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path
import os

print("CWD:", os.getcwd())

DATA_DIR = Path("../data")  # because your notebook is inside /notebooks
OUT_DIR = Path("../outputs")
OUT_DIR.mkdir(exist_ok=True)

# show files to confirm file2 name
print("Data files:", [p.name for p in DATA_DIR.glob("*.csv")])

CWD: C:\projects\stories-coffee-analytics\notebooks
Data files: ['rep_s_00014_SMRY.csv', 'REP_S_00134_SMRY.csv', 'rep_s_00191_SMRY-3.csv', 'rep_s_00673_SMRY.csv']


In [4]:
import pandas as pd

file2_path = r"C:\projects\stories-coffee-analytics\data\rep_s_00014_SMRY.csv"
df_raw = pd.read_csv(file2_path, header=None)

df_raw.shape, df_raw.head(12)

((14585, 10),
                              0       1            2                   3  \
 0                      Stories     NaN          NaN                 NaN   
 1   Theoretical Profit By Item     NaN          NaN                 NaN   
 2                    22-Jan-26     NaN          NaN  Years:2025 Month:0   
 3                 Product Desc     Qty  Total Price                 NaN   
 4          Stories - Bir Hasan     NaN          NaN                 NaN   
 5                    TAKE AWAY     NaN          NaN                 NaN   
 6                    BEVERAGES     NaN          NaN                 NaN   
 7             COLD BAR SECTION     NaN          NaN                 NaN   
 8                1 SHOT DECAFE  404.00         0.00                 NaN   
 9                2 SHOT DECAFE  637.00         0.00                 NaN   
 10               3 SHOT DECAFE  113.00         0.00                 NaN   
 11      ADD BANANA SAUCE LARGE    4.00       266.67                 NaN  

In [5]:
header_mask = df_raw.apply(
    lambda r: r.astype(str).str.contains("Product Desc", case=False, na=False).any()
              and r.astype(str).str.contains("Qty", case=False, na=False).any(),
    axis=1
)

header_idxs = header_mask[header_mask].index.tolist()
header_idxs[:10], len(header_idxs)

([3, 38, 75, 112, 148, 184, 220, 257, 294, 331], 396)

In [6]:
tables = []

for i, h in enumerate(header_idxs):
    start = h + 1
    end = header_idxs[i + 1] if i + 1 < len(header_idxs) else len(df_raw)

    block = df_raw.iloc[start:end].copy()

    # rename columns using the header row
    block.columns = df_raw.loc[h].astype(str).str.strip()

    # keep only rows that actually have a product description
    block = block[block["Product Desc"].notna()]

    tables.append(block)

df_items = pd.concat(tables, ignore_index=True)

df_items.shape, df_items.head(10)

((14186, 10),
               Product Desc     Qty Total Price  NaN Total Cost Total Cost %  \
 0      Stories - Bir Hasan     NaN         NaN  NaN        NaN          NaN   
 1                TAKE AWAY     NaN         NaN  NaN        NaN          NaN   
 2                BEVERAGES     NaN         NaN  NaN        NaN          NaN   
 3         COLD BAR SECTION     NaN         NaN  NaN        NaN          NaN   
 4            1 SHOT DECAFE  404.00        0.00  NaN   3,856.85         0.00   
 5            2 SHOT DECAFE  637.00        0.00  NaN  12,162.45         0.00   
 6            3 SHOT DECAFE  113.00        0.00  NaN   3,236.32         0.00   
 7   ADD BANANA SAUCE LARGE    4.00      266.67  NaN     139.97        52.49   
 8  ADD BANANA SAUCE MEDIUM   47.00    1,487.39  NaN     822.34        55.29   
 9   ADD BANANA SAUCE SMALL   28.00      893.72  NaN     489.91        54.82   
 
   Total Profit  NaN Total Profit %  NaN  
 0          NaN  NaN            NaN  NaN  
 1          NaN  N

In [7]:
import numpy as np

def clean_number(x):
    try:
        if pd.isna(x):
            return np.nan
        return float(str(x).replace(",", ""))
    except:
        return np.nan

num_cols = ["Qty", "Total Price", "Total Cost", "Total Profit", "Total Profit %"]

for c in num_cols:
    if c in df_items.columns:
        df_items[c] = df_items[c].map(clean_number)

df_items[num_cols].describe()


Unnamed: 0,Qty,Total Price,Total Cost,Total Profit,Total Profit %
count,13143.0,13143.0,13143.0,13143.0,13143.0
mean,2487.763518,116816.1,92151.67,227584.5,66.570349
std,23475.164784,438958.0,862105.3,2145991.0,198.307554
min,-6.0,-1200.0,-3996.09,-1040870.0,-16619.04
25%,12.0,1000.0,518.53,641.81,65.22
50%,84.0,9459.46,3500.03,6488.56,74.52
75%,412.95,55266.99,18174.1,37519.19,81.11
max,803914.8,6489117.0,30826760.0,77771420.0,100.0


In [8]:
df_items["source_file"] = "rep_s_00014_SMRY.csv"

In [9]:
from pathlib import Path

OUT_DIR = Path("../outputs")
OUT_DIR.mkdir(exist_ok=True)

out2 = OUT_DIR / "clean_items_file2.csv"
df_items.to_csv(out2, index=False)

print("Saved:", out2.resolve())

Saved: C:\projects\stories-coffee-analytics\outputs\clean_items_file2.csv
