In [1]:
import pandas as pd
from pathlib import Path

DATA_DIR = Path(r"C:\projects\stories-coffee-analytics\data")
file3_path = DATA_DIR / "rep_s_00191_SMRY-3.csv"

df3_raw = pd.read_csv(file3_path, header=None)
df3_raw.shape, df3_raw.head(10)

((14140, 5),
                              0                    1       2             3  \
 0                      Stories                  NaN     NaN           NaN   
 1      Sales by Items By Group                  NaN     NaN           NaN   
 2                    19-Jan-26  Years:2025 Months:0     NaN     Page 1 of   
 3                  Description              Barcode     Qty  Total Amount   
 4  Branch: Stories - Bir Hasan                  NaN     NaN           NaN   
 5    Division: HOT BAR SECTION                  NaN     NaN           NaN   
 6          Group: BLACK COFFEE                  NaN     NaN           NaN   
 7                     ESPRESSO                  NaN  3313.0    500,103.75   
 8              DOUBLE ESPRESSO                  NaN  3200.0    498,631.33   
 9              AMERICANO SMALL                  NaN   902.0    176,816.53   
 
        4  
 0    NaN  
 1    NaN  
 2  359.0  
 3    NaN  
 4    NaN  
 5    NaN  
 6    NaN  
 7    NaN  
 8    NaN  
 9    N

In [2]:
header_mask = df3_raw.apply(
    lambda r: r.astype(str).str.contains("Product", case=False, na=False).any()
              and r.astype(str).str.contains("Qty", case=False, na=False).any(),
    axis=1
)

header_idxs = header_mask[header_mask].index.tolist()
header_idxs[:10], len(header_idxs)

([], 0)

In [3]:
blocks = []

for i, h in enumerate(header_idxs):
    start = h + 1
    end = header_idxs[i+1] if i+1 < len(header_idxs) else len(df3_raw)
    
    block = df3_raw.iloc[start:end].copy()
    block.columns = df3_raw.iloc[h]
    blocks.append(block)

df_items3 = pd.concat(blocks, ignore_index=True)
df_items3.head()

ValueError: No objects to concatenate

In [4]:
import pandas as pd
import numpy as np

file3_path = r"C:\projects\stories-coffee-analytics\data\rep_s_00191_SMRY-3.csv"
df3_raw = pd.read_csv(file3_path, header=None)

df3_raw.shape, df3_raw.head(15)

((14140, 5),
                               0                    1       2             3  \
 0                       Stories                  NaN     NaN           NaN   
 1       Sales by Items By Group                  NaN     NaN           NaN   
 2                     19-Jan-26  Years:2025 Months:0     NaN     Page 1 of   
 3                   Description              Barcode     Qty  Total Amount   
 4   Branch: Stories - Bir Hasan                  NaN     NaN           NaN   
 5     Division: HOT BAR SECTION                  NaN     NaN           NaN   
 6           Group: BLACK COFFEE                  NaN     NaN           NaN   
 7                      ESPRESSO                  NaN  3313.0    500,103.75   
 8               DOUBLE ESPRESSO                  NaN  3200.0    498,631.33   
 9               AMERICANO SMALL                  NaN   902.0    176,816.53   
 10             AMERICANO MEDIUM                  NaN   657.0    152,807.36   
 11              AMERICANO LARGE       

In [5]:
# find candidate header rows by keywords
kw_qty = "qty"
kw_desc = ["desc", "description", "item", "product"]

def row_has_kw(row, kw):
    s = row.astype(str).str.lower()
    return s.str.contains(kw, na=False).any()

header_mask = df3_raw.apply(
    lambda r: row_has_kw(r, kw_qty) and any(row_has_kw(r, k) for k in kw_desc),
    axis=1
)

header_idxs = df3_raw.index[header_mask].tolist()
header_idxs[:20], len(header_idxs)

([3,
  40,
  79,
  118,
  157,
  197,
  237,
  277,
  316,
  355,
  395,
  434,
  473,
  512,
  551,
  590,
  630,
  669,
  709,
  748],
 359)

In [6]:
blocks = []

for i, h in enumerate(header_idxs):
    start = h + 1
    end = header_idxs[i+1] if i+1 < len(header_idxs) else len(df3_raw)

    block = df3_raw.iloc[start:end].copy()
    block.columns = df3_raw.iloc[h].astype(str).str.strip().tolist()
    blocks.append(block)

df_items3 = pd.concat(blocks, ignore_index=True)
df_items3.shape, df_items3.head()

((13778, 5),
                    Description Barcode     Qty Total Amount  NaN
 0  Branch: Stories - Bir Hasan     NaN     NaN          NaN  NaN
 1    Division: HOT BAR SECTION     NaN     NaN          NaN  NaN
 2          Group: BLACK COFFEE     NaN     NaN          NaN  NaN
 3                     ESPRESSO     NaN  3313.0   500,103.75  NaN
 4              DOUBLE ESPRESSO     NaN  3200.0   498,631.33  NaN)

In [7]:
blocks = []

for i, h in enumerate(header_idxs):
    start = h + 1
    end = header_idxs[i+1] if i+1 < len(header_idxs) else len(df3_raw)

    block = df3_raw.iloc[start:end].copy()
    block.columns = df3_raw.iloc[h].astype(str).str.strip().tolist()
    blocks.append(block)

df_items3 = pd.concat(blocks, ignore_index=True)
df_items3.shape, df_items3.head()


((13778, 5),
                    Description Barcode     Qty Total Amount  NaN
 0  Branch: Stories - Bir Hasan     NaN     NaN          NaN  NaN
 1    Division: HOT BAR SECTION     NaN     NaN          NaN  NaN
 2          Group: BLACK COFFEE     NaN     NaN          NaN  NaN
 3                     ESPRESSO     NaN  3313.0   500,103.75  NaN
 4              DOUBLE ESPRESSO     NaN  3200.0   498,631.33  NaN)

In [10]:
df_items3.columns = (
    df_items3.columns.astype(str)
    .str.strip()
    .str.lower()
    .str.replace(r"\s+", " ", regex=True)
)

# drop the weird last column if it's literally called 'nan' or empty
if "nan" in df_items3.columns:
    df_items3 = df_items3.drop(columns=["nan"])

df_items3.head()

Unnamed: 0,description,barcode,qty,total amount,NaN
0,Branch: Stories - Bir Hasan,,,,
1,Division: HOT BAR SECTION,,,,
2,Group: BLACK COFFEE,,,,
3,ESPRESSO,,3313.0,500103.75,
4,DOUBLE ESPRESSO,,3200.0,498631.33,


In [11]:
df3_keep = df_items3[["description", "barcode", "qty", "total amount"]].copy()
df3_keep.head()

Unnamed: 0,description,barcode,qty,total amount
0,Branch: Stories - Bir Hasan,,,
1,Division: HOT BAR SECTION,,,
2,Group: BLACK COFFEE,,,
3,ESPRESSO,,3313.0,500103.75
4,DOUBLE ESPRESSO,,3200.0,498631.33


In [12]:
import numpy as np
import re

def clean_number(x):
    if pd.isna(x):
        return np.nan
    s = str(x).strip()
    if s == "" or s.lower() == "nan":
        return np.nan
    s = s.replace(",", "")
    s = re.sub(r"[^0-9.\-]", "", s)
    try:
        return float(s) if s != "" else np.nan
    except:
        return np.nan

df3_keep["qty"] = df3_keep["qty"].map(clean_number)
df3_keep["total_amount"] = df3_keep["total amount"].map(clean_number)
df3_keep = df3_keep.drop(columns=["total amount"])

In [13]:
desc = df3_keep["description"].astype(str).str.strip()

# remove headers/metadata lines
bad = desc.str.lower().str.startswith(("branch:", "division:", "group:", "page", "report"))
df3_clean = df3_keep[~bad].copy()

# also remove empty descriptions
df3_clean = df3_clean[desc.ne("") & desc.ne("nan")]

df3_clean.shape, df3_clean.head(10)

  df3_clean = df3_clean[desc.ne("") & desc.ne("nan")]


((12630, 4),
              description barcode     qty  total_amount
 3               ESPRESSO     NaN  3313.0     500103.75
 4        DOUBLE ESPRESSO     NaN  3200.0     498631.33
 5        AMERICANO SMALL     NaN   902.0     176816.53
 6       AMERICANO MEDIUM     NaN   657.0     152807.36
 7        AMERICANO LARGE     NaN   334.0      88443.43
 8     BLACK COFFEE SMALL     NaN   823.0     133384.23
 9    BLACK COFFEE MEDIUM     NaN   307.0      62723.78
 10    BLACK COFFEE LARGE     NaN    86.0      19908.92
 11  SINGLE ESPRESSO MACC     NaN    32.0       5156.93
 12   DOUBLE ESPRESSO MAC     NaN    35.0       6595.68)

In [14]:
df3_clean["source_file"] = "rep_s_00191_SMRY-3.csv"

from pathlib import Path
OUT_DIR = Path("../outputs")
OUT_DIR.mkdir(exist_ok=True)

out3 = OUT_DIR / "clean_items_file3.csv"
df3_clean.to_csv(out3, index=False)
print("Saved:", out3.resolve())

Saved: C:\projects\outputs\clean_items_file3.csv


In [15]:
df3_clean[["qty","total_amount"]].describe()
df3_clean.isna().sum().sort_values(ascending=False).head(10)
df3_clean.sample(10, random_state=1)

Unnamed: 0,description,barcode,qty,total_amount,source_file
8557,ADD VANILLA LARGE,,737.0,23130.18,rep_s_00191_SMRY-3.csv
155,CHOCOLATE CREAM FRAP SMALL,,283.0,63877.45,rep_s_00191_SMRY-3.csv
3939,FREEZE DROPS STRAWBERRY,,177.0,38940.0,rep_s_00191_SMRY-3.csv
120,ICED TOFFEE NUT LATTE LARGE,,16.0,5267.87,rep_s_00191_SMRY-3.csv
395,MANGO YOGHURT SMALL,,266.5,85722.6,rep_s_00191_SMRY-3.csv
7708,CINNAMON ROLL+DRINK,,12.0,4684.68,rep_s_00191_SMRY-3.csv
12417,ADD ENGLISH BREAKFAST TEA BAG,,3.0,126.13,rep_s_00191_SMRY-3.csv
1865,WHITE MOCHA CREAM FRAPP SMALL,,14.0,3361.26,rep_s_00191_SMRY-3.csv
13535,REPLACE ALMOND SMALL,,56.0,5117.12,rep_s_00191_SMRY-3.csv
11368,TOFFEE NUT LATTE LARGE,,11.0,3765.77,rep_s_00191_SMRY-3.csv
