In [3]:
import pandas as pd
import numpy as np
import re
from pathlib import Path

DATA_DIR = Path("../data")
OUT_DIR = Path("../outputs")
OUT_DIR.mkdir(exist_ok=True)
(OUT_DIR / "figures").mkdir(exist_ok=True)

In [4]:
def clean_number(x):
    """Convert '1,234.56' -> 1234.56. Return NaN if not numeric."""
    if pd.isna(x):
        return np.nan
    s = str(x).strip().replace(",", "")
    if s in ["", "None", "nan"]:
        return np.nan
    try:
        return float(s)
    except:
        return np.nan

def normalize_branch(name):
    """Standardize branch names for matching across files."""
    if pd.isna(name) or name is None:
        return None
    s = str(name).lower().strip()
    s = re.sub(r'[^a-z0-9\s]', ' ', s)   # remove punctuation
    s = re.sub(r'\s+', ' ', s).strip()  # collapse spaces
    return s

In [5]:
f4 = DATA_DIR / "rep_s_00673_SMRY.csv"
df4_raw = pd.read_csv(f4)
df4_raw.head(20)

Unnamed: 0,Stories,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,Theoretical Profit By Category,,,,,,,,,
1,22-Jan-26,,,Years:2025 Month:0,,,,Page 1 of,,3.0
2,Category,Qty,Total Price,,Total Cost,Total Cost %,Total Profit,,Total Profit %,
3,Stories - Bir Hasan,,,,,,,,,
4,BEVERAGES,137572.1,1841385.84,,4143855.17,22.50,14270003.22,,77.50,
5,FOOD,102791.4,1264966.00,,4658225.80,36.82,7991434.23,,63.18,
6,Total By Branch:,240363.5,3106351.84,,8802080.97,28.34,22261437.45,,71.66,
7,Stories Centro Mall,,,,,,,,,
8,BEVERAGES,138680.5,2054988.70,,4591734.79,22.34,15958152.17,,77.66,
9,FOOD,121423.1,1307149.74,,4981860.70,38.11,8089636.74,,61.89,


In [6]:
df4 = df4_raw.copy()
df4["branch"] = np.nan

current_branch = None
for i, row in df4.iterrows():
    cat = str(row.get("Category", ""))
    if cat.lower().startswith("stories"):
        current_branch = cat
    df4.at[i, "branch"] = current_branch

df4[["Category","branch"]].head(25)

KeyError: "['Category'] not in index"

In [7]:
df4_raw.columns.tolist()


['Stories',
 'Unnamed: 1',
 'Unnamed: 2',
 'Unnamed: 3',
 'Unnamed: 4',
 'Unnamed: 5',
 'Unnamed: 6',
 'Unnamed: 7',
 'Unnamed: 8',
 'Unnamed: 9']

In [8]:
df4_raw.head(10)

Unnamed: 0,Stories,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9
0,Theoretical Profit By Category,,,,,,,,,
1,22-Jan-26,,,Years:2025 Month:0,,,,Page 1 of,,3.0
2,Category,Qty,Total Price,,Total Cost,Total Cost %,Total Profit,,Total Profit %,
3,Stories - Bir Hasan,,,,,,,,,
4,BEVERAGES,137572.1,1841385.84,,4143855.17,22.50,14270003.22,,77.50,
5,FOOD,102791.4,1264966.00,,4658225.80,36.82,7991434.23,,63.18,
6,Total By Branch:,240363.5,3106351.84,,8802080.97,28.34,22261437.45,,71.66,
7,Stories Centro Mall,,,,,,,,,
8,BEVERAGES,138680.5,2054988.70,,4591734.79,22.34,15958152.17,,77.66,
9,FOOD,121423.1,1307149.74,,4981860.70,38.11,8089636.74,,61.89,


In [9]:
df4_raw = df4_raw.rename(columns={
    "Stories": "Category",
    "Unnamed: 1": "Qty",
    "Unnamed: 2": "Total Price",
    "Unnamed: 4": "Total Cost",
    "Unnamed: 5": "Total Cost %",
    "Unnamed: 6": "Total Profit",
    "Unnamed: 8": "Total Profit %"
})

In [10]:
df4 = df4_raw.copy()

# drop rows where Category is empty
df4 = df4[df4["Category"].notna()].copy()

# remove repeated export header rows
df4 = df4[~df4["Category"].astype(str).str.contains("Theoretical Profit", case=False, na=False)]
df4 = df4[~df4["Category"].astype(str).str.contains("Page", case=False, na=False)]
df4 = df4[df4["Category"].astype(str).str.strip() != "Category"]

df4.head(12)

Unnamed: 0,Category,Qty,Total Price,Unnamed: 3,Total Cost,Total Cost %,Total Profit,Unnamed: 7,Total Profit %,Unnamed: 9
1,22-Jan-26,,,Years:2025 Month:0,,,,Page 1 of,,3.0
3,Stories - Bir Hasan,,,,,,,,,
4,BEVERAGES,137572.1,1841385.84,,4143855.17,22.5,14270003.22,,77.5,
5,FOOD,102791.4,1264966.0,,4658225.8,36.82,7991434.23,,63.18,
6,Total By Branch:,240363.5,3106351.84,,8802080.97,28.34,22261437.45,,71.66,
7,Stories Centro Mall,,,,,,,,,
8,BEVERAGES,138680.5,2054988.7,,4591734.79,22.34,15958152.17,,77.66,
9,FOOD,121423.1,1307149.74,,4981860.7,38.11,8089636.74,,61.89,
10,Total By Branch:,260103.6,3362138.44,,9573595.5,28.47,24047788.9,,71.53,
11,Stories LAU,,,,,,,,,


In [11]:
df4 = df4_raw.copy()

# keep only rows where Category is not empty
df4 = df4[df4["Category"].notna()].copy()

# remove export junk/header rows
df4 = df4[~df4["Category"].astype(str).str.contains("Theoretical Profit", case=False, na=False)]
df4 = df4[~df4["Category"].astype(str).str.contains("Page", case=False, na=False)]
df4 = df4[~df4["Category"].astype(str).str.match(r"\d{1,2}-[A-Za-z]{3}-\d{2}", na=False)]  # removes 22-Jan-26 style rows
df4 = df4[df4["Category"].astype(str).str.strip() != "Category"]  # removes the row that says Category Qty Total Price...

df4.head(15)


Unnamed: 0,Category,Qty,Total Price,Unnamed: 3,Total Cost,Total Cost %,Total Profit,Unnamed: 7,Total Profit %,Unnamed: 9
3,Stories - Bir Hasan,,,,,,,,,
4,BEVERAGES,137572.1,1841385.84,,4143855.17,22.5,14270003.22,,77.5,
5,FOOD,102791.4,1264966.0,,4658225.8,36.82,7991434.23,,63.18,
6,Total By Branch:,240363.5,3106351.84,,8802080.97,28.34,22261437.45,,71.66,
7,Stories Centro Mall,,,,,,,,,
8,BEVERAGES,138680.5,2054988.7,,4591734.79,22.34,15958152.17,,77.66,
9,FOOD,121423.1,1307149.74,,4981860.7,38.11,8089636.74,,61.89,
10,Total By Branch:,260103.6,3362138.44,,9573595.5,28.47,24047788.9,,71.53,
11,Stories LAU,,,,,,,,,
12,BEVERAGES,58430.0,777228.3,,1774898.67,22.84,5997384.33,,77.16,


In [12]:
df4["branch"] = np.nan
current_branch = None

for i, row in df4.iterrows():
    val = str(row["Category"])
    if val.lower().startswith("stories"):
        current_branch = val
    df4.at[i, "branch"] = current_branch

df4[["Category","branch"]].head(20)


TypeError: Invalid value 'Stories - Bir Hasan' for dtype 'float64'

In [13]:
df4 = df4.reset_index(drop=True)


In [14]:
# rows that are branch headers start with "Stories"
is_branch_row = df4["Category"].astype(str).str.lower().str.startswith("stories")

# create a helper column that has branch name only on branch rows
df4["branch"] = df4["Category"].where(is_branch_row)

# forward-fill branch downwards
df4["branch"] = df4["branch"].ffill()

df4[["Category","branch"]].head(20)

Unnamed: 0,Category,branch
0,Stories - Bir Hasan,Stories - Bir Hasan
1,BEVERAGES,Stories - Bir Hasan
2,FOOD,Stories - Bir Hasan
3,Total By Branch:,Stories - Bir Hasan
4,Stories Centro Mall,Stories Centro Mall
5,BEVERAGES,Stories Centro Mall
6,FOOD,Stories Centro Mall
7,Total By Branch:,Stories Centro Mall
8,Stories LAU,Stories LAU
9,BEVERAGES,Stories LAU


In [15]:
df4 = df4[df4["Category"].isin(["BEVERAGES","FOOD"])].copy()
df4.head(10)

Unnamed: 0,Category,Qty,Total Price,Unnamed: 3,Total Cost,Total Cost %,Total Profit,Unnamed: 7,Total Profit %,Unnamed: 9,branch
1,BEVERAGES,137572.1,1841385.84,,4143855.17,22.5,14270003.22,,77.5,,Stories - Bir Hasan
2,FOOD,102791.4,1264966.0,,4658225.8,36.82,7991434.23,,63.18,,Stories - Bir Hasan
5,BEVERAGES,138680.5,2054988.7,,4591734.79,22.34,15958152.17,,77.66,,Stories Centro Mall
6,FOOD,121423.1,1307149.74,,4981860.7,38.11,8089636.74,,61.89,,Stories Centro Mall
9,BEVERAGES,58430.0,777228.3,,1774898.67,22.84,5997384.33,,77.16,,Stories LAU
10,FOOD,73384.15,767489.57,,2799004.1,36.47,4875891.58,,63.53,,Stories LAU
13,BEVERAGES,49605.0,671882.79,,1426297.24,21.23,5292530.68,,78.77,,Stories Faqra
14,FOOD,46504.7,4199569.55,,1508720.49,35.93,2690849.07,,64.07,,Stories Faqra
17,BEVERAGES,55306.5,782327.67,,1803433.47,23.05,6019843.2,,76.95,,Stories.
18,FOOD,62895.2,6311370.0,,2473263.04,39.19,3838106.96,,60.81,,Stories.


In [16]:
for col in ["Qty", "Total Cost", "Total Profit"]:
    df4[col] = df4[col].apply(clean_number)

df4["Revenue"] = df4["Total Cost"] + df4["Total Profit"]
df4["Profit_Margin"] = df4["Total Profit"] / df4["Revenue"]
df4["branch_norm"] = df4["branch"].apply(normalize_branch)

df4[["branch","Category","Qty","Total Cost","Total Profit","Revenue","Profit_Margin"]].head(10)

Unnamed: 0,branch,Category,Qty,Total Cost,Total Profit,Revenue,Profit_Margin
1,Stories - Bir Hasan,BEVERAGES,137572.1,4143855.17,14270003.22,18413858.39,0.77496
2,Stories - Bir Hasan,FOOD,102791.4,4658225.8,7991434.23,12649660.03,0.631751
5,Stories Centro Mall,BEVERAGES,138680.5,4591734.79,15958152.17,20549886.96,0.776557
6,Stories Centro Mall,FOOD,121423.1,4981860.7,8089636.74,13071497.44,0.618876
9,Stories LAU,BEVERAGES,58430.0,1774898.67,5997384.33,7772283.0,0.771637
10,Stories LAU,FOOD,73384.15,2799004.1,4875891.58,7674895.68,0.635304
13,Stories Faqra,BEVERAGES,49605.0,1426297.24,5292530.68,6718827.92,0.787716
14,Stories Faqra,FOOD,46504.7,1508720.49,2690849.07,4199569.56,0.640744
17,Stories.,BEVERAGES,55306.5,1803433.47,6019843.2,7823276.67,0.769479
18,Stories.,FOOD,62895.2,2473263.04,3838106.96,6311370.0,0.608126


In [17]:
out4 = OUT_DIR / "clean_category_summary.csv"
df4.to_csv(out4, index=False)
print("Saved:", out4)

Saved: ..\outputs\clean_category_summary.csv


In [18]:
df4.shape, df4["branch"].nunique(), df4["Category"].value_counts()

((50, 14),
 25,
 Category
 BEVERAGES    25
 FOOD         25
 Name: count, dtype: int64)

In [19]:
from io import StringIO

f1 = DATA_DIR / "REP_S_00134_SMRY.csv"

with open(f1, "r", encoding="utf-8", errors="replace") as f:
    text1 = f.read()

df1_raw = pd.read_csv(StringIO(text1), header=None)
df1_raw.head(20)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,Stories,,,,,,,,,,,,,
1,Comparative Monthly Sales,,,,,,,,,,,,,
2,22-Jan-2026,,"Year: 2026,2025",,,,,,,,,,Page 1 of,0.01
3,,,,January,February,March,April,May,June,July,August,September,,
4,2025,Stories - Bir Hasan,,3355705.33,2842993.67,2266050.67,3459979.67,2125379.33,744638.33,3799740.25,3783897.91,3255935.75,,
5,,Stories Ain El Mreisseh,,12648546.67,9929973.33,9849926.67,13025923.33,7208975.00,2883403.33,11889329.16,11610097.92,11021123.75,,
6,,Stories Airport,,0.00,0.00,0.00,0.00,0.00,8428.83,2940003.01,6963426.45,7467197.62,,
7,,Stories Antelias,,2615854.33,2139010.67,3162716.67,3391741.33,2033635.67,728978.33,2963696.71,3182363.97,2909965.47,,
8,,Stories Batroun,,4266517.33,3388117.33,4890198.33,5516881.67,2963980.33,1311240.00,6135003.02,6751770.29,5041150.76,,
9,,Stories Bayada,,4497376.67,3419106.67,5083386.67,5196906.67,3043400.00,1208190.00,5448609.92,5613038.75,4917258.26,,


In [20]:
df1_raw.shape


(112, 14)

In [21]:
for c in range(min(10, df1_raw.shape[1])):
    sample = df1_raw[c].astype(str)
    if sample.str.contains("Stories", na=False).any():
        print("Found 'Stories' in column:", c)
    

Found 'Stories' in column: 0
Found 'Stories' in column: 1


In [22]:
BRANCH_COL = 1
YEAR_COL = 0

In [23]:
BRANCH_COL = 1
YEAR_COL = 0

In [24]:
df1_raw.columns = [f"c{i}" for i in range(df1_raw.shape[1])]

mask_branch = df1_raw[f"c{BRANCH_COL}"].astype(str).str.contains("Stories", na=False)
df1 = df1_raw[mask_branch].copy()

df1.head(10)

Unnamed: 0,c0,c1,c2,c3,c4,c5,c6,c7,c8,c9,c10,c11,c12,c13
4,2025.0,Stories - Bir Hasan,,3355705.33,2842993.67,2266050.67,3459979.67,2125379.33,744638.33,3799740.25,3783897.91,3255935.75,,
5,,Stories Ain El Mreisseh,,12648546.67,9929973.33,9849926.67,13025923.33,7208975.0,2883403.33,11889329.16,11610097.92,11021123.75,,
6,,Stories Airport,,0.0,0.0,0.0,0.0,0.0,8428.83,2940003.01,6963426.45,7467197.62,,
7,,Stories Antelias,,2615854.33,2139010.67,3162716.67,3391741.33,2033635.67,728978.33,2963696.71,3182363.97,2909965.47,,
8,,Stories Batroun,,4266517.33,3388117.33,4890198.33,5516881.67,2963980.33,1311240.0,6135003.02,6751770.29,5041150.76,,
9,,Stories Bayada,,4497376.67,3419106.67,5083386.67,5196906.67,3043400.0,1208190.0,5448609.92,5613038.75,4917258.26,,
10,,Stories Centro Mall,,3264533.33,2944806.67,2218466.67,3937166.67,1955286.67,924383.33,4201339.95,4307333.34,3232537.55,,
11,,Stories Event Starco,,0.0,0.0,0.0,0.0,0.0,0.0,918.92,3141.14,0.0,,
12,,Stories Faqra,,1889910.0,1829960.0,1479756.67,2298003.33,1109666.67,352083.33,482036.04,1535977.47,0.0,,
13,,Stories Khaldeh,,7468155.33,6108429.33,6420841.0,8455732.0,4487117.67,2262395.0,9464291.92,10661995.53,7859844.17,,


In [25]:
df1["year"] = df1[f"c{YEAR_COL}"].replace("", np.nan).ffill()
df1["year"] = df1["year"].apply(clean_number).astype("Int64")

df1[["year", f"c{BRANCH_COL}"]].head(15)

Unnamed: 0,year,c1
4,2025,Stories - Bir Hasan
5,2025,Stories Ain El Mreisseh
6,2025,Stories Airport
7,2025,Stories Antelias
8,2025,Stories Batroun
9,2025,Stories Bayada
10,2025,Stories Centro Mall
11,2025,Stories Event Starco
12,2025,Stories Faqra
13,2025,Stories Khaldeh
