In [6]:
# --- bootstrap: find src/quantlib ---
import sys
from pathlib import Path

def add_src_to_path(pkg="quantlib", levels_up=3):
    here = Path.cwd().resolve()
    candidates = [here / "src"] + [here.parents[i] / "src" for i in range(levels_up)]
    for p in candidates:
        if (p / pkg / "__init__.py").exists():
            sys.path.insert(0, str(p)); print(f"[OK] sys.path += {p}"); return
    raise FileNotFoundError("src/quantlib not found")

add_src_to_path()

# --- config ---
DATA_RAW = Path(r"C:\Users\quantbase\Desktop\quant\data_raw")
DATA_INT = Path(r"C:\Users\quantbase\Desktop\quant\data_int")
DATA_INT.mkdir(parents=True, exist_ok=True)


[OK] sys.path += C:\Users\quantbase\Desktop\quant\src


In [7]:
# --- tiny loader: read one CSV, keep only Close, EQ series, clean index ---
import pandas as pd

RENAME = {"DATE":"date","CLOSE":"close","SYMBOL":"symbol","SERIES":"series"}
def load_close(csv_path: Path) -> pd.DataFrame:
    cols = pd.read_csv(csv_path, nrows=0).columns
    use = [c for c in ["DATE","CLOSE","SYMBOL","SERIES"] if c in cols]
    df = (pd.read_csv(csv_path, usecols=use, parse_dates=["DATE"], low_memory=False)
            .rename(columns=RENAME))
    if "series" in df: df = df[df["series"].eq("EQ")]
    df = df.dropna(subset=["date","symbol","close"]).sort_values("date")
    sym = df["symbol"].iloc[0]
    return df.set_index("date")[["close"]].rename(columns={"close": str(sym)})


In [8]:
# --- build wide panel with anchor left-join (keeps the longest series' dates) ---
files = sorted(DATA_RAW.glob("*.csv"))
print(f"Found {len(files)} CSVs")

# find anchor (longest rows)
lengths = []
for fp in files:
    try:
        dfi = load_close(fp)
        lengths.append((len(dfi), fp))
    except Exception:
        pass
anchor_fp = sorted(lengths, reverse=True)[0][1]
anchor = load_close(anchor_fp)
print(f"Anchor: {anchor_fp.name} with {len(anchor)} rows")

# left-join others
prices_close = anchor.copy()
for fp in files:
    if fp == anchor_fp: continue
    try:
        dfi = load_close(fp)
        prices_close = prices_close.join(dfi, how="left")
    except Exception as e:
        print("[skip]", fp.name, str(e))

# basic hygiene
prices_close = prices_close[(prices_close > 0)]
prices_close.to_csv(DATA_INT/"prices_close_anchor_leftjoin.csv")
prices_close.to_pickle(DATA_INT/"prices_close_anchor_leftjoin.pkl")
print("Saved prices_close_anchor_leftjoin.*", prices_close.shape)


Found 57 CSVs
Anchor: ZOMATO.csv with 856 rows
Saved prices_close_anchor_leftjoin.* (859, 57)


In [9]:
# --- coverage summary (smoke test) ---
cov = []
for sym in prices_close.columns:
    s = prices_close[sym].dropna()
    cov.append((sym, len(s), s.index.min().date() if len(s) else None, s.index.max().date() if len(s) else None))
coverage = pd.DataFrame(cov, columns=["symbol","rows","start","end"]).sort_values("rows", ascending=False)
display(coverage.head(10)); display(coverage.tail(10))
print("Panel shape:", prices_close.shape)
assert prices_close.shape[1] >= 40, "Too few symbols loaded?"


Unnamed: 0,symbol,rows,start,end
0,ZOMATO,859,2022-03-30,2025-09-09
23,IDFCFIRSTB,859,2022-03-30,2025-09-09
25,INDIGO,859,2022-03-30,2025-09-09
27,IRCTC,859,2022-03-30,2025-09-09
29,IRFC,859,2022-03-30,2025-09-09
32,KOTAKBANK,859,2022-03-30,2025-09-09
34,MAZDOCK,859,2022-03-30,2025-09-09
35,MCX,859,2022-03-30,2025-09-09
36,MTARTECH,859,2022-03-30,2025-09-09
38,PPL,859,2022-03-30,2025-09-09


Unnamed: 0,symbol,rows,start,end
16,DCXINDIA,649,2022-11-11,2025-09-09
6,AVALON,600,2023-04-18,2025-09-09
4,AEROFLEX,506,2023-08-31,2025-09-09
30,JIOFIN,504,2023-09-04,2025-09-09
28,IREDA,446,2023-11-29,2025-09-09
26,INOXINDIA,430,2023-12-21,2025-09-09
39,PREMIERENE,257,2024-09-03,2025-09-09
54,VINCOFE,225,2024-10-18,2025-09-09
47,SWIGGY,207,2024-11-13,2025-09-09
55,WAAREERTL,107,2025-04-09,2025-09-09


Panel shape: (859, 57)
