In [14]:
import sys
from pathlib import Path


# Resolve the project layout relative to the notebook's working directory.
ROOT = Path("..").resolve()
RAW, PROC, SRC = ROOT / "data" / "raw", ROOT / "data" / "processed", ROOT / "src"

# The processed-data folder is created on demand (parents included).
PROC.mkdir(parents=True, exist_ok=True)

# Ensure src/ has an __init__.py and that the project root is importable.
(SRC / "__init__.py").touch(exist_ok=True)
root_entry = str(ROOT)
if root_entry not in sys.path:
    sys.path.insert(0, root_entry)

# Echo the resolved locations, one aligned label per line.
for label, location in (("ROOT:", ROOT), ("RAW :", RAW), ("PROC:", PROC), ("SRC :", SRC)):
    print(label, location)



ROOT: C:\Users\10341\bootcamp_Jiayuan_zhang\project
RAW : C:\Users\10341\bootcamp_Jiayuan_zhang\project\data\raw
PROC: C:\Users\10341\bootcamp_Jiayuan_zhang\project\data\processed
SRC : C:\Users\10341\bootcamp_Jiayuan_zhang\project\src


In [15]:
import pandas as pd
from src.cleaning import drop_missing, fill_missing_median, normalize_data  # helpers from the project's src/cleaning.py
import datetime as dt

def ts():
    """Return the current local time as a ``YYYYMMDD-HHMMSS`` string.

    The value is used as a filename suffix, so it contains only digits
    and a single hyphen separating the date part from the time part.
    """
    now = dt.datetime.now()
    return f"{now:%Y%m%d-%H%M%S}"


In [16]:

# Pick the most recently modified CSV in the raw-data folder.
csv_files = list(RAW.glob("*.csv"))
if not csv_files:
    raise FileNotFoundError(" No CSV files found in ../data/raw . plz finish stage 04 and 05。")

latest_csv = max(csv_files, key=lambda p: p.stat().st_mtime)
print("Using latest CSV:", latest_csv.name)

# `date` is parsed to datetime on load.
df = pd.read_csv(latest_csv, parse_dates=["date"])

# Fail fast with a clear message when the price column is absent, rather
# than continuing with a frame that has no usable prices.
if "price" not in df.columns:
    raise KeyError(
        f"'price' column not found in {latest_csv.name}; columns present: {list(df.columns)}"
    )
# Non-numeric price entries become NaN.
df["price"] = pd.to_numeric(df["price"], errors="coerce")

if "ticker" in df.columns:
    # Stringify only the values that are present: a plain astype(str) would
    # turn missing tickers into the literal text "nan" and hide them from
    # any later missing-value check.
    df["ticker"] = df["ticker"].astype(str).where(df["ticker"].notna())

display(df.head())
print(df.dtypes)


Using latest CSV: STX_prices_20250820-234954.csv


Unnamed: 0,date,price,ticker
0,2025-02-21,99.502449,STX
1,2025-02-24,98.742737,STX
2,2025-02-25,98.387543,STX
3,2025-02-26,98.930191,STX
4,2025-02-27,99.324852,STX


date      datetime64[ns]
price            float64
ticker            object
dtype: object


In [17]:

# Run the three cleaning helpers as one pipeline over the loaded frame, in
# order: drop_missing, then fill_missing_median, then normalize_data. The
# latter two are applied to the "price" column.
df_clean = (
    df
    .pipe(drop_missing, threshold=0.5)
    .pipe(fill_missing_median, ["price"])
    .pipe(normalize_data, ["price"])
)

# Report per-column missing counts, then preview the first ten rows.
print("missing value counts：\n", df_clean.isnull().sum())
display(df_clean.head(n=10))



missing value counts：
 date      0
price     0
ticker    0
dtype: int64


Unnamed: 0,date,price,ticker
0,2025-02-21,0.360106,STX
1,2025-02-24,0.351894,STX
2,2025-02-25,0.348054,STX
3,2025-02-26,0.35392,STX
4,2025-02-27,0.358186,STX
5,2025-02-28,0.371411,STX
6,2025-03-03,0.35168,STX
7,2025-03-04,0.287904,STX
8,2025-03-05,0.270306,STX
9,2025-03-06,0.208556,STX


In [18]:
# Take a single timestamp for the whole export so the CSV and the Parquet
# file always share the same suffix, even if the clock ticks over between
# the two writes.
stamp = ts()

out_csv = PROC / f"STX_prices_clean_{stamp}.csv"
df_clean.to_csv(out_csv, index=False)

print(" Cleaned CSV saved to:", out_csv)

# Parquet is best-effort: any failure (for example, no Parquet engine
# installed) is reported and skipped so the CSV export above still stands.
try:
    out_pq = PROC / f"STX_prices_clean_{stamp}.parquet"
    # index=False keeps the Parquet file consistent with the CSV, which is
    # also written without the index.
    df_clean.to_parquet(out_pq, index=False)
    print("Parquet saved to:", out_pq)
except Exception as e:
    print(" Parquet save skipped:", e)



 Cleaned CSV saved to: C:\Users\10341\bootcamp_Jiayuan_zhang\project\data\processed\STX_prices_clean_20250821-002216.csv
Parquet saved to: C:\Users\10341\bootcamp_Jiayuan_zhang\project\data\processed\STX_prices_clean_20250821-002216.parquet
