In [1]:
!pip install boto3 pandas

import os, re, boto3, pandas as pd

MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "http://minio:9000")
MINIO_BUCKET   = os.getenv("MINIO_BUCKET", "demo-bucket")
MINIO_KEY      = os.getenv("MINIO_KEY", "minio-root-user")        # MINIO_ROOT_USER
MINIO_SECRET   = os.getenv("MINIO_SECRET", "minio-root-password")     # MINIO_ROOT_PASSWORD

s3 = boto3.client(
    "s3",
    aws_access_key_id=MINIO_KEY,
    aws_secret_access_key=MINIO_SECRET,
    endpoint_url=MINIO_ENDPOINT,
)

resp = s3.list_objects_v2(
    Bucket=MINIO_BUCKET,
    Prefix="finance/yahoo/daily/",
    Delimiter="/",
)

prefixes = [cp["Prefix"] for cp in resp.get("CommonPrefixes", [])]
latest = sorted([p for p in prefixes if re.search(r"ingest_date=\d{4}-\d{2}-\d{2}/$", p)])[-1]
latest


Collecting boto3
  Downloading boto3-1.40.11-py3-none-any.whl.metadata (6.7 kB)
Collecting botocore<1.41.0,>=1.40.11 (from boto3)
  Downloading botocore-1.40.11-py3-none-any.whl.metadata (5.7 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.14.0,>=0.13.0 (from boto3)
  Downloading s3transfer-0.13.1-py3-none-any.whl.metadata (1.7 kB)
Downloading boto3-1.40.11-py3-none-any.whl (140 kB)
Downloading botocore-1.40.11-py3-none-any.whl (14.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.0/14.0 MB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading jmespath-1.0.1-py3-none-any.whl (20 kB)
Downloading s3transfer-0.13.1-py3-none-any.whl (85 kB)
Installing collected packages: jmespath, botocore, s3transfer, boto3
Successfully installed boto3-1.40.11 botocore-1.40.11 jmespath-1.0.1 s3transfer-0.13.1


'finance/yahoo/daily/ingest_date=2025-08-18/'

In [2]:
!pip install s3fs

storage_options = {
    "key": MINIO_KEY,
    "secret": MINIO_SECRET,
    "client_kwargs": {"endpoint_url": MINIO_ENDPOINT},
}

df = pd.read_parquet(f"s3://{MINIO_BUCKET}/{latest}AAPL.parquet", storage_options=storage_options)
#df.shape, df.head(), 
df.info()
#df.groupby("ticker")["Close"].mean().round(2)
#df.groupby(('ticker',''))[('Close','AAPL')].mean().round(2)



Collecting s3fs
  Downloading s3fs-2025.7.0-py3-none-any.whl.metadata (1.4 kB)
Collecting aiobotocore<3.0.0,>=2.5.4 (from s3fs)
  Downloading aiobotocore-2.24.1-py3-none-any.whl.metadata (25 kB)
Collecting fsspec==2025.7.0 (from s3fs)
  Downloading fsspec-2025.7.0-py3-none-any.whl.metadata (12 kB)
Collecting aiohttp!=4.0.0a0,!=4.0.0a1 (from s3fs)
  Downloading aiohttp-3.12.15-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)
Collecting aioitertools<1.0.0,>=0.5.1 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading aioitertools-0.12.0-py3-none-any.whl.metadata (3.8 kB)
Collecting botocore<1.39.12,>=1.39.9 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading botocore-1.39.11-py3-none-any.whl.metadata (5.7 kB)
Collecting multidict<7.0.0,>=6.0.0 (from aiobotocore<3.0.0,>=2.5.4->s3fs)
  Downloading multidict-6.6.4-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl.metadata (5.3 kB)
Collecting wrapt<2.0.0,>=1.10.10 (from aiobotocore<3.

In [3]:
df_cols = list(df.columns)
type(df.columns), [type(c) for c in df_cols], df_cols[:5]


(pandas.core.indexes.base.Index,
 [str, str, str, str, str, str, str, str],
 ["('Date', '')",
  "('Close', 'AAPL')",
  "('High', 'AAPL')",
  "('Low', 'AAPL')",
  "('Open', 'AAPL')"])

In [4]:
import ast
import pandas as pd

# 1) Normalize columns to a real 2-level MultiIndex
def to_two_level_multiindex(df):
    new_cols = []
    for c in df.columns:
        if isinstance(c, tuple) and len(c) == 2:
            new_cols.append(c)
        elif isinstance(c, str) and c.startswith("("):
            try:
                t = ast.literal_eval(c)
                new_cols.append(t if isinstance(t, tuple) and len(t) == 2 else (str(c), ""))
            except Exception:
                new_cols.append((str(c), ""))
        else:
            new_cols.append((str(c), ""))
    out = df.copy()
    out.columns = pd.MultiIndex.from_tuples(new_cols)
    return out

df2 = to_two_level_multiindex(df)

# 2) (Important) Drop or rename the original ('ticker','') to avoid name clash
if ('ticker','') in df2.columns:
    # either drop:
    df2 = df2.drop(columns=[('ticker','')])
    # or rename instead:
    # df2 = df2.rename(columns={( 'ticker',''):('orig_ticker','')})

# 3) Go tidy by stacking level-1 (tickers like 'AAPL')
if ('Date','') in df2.columns:
    df2 = df2.set_index(('Date',''))

long = (
    df2.stack(1)                 # brings level-1 (tickers) into rows
       .reset_index(level=1)     # make that column explicit
       .rename(columns={'level_1': 'ticker'})
       .reset_index()            # bring Date back if you set it
)

# 4) Now grouping works (no duplicate 'ticker')
long.groupby('ticker')['Close'].mean().round(2)


ticker
           NaN
AAPL    216.64
Name: Close, dtype: float64