In [None]:


# ---------- Config ----------
MINIO_ENDPOINT = os.getenv("MINIO_ENDPOINT", "http://localhost:9000")
MINIO_ACCESS_KEY = os.getenv("MINIO_ACCESS_KEY")
MINIO_SECRET_KEY = os.getenv("MINIO_SECRET_KEY")
MINIO_BUCKET = os.getenv("MINIO_BUCKET", "deltabucket")
MINIO_PREFIX = os.getenv("MINIO_PREFIX", "")  # e.g. 'gold/wholeCorp_delta'

ES_URL = os.getenv("ES_URL", "http://localhost:9200")
ES_USERNAME = os.getenv("ES_USERNAME")
ES_PASSWORD = os.getenv("ES_PASSWORD")
ES_CA_CERT = os.getenv("ES_CA_CERT")  # path or None
ES_INDEX = os.getenv("ES_INDEX", "wholecorp")

CHUNKSIZE = int(os.getenv("CHUNKSIZE", "5000"))  # rows per bulk batch

# ---------- Clients ----------
# s3fs uses the S3 API and works with MinIO
fs = s3fs.S3FileSystem(
    key=MINIO_ACCESS_KEY,
    secret=MINIO_SECRET_KEY,
    client_kwargs={"endpoint_url": MINIO_ENDPOINT},
)

# Elasticsearch client
es_kwargs = {"basic_auth": (ES_USERNAME, ES_PASSWORD)} if ES_USERNAME else {}
if ES_URL.startswith("https"):
    es_kwargs["verify_certs"] = True if ES_CA_CERT else False
    if ES_CA_CERT:
        es_kwargs["ca_certs"] = ES_CA_CERT

es = Elasticsearch(ES_URL, **es_kwargs)

# ---------- Helpers ----------
def ensure_index(es: Elasticsearch, index: str):
    """Create the index with a simple mapping if it doesn't exist."""
    if es.indices.exists(index=index):
        return
    body = {
        "settings": {
            "number_of_shards": 1,
            "number_of_replicas": 0
        },
        "mappings": {
            "dynamic": True,
            "date_detection": True,
            "dynamic_templates": [
                # treat *_at or *_date as dates when possible
                {"dates": {"match_pattern": "regex", "match": ".*(_at|_date|Date|timestamp)$",
                           "mapping": {"type": "date", "ignore_malformed": True}}},
                # numeric strings -> try as keywords by default (let dynamic handle numerics)
            ],
        }
    }
    es.indices.create(index=index, body=body)

def _stable_id(doc: Dict[str, Any]) -> str:
    """Create a stable _id to deduplicate. Customize to your schema."""
    raw = json.dumps(doc, sort_keys=True, ensure_ascii=False)
    return hashlib.md5(raw.encode("utf-8")).hexdigest()

def dict_rows_from_csv(s3_path: str, chunksize: int) -> Iterator[Dict[str, Any]]:
    with fs.open(s3_path, "rb") as f:
        for chunk in pd.read_csv(f, chunksize=chunksize):
            # (Optional) normalize/clean here
            # e.g., convert 'updatedAt' to ISO
            if "updatedAt" in chunk.columns:
                chunk["updatedAt"] = pd.to_datetime(chunk["updatedAt"], errors="coerce").dt.tz_localize(None)
            for record in chunk.to_dict(orient="records"):
                yield record

def dict_rows_from_jsonl(s3_path: str) -> Iterator[Dict[str, Any]]:
    with fs.open(s3_path, "rb") as f:
        for line in f:
            if not line:
                continue
            rec = json.loads(line.decode("utf-8"))
            yield rec

def dict_rows_from_parquet(s3_path: str, chunksize: int) -> Iterator[Dict[str, Any]]:
    # Parquet isn't naturally chunked; load in frames then split (memory ok for moderate size).
    # For huge data, consider PyArrow row groups iteration.
    df = pd.read_parquet(f"s3://{s3_path}", storage_options={
        "key": MINIO_ACCESS_KEY,
        "secret": MINIO_SECRET_KEY,
        "client_kwargs": {"endpoint_url": MINIO_ENDPOINT},
    })
    if "updatedAt" in df.columns:
        df["updatedAt"] = pd.to_datetime(df["updatedAt"], errors="coerce").dt.tz_localize(None)
    if len(df) <= chunksize:
        for r in df.to_dict(orient="records"):
            yield r
    else:
        for start in range(0, len(df), chunksize):
            sub = df.iloc[start:start+chunksize]
            for r in sub.to_dict(orient="records"):
                yield r

def actions_from_docs(docs: Iterable[Dict[str, Any]], index: str) -> Iterator[Dict[str, Any]]:
    for d in docs:
        # (Optional) field remaps / type fixes
        # Example: coerce numeric strings
        # for k in ("amount","price","count"):
        #     if k in d:
        #         try: d[k] = float(d[k])
        #         except: pass

        yield {
            "_op_type": "index",
            "_index": index,
            "_id": d.get("_id") or _stable_id(d),
            "_source": d,
        }

def s3_keys(bucket: str, prefix: str = "") -> Iterator[str]:
    path = f"{bucket}/{prefix}".rstrip("/")
    for key in fs.find(path):
        # fs.find returns full paths like 'bucket/key'
        yield key

# ---------- Main pump ----------
def pump_object(key: str, index: str):
    s3_path = key  # already like 'bucket/key.ext'
    lower = s3_path.lower()
    if lower.endswith(".csv"):
        docs = dict_rows_from_csv(s3_path, chunksize=CHUNKSIZE)
    elif lower.endswith(".jsonl") or lower.endswith(".ndjson"):
        docs = dict_rows_from_jsonl(s3_path)
    elif lower.endswith(".parquet"):
        docs = dict_rows_from_parquet(s3_path, chunksize=CHUNKSIZE)
    else:
        print(f"Skip unsupported file type: {s3_path}")
        return

    # Stream to ES
    success, fail = 0, 0
    for ok, resp in streaming_bulk(es, actions_from_docs(docs, index=index), chunk_size=CHUNKSIZE, max_retries=3):
        if ok:
            success += 1
        else:
            fail += 1
    print(f"[{s3_path}] bulk result: success={success}, failed={fail}")

def main():
    ensure_index(es, ES_INDEX)
    root = f"{MINIO_BUCKET}/{MINIO_PREFIX}" if MINIO_PREFIX else MINIO_BUCKET
    keys = list(s3_keys(MINIO_BUCKET, MINIO_PREFIX))
    if not keys:
        print(f"No objects found under s3://{root}")
        return
    for key in keys:
        pump_object(key, ES_INDEX)

if __name__ == "__main__":
    main()


ModuleNotFoundError: No module named 'minio'

In [46]:
import s3fs, pyarrow.parquet as pq, pandas as pd

fs = s3fs.S3FileSystem(
    key="minioadmin", secret="minioadmin",
    client_kwargs={"endpoint_url": "http://localhost:9000"}
)

prefix = "deltabucket/silver/wholeCorp_delta"
files = fs.glob(f"{prefix}/*.parquet")

dfs = []
for f in files:
    table = pq.ParquetDataset(f"s3://{f}", filesystem=fs).read()
    df = table.to_pandas()
    dfs.append(df)

df_whole_silver = pd.concat(dfs, ignore_index=True)

prefix = "deltabucket/gold/wholeCorp_delta"
files = fs.glob(f"{prefix}/*.parquet")

dfs = []
for f in files:
    table = pq.ParquetDataset(f"s3://{f}", filesystem=fs).read()
    df = table.to_pandas()
    dfs.append(df)

df_whole_gold = pd.concat(dfs)

df_merged = df_whole_silver.merge(df_whole_gold[['統一編號','features']], on= "統一編號", how='left')
df_merged = df_merged[['統一編號', '公司名稱', '負責人', '登記地址', '資本額', '營業項目及代碼表', '縣市名稱', '區域名稱',
        '類別_全', '官網', '電話', 'features']]
df_merged.rename(columns={'features':'features_vector'}, inplace=True)

In [59]:
df_merged.features_vector[0]

{'type': 0,
 'size': 262147,
 'indices': array([178355, 262144, 262145, 262146], dtype=int32),
 'values': array([5.21230542, 2.32432391, 0.        , 0.        ])}

In [57]:
# convert "features" struct into dense numpy vector
def to_dense(row):
    if pd.isna(row):
        return None
    indices = row["indices"]
    values = row["values"]
    size = row["size"]
    dense = np.zeros(size, dtype=float)
    dense[indices] = values
    return dense.tolist()


df_merged["features_vector"] = df_merged["features_vector"].apply(to_dense)
print(df_merged[["公司名稱","features_vector"]].head())


MemoryError: 

In [None]:
# --- Read Delta from MinIO ---
storage_options = {
    "AWS_ACCESS_KEY_ID": "minioadmin",
    "AWS_SECRET_ACCESS_KEY": "minioadmin",
    "AWS_ENDPOINT_URL": MINIO_ENDPOINT,
}

dt = DeltaTable(DELTA_PATH, storage_options=storage_options)

# Convert to Pandas (you can chunk if too big)
df = dt.to_pandas()

# --- Send to Elasticsearch ---
es = Elasticsearch(ES_URL)

def docs():
    for rec in df.to_dict(orient="records"):
        yield {"_index": ES_INDEX, "_source": rec}

helpers.bulk(es, docs())
print(f"Inserted {len(df)} records into {ES_INDEX}")


ModuleNotFoundError: No module named 'elasticsearch'

In [None]:
import s3fs, json

fs = s3fs.S3FileSystem(
    key="minioadmin",
    secret="minioadmin",
    client_kwargs={"endpoint_url": "http://localhost:9000"},
)

log_path = "deltabucket/gold/wholeCorp_delta/_delta_log/00000000000000000000.json"

In [None]:
with fs.open(log_path) as f:
    for line in f:
        print(line)
        obj = json.loads(line)

b'{"commitInfo":{"timestamp":1756348865596,"operation":"WRITE","operationParameters":{"mode":"Overwrite","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":false,"operationMetrics":{"numFiles":"3","numOutputRows":"1379786","numOutputBytes":"50111755"},"engineInfo":"Apache-Spark/3.5.0 Delta-Lake/3.1.0","txnId":"c5fc9746-d3b9-4f5e-bf3a-b0c9fd0da940"}}\n'
b'{"metaData":{"id":"39abccd6-8638-43ab-a593-82213368be39","format":{"provider":"parquet","options":{}},"schemaString":"{\\"type\\":\\"struct\\",\\"fields\\":[{\\"name\\":\\"\xe7\xb5\xb1\xe4\xb8\x80\xe7\xb7\xa8\xe8\x99\x9f\\",\\"type\\":\\"string\\",\\"nullable\\":true,\\"metadata\\":{}},{\\"name\\":\\"\xe5\x85\xac\xe5\x8f\xb8\xe5\x90\x8d\xe7\xa8\xb1\\",\\"type\\":\\"string\\",\\"nullable\\":true,\\"metadata\\":{}},{\\"name\\":\\"features\\",\\"type\\":{\\"type\\":\\"udt\\",\\"class\\":\\"org.apache.spark.ml.linalg.VectorUDT\\",\\"pyClass\\":\\"pyspark.ml.linalg.VectorUDT\\",\\"sqlType\\":{\\"type\\":\\"struct\\",\\"fiel

In [20]:
with fs.open(log_path) as f:
    for line in f:
        if not line.strip():
            continue
        obj = json.loads(line)
        if "metaData" in obj:
            print("Schema string:")
            print(obj["metaData"]["schemaString"])

Schema string:
{"type":"struct","fields":[{"name":"統一編號","type":"string","nullable":true,"metadata":{}},{"name":"公司名稱","type":"string","nullable":true,"metadata":{}},{"name":"features","type":{"type":"udt","class":"org.apache.spark.ml.linalg.VectorUDT","pyClass":"pyspark.ml.linalg.VectorUDT","sqlType":{"type":"struct","fields":[{"name":"type","type":"byte","nullable":false,"metadata":{}},{"name":"size","type":"integer","nullable":true,"metadata":{}},{"name":"indices","type":{"type":"array","elementType":"integer","containsNull":false},"nullable":true,"metadata":{}},{"name":"values","type":{"type":"array","elementType":"double","containsNull":false},"nullable":true,"metadata":{}}]}},"nullable":true,"metadata":{"ml_attr":{"num_attrs":262147}}}]}
