In [None]:
import json
import csv
import xml.etree.ElementTree as ET
from pathlib import Path

import pandas as pd

# Parent of the current working directory, made absolute so later prints show full paths.
BASE_DIR = Path("..").resolve()
# Directory the converter functions read their input files from.
SRC_DIR = BASE_DIR / "for_convert"
# Directory the converter functions write their CSV output to.
DST_DIR = BASE_DIR / "converted_csv"
# Create the output directory up front; exist_ok=True keeps this cell safe to re-run.
DST_DIR.mkdir(exist_ok=True)

In [None]:
def convert_jsonl_to_csv(filename: str):
    """Convert a JSON Lines file from ``SRC_DIR`` into a CSV file in ``DST_DIR``.

    Every non-blank line must contain one JSON object, which becomes one CSV
    row. The header is the sorted union of all keys seen in the file; keys a
    row does not have are written as empty cells. Nested values (objects and
    arrays) are written as JSON text so the cell can be parsed back later,
    instead of as a Python ``repr``.

    Args:
        filename: Name of the JSONL file inside ``SRC_DIR``. The output file
            keeps the same stem with a ``.csv`` extension.

    Returns:
        None. A status line is printed; no CSV is written when the file
        contains no rows.

    Raises:
        json.JSONDecodeError: A line is not valid JSON. The message names the
            file and the line number within it.
        ValueError: A line is valid JSON but not an object (e.g. a list or a
            bare number), so it cannot be mapped onto columns.
    """
    jsonl_path = SRC_DIR / filename
    csv_path = DST_DIR / (Path(filename).stem + ".csv")

    rows = []
    # "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading BOM,
    # which json.loads would otherwise reject on the first line.
    with open(jsonl_path, "r", encoding="utf-8-sig") as f:
        for line_no, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except json.JSONDecodeError as exc:
                # Re-raise the same exception type, adding where in the file
                # the bad record lives.
                raise json.JSONDecodeError(
                    f"{jsonl_path} (file line {line_no}): {exc.msg}",
                    exc.doc,
                    exc.pos,
                ) from exc
            if not isinstance(obj, dict):
                raise ValueError(
                    f"{jsonl_path} (file line {line_no}): expected a JSON object, "
                    f"got {type(obj).__name__}"
                )
            # csv would str() nested containers into a Python repr such as
            # {'x': 1}; serialise them as JSON text instead.
            rows.append(
                {
                    key: json.dumps(value, ensure_ascii=False)
                    if isinstance(value, (dict, list))
                    else value
                    for key, value in obj.items()
                }
            )

    if not rows:
        print(f"[JSONL] No rows found in {jsonl_path}")
        return

    # Sorted union of keys gives a deterministic column order across runs.
    fieldnames = sorted({k for row in rows for k in row})

    with open(csv_path, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)

    print(f"[JSONL] {jsonl_path} -> {csv_path}")

In [None]:
def convert_xml_to_csv(filename: str, row_tag: str = "customer"):
    """Flatten an XML file from ``SRC_DIR`` into a CSV file in ``DST_DIR``.

    Each direct child of the document root whose tag equals ``row_tag`` becomes
    one CSV row. Within such an element, every child contributes a column
    named after its tag, holding its whitespace-stripped text (empty string
    when the child has no text). If a tag repeats inside one row element, the
    last occurrence wins. Columns are the sorted union of all tags seen.

    Args:
        filename: Name of the XML file inside ``SRC_DIR``. The output file
            keeps the same stem with a ``.csv`` extension.
        row_tag: Tag of the elements to treat as rows.

    Returns:
        None. A status line is printed; no CSV is written when no matching
        row elements are found.
    """
    source = SRC_DIR / filename
    target = DST_DIR / f"{Path(filename).stem}.csv"

    document_root = ET.parse(source).getroot()

    # One dict per row element: child tag -> stripped child text.
    records = [
        {field.tag: (field.text or "").strip() for field in record_elem}
        for record_elem in document_root.findall(row_tag)
    ]

    if len(records) == 0:
        print(f"[XML] No <{row_tag}> rows found in {source}")
        return

    # Collect every tag used by any row so ragged rows still fit one header.
    columns = set()
    for record in records:
        columns.update(record)
    header = sorted(columns)

    with open(target, "w", encoding="utf-8", newline="") as out:
        writer = csv.DictWriter(out, fieldnames=header)
        writer.writeheader()
        for record in records:
            writer.writerow(record)

    print(f"[XML] {source} -> {target}")

In [None]:

def convert_parquet_to_csv(filename: str):
    """Convert a Parquet file from ``SRC_DIR`` into a CSV file in ``DST_DIR``.

    The file is loaded into a pandas DataFrame and written back out as CSV
    without the DataFrame index column.

    Args:
        filename: Name of the Parquet file inside ``SRC_DIR``. The output file
            keeps the same stem with a ``.csv`` extension.

    Returns:
        None. A status line is printed once the CSV has been written.
    """
    source = SRC_DIR / filename
    target = DST_DIR / f"{Path(filename).stem}.csv"

    # Read and write in one chain; the frame is not needed afterwards.
    pd.read_parquet(source).to_csv(target, index=False)

    print(f"[PARQUET] {source} -> {target}")

In [None]:
# Run one conversion per source format. Each call reads the named file from
# SRC_DIR and writes a CSV with the same stem into DST_DIR.
convert_jsonl_to_csv("financial_ratios.jsonl")
convert_xml_to_csv("geographic_data.xml", row_tag="customer")
convert_parquet_to_csv("credit_history.parquet")