# **Cấu hình file**

In [1]:
from pathlib import Path

DATA_RAW = Path(r"C:\Users\pitou\Desktop\Data Mining\data_raw")
DATA_PARQUET = Path(r"C:\Users\pitou\Desktop\Data Mining\data_parquet")
DATA_CLEAN = Path(r"C:\Users\pitou\Desktop\Data Mining\data_clean")

DATA_PARQUET.mkdir(parents=True, exist_ok=True)
DATA_CLEAN.mkdir(parents=True, exist_ok=True)

FILES = {
    'branches':      'branches.csv',
    'customers':     'customers.csv',
    'orders':        'orders.csv',
    'order_details': 'order_details.csv',
    'categories':    'categories.csv',
}

def p(path: Path) -> str:
    return path.resolve().as_posix()

print('RAW dir:', DATA_RAW.resolve())
print('PARQUET dir:', DATA_PARQUET.resolve())
print('CLEAN dir:', DATA_CLEAN.resolve())
print('Files:', FILES)

RAW dir: C:\Users\pitou\Desktop\Data Mining\data_raw
PARQUET dir: C:\Users\pitou\Desktop\Data Mining\data_parquet
CLEAN dir: C:\Users\pitou\Desktop\Data Mining\data_clean
Files: {'branches': 'branches.csv', 'customers': 'customers.csv', 'orders': 'orders.csv', 'order_details': 'order_details.csv', 'categories': 'categories.csv'}


# **Chuyển CSV sang Parquet**

In [2]:
import duckdb

con = duckdb.connect()
def csv_to_parquet(csv_path: Path, parquet_path: Path):
    sql = f'''
    COPY (
        SELECT * FROM read_csv_auto('{p(csv_path)}', ALL_VARCHAR=TRUE)
    )
    TO '{p(parquet_path)}' (FORMAT 'parquet');
    '''
    con.execute(sql)

for key, fname in FILES.items():
    csv_fp = DATA_RAW / fname
    if not csv_fp.exists():
        print(f'Missing CSV: {csv_fp}.')
        continue
    out_fp = DATA_PARQUET / f'{key}_raw.parquet'
    print(f'→ Converting {csv_fp.name} → {out_fp.name}')
    csv_to_parquet(csv_fp, out_fp)
print('Convert CSV to Parquet done.')


→ Converting branches.csv → branches_raw.parquet
→ Converting customers.csv → customers_raw.parquet
→ Converting orders.csv → orders_raw.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

→ Converting order_details.csv → order_details_raw.parquet


FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

→ Converting categories.csv → categories_raw.parquet
Convert CSV to Parquet done.


# **Làm sạch dữ liệu với Polars**

In [3]:
import polars as pl

def rename_if_present(df: pl.DataFrame | pl.LazyFrame, mapping: dict) -> pl.LazyFrame:
    current_cols = set(df.columns) if isinstance(df, pl.DataFrame) else set(df.columns)
    safe_map = {old: new for old, new in mapping.items() if old in current_cols}
    return df.rename(safe_map)

def strip_all_str(df: pl.LazyFrame, cols: list[str]) -> pl.LazyFrame:
    ex_cols = [c for c in cols if c in df.columns]
    return df.with_columns([pl.col(c).cast(pl.Utf8, strict=False).str.strip_chars().alias(c) for c in ex_cols])

def cast_if_present(df: pl.LazyFrame, col: str, dtype) -> pl.LazyFrame:
    return df.with_columns(pl.when(pl.col(col).is_not_null()).then(pl.col(col).cast(dtype, strict=False)).otherwise(pl.lit(None)).alias(col)) if col in df.columns else df

def write_lazy(df_lazy: pl.LazyFrame, out_path: Path):
    df_lazy.sink_parquet(p(out_path))

In [4]:
# BRANCHES
raw_fp = DATA_PARQUET / 'branches_raw.parquet'
out_fp = DATA_CLEAN / 'branches_clean.parquet'
if raw_fp.exists():
    df = pl.scan_parquet(p(raw_fp))
    df = rename_if_present(df, {
        'BRANCH_ID': 'branch_id',
        'REGION': 'region',
        'CITY': 'city',
        'TOWN': 'town',
        'BRANCH_TOWN': 'branch_town',
        'LAT': 'lat',
        'LON': 'lon',
    })

    # Làm sạch text cho các cột string
    df = strip_all_str(df, ['branch_id', 'region', 'city', 'town', 'branch_town'])

    # Ép kiểu cho ID và số
    df = cast_if_present(df, 'branch_id', pl.Utf8)
    df = cast_if_present(df, 'lat', pl.Float64)
    df = cast_if_present(df, 'lon', pl.Float64)

    # Loại bỏ dòng trùng theo branch_id nếu có
    if 'branch_id' in df.columns:
        df = df.unique(subset=['branch_id'])

    write_lazy(df, out_fp)
    print('branches ->', out_fp.name)
else:
    print('branches_raw.parquet not found, skipped.')

branches -> branches_clean.parquet


  current_cols = set(df.columns) if isinstance(df, pl.DataFrame) else set(df.columns)
  ex_cols = [c for c in cols if c in df.columns]
  return df.with_columns(pl.when(pl.col(col).is_not_null()).then(pl.col(col).cast(dtype, strict=False)).otherwise(pl.lit(None)).alias(col)) if col in df.columns else df
  if 'branch_id' in df.columns:


In [5]:
# CUSTOMERS
raw_fp = DATA_PARQUET / 'customers_raw.parquet'
out_fp = DATA_CLEAN / 'customers_clean.parquet'

if raw_fp.exists():
    df = pl.scan_parquet(p(raw_fp))
    df = rename_if_present(df, {
        'USERID': 'user_id',
        'USERNAME_': 'username',
        'NAMESURNAME': 'name_surname',
        'STATUS_': 'status',
        'USERGENDER': 'gender',
        'USERBIRTHDATE': 'birth_date',
        'REGION': 'region',
        'CITY': 'city',
        'TOWN': 'town',
        'DISTRICT': 'district',
        'ADDRESSTEXT': 'address_text',
    })

    # Làm sạch chuỗi cho các cột text
    text_cols = ['username', 'name_surname', 'gender', 'region', 'city', 'town', 'district', 'address_text']
    df = strip_all_str(df, text_cols)

    # Ép kiểu user_id -> string
    df = cast_if_present(df, 'user_id', pl.Utf8)

    # Parse ngày sinh
    if 'birth_date' in df.columns:
        df = df.with_columns(
            pl.col('birth_date').str.strptime(pl.Date, format='%Y-%m-%d', strict=False)
        )

    # Loại bỏ trùng user_id nếu có
    if 'user_id' in df.columns:
        df = df.unique(subset=['user_id'])

    write_lazy(df, out_fp)
    print('customers ->', out_fp.name)
else:
    print('customers_raw.parquet not found, skipped.')


customers -> customers_clean.parquet


  current_cols = set(df.columns) if isinstance(df, pl.DataFrame) else set(df.columns)
  ex_cols = [c for c in cols if c in df.columns]
  return df.with_columns(pl.when(pl.col(col).is_not_null()).then(pl.col(col).cast(dtype, strict=False)).otherwise(pl.lit(None)).alias(col)) if col in df.columns else df
  if 'birth_date' in df.columns:
  if 'user_id' in df.columns:


In [6]:
# ORDERS
raw_fp = DATA_PARQUET / 'orders_raw.parquet'
out_fp = DATA_CLEAN / 'orders_clean.parquet'

if raw_fp.exists():
    df = pl.scan_parquet(p(raw_fp))
    df = rename_if_present(df, {
        'ORDERID': 'order_id',
        'BRANCH_ID': 'branch_id',
        'DATE_': 'order_date',
        'USERID': 'user_id',
        'NAMESURNAME': 'name_surname',
        'TOTALBASKET': 'total_basket',
    })

    # Làm sạch text
    df = strip_all_str(df, ['branch_id', 'name_surname'])

    # Ép kiểu ID về string
    df = cast_if_present(df, 'order_id', pl.Utf8)
    df = cast_if_present(df, 'user_id', pl.Utf8)
    df = cast_if_present(df, 'branch_id', pl.Utf8)

    # Parse ngày
    if 'order_date' in df.columns:
        df = df.with_columns(
            pl.col('order_date').str.strptime(pl.Datetime, format='%Y-%m-%d %H:%M:%S', strict=False)
        )

    # Xử lý cột total_basket
    if 'total_basket' in df.columns:
        df = df.with_columns(
            pl.col('total_basket')
            .cast(pl.Utf8, strict=False)
            .str.replace_all(",", ".")
            .cast(pl.Float64, strict=False)
        )

    # Loại bỏ trùng order_id nếu có
    if 'order_id' in df.columns:
        df = df.unique(subset=['order_id'])

    write_lazy(df, out_fp)
    print('orders ->', out_fp.name)
else:
    print('orders_raw.parquet not found, skipped.')


  current_cols = set(df.columns) if isinstance(df, pl.DataFrame) else set(df.columns)
  ex_cols = [c for c in cols if c in df.columns]
  return df.with_columns(pl.when(pl.col(col).is_not_null()).then(pl.col(col).cast(dtype, strict=False)).otherwise(pl.lit(None)).alias(col)) if col in df.columns else df
  if 'order_date' in df.columns:
  if 'total_basket' in df.columns:
  if 'order_id' in df.columns:


orders -> orders_clean.parquet


In [7]:
# ORDER_DETAILS
raw_fp = DATA_PARQUET / 'order_details_raw.parquet'
out_fp = DATA_CLEAN / 'order_details_clean.parquet'

if raw_fp.exists():
    df = pl.scan_parquet(p(raw_fp))
    df = rename_if_present(df, {
        'ORDERID': 'order_id',
        'ORDERDETAILID': 'order_detail_id',
        'AMOUNT': 'amount',
        'UNITPRICE': 'unit_price',
        'TOTALPRICE': 'total_price',
        'ITEMID': 'item_id',
        'ITEMCODE': 'item_code',
    })

    # Ép kiểu các ID sang chuỗi
    df = cast_if_present(df, 'order_id', pl.Utf8)
    df = cast_if_present(df, 'order_detail_id', pl.Utf8)
    df = cast_if_present(df, 'item_id', pl.Utf8)
    df = cast_if_present(df, 'item_code', pl.Utf8)

    # Ép kiểu số lượng
    if 'amount' in df.columns:
        df = df.with_columns(pl.col('amount').cast(pl.Int64, strict=False))

    # Xử lý các cột số có dấu phẩy (unit_price, total_price)
    for col in ['unit_price', 'total_price']:
        if col in df.columns:
            df = df.with_columns(
                pl.col(col)
                .cast(pl.Utf8, strict=False)
                .str.replace_all(",", ".")
                .cast(pl.Float64, strict=False)
            )

    # Loại bỏ dòng không hợp lệ (amount <= 0 hoặc giá null)
    df = df.filter(
        (pl.col('amount') > 0)
        & (pl.col('unit_price').is_not_null())
        & (pl.col('total_price').is_not_null())
    )

    # Kiểm tra tính nhất quán giữa total_price và amount * unitprice
    if {'amount', 'unit_price', 'total_price'}.issubset(df.columns):
        df = df.with_columns(
            (pl.col('amount') * pl.col('unit_price')).alias('calc_line_total')
        )

    write_lazy(df, out_fp)
    print('order_details ->', out_fp.name)
else:
    print('order_details_raw.parquet not found, skipped.')


  current_cols = set(df.columns) if isinstance(df, pl.DataFrame) else set(df.columns)
  return df.with_columns(pl.when(pl.col(col).is_not_null()).then(pl.col(col).cast(dtype, strict=False)).otherwise(pl.lit(None)).alias(col)) if col in df.columns else df
  if 'amount' in df.columns:
  if col in df.columns:
  if {'amount', 'unit_price', 'total_price'}.issubset(df.columns):


order_details -> order_details_clean.parquet


In [8]:
# CATEGORIES
raw_fp = DATA_PARQUET / 'categories_raw.parquet'
out_fp = DATA_CLEAN / 'categories_clean.parquet'

if raw_fp.exists():
    df = pl.scan_parquet(p(raw_fp))
    df = rename_if_present(df, {
        'ITEMID': 'item_id',
        'CATEGORY1': 'category1',
        'CATEGORY1_ID': 'category1_id',
        'CATEGORY2': 'category2',
        'CATEGORY2_ID': 'category2_id',
        'CATEGORY3': 'category3',
        'CATEGORY3_ID': 'category3_id',
        'CATEGORY4': 'category4',
        'CATEGORY4_ID': 'category4_id',
        'BRAND': 'brand',
        'ITEMCODE': 'item_code',
        'ITEMNAME': 'item_name',
    })

    # Làm sạch chuỗi cho tất cả các cột text
    text_cols = [
        'category1', 'category2', 'category3', 'category4',
        'brand', 'item_name'
    ]
    df = strip_all_str(df, text_cols)

    # Ép kiểu ID về chuỗi
    id_cols = ['item_id', 'item_code', 'category1_id', 'category2_id', 'category3_id', 'category4_id']
    for c in id_cols:
        df = cast_if_present(df, c, pl.Utf8)

    # Loại bỏ dòng trùng item_id nếu có
    if 'item_id' in df.columns:
        df = df.unique(subset=['item_id'])

    write_lazy(df, out_fp)
    print('categories ->', out_fp.name)
else:
    print('categories_raw.parquet not found, skipped.')

categories -> categories_clean.parquet


  current_cols = set(df.columns) if isinstance(df, pl.DataFrame) else set(df.columns)
  ex_cols = [c for c in cols if c in df.columns]
  return df.with_columns(pl.when(pl.col(col).is_not_null()).then(pl.col(col).cast(dtype, strict=False)).otherwise(pl.lit(None)).alias(col)) if col in df.columns else df
  if 'item_id' in df.columns:


# **Kiểm tra nhanh thông tin các bảng**

In [9]:
def safe_scan(path: Path):
    return pl.scan_parquet(p(path)) if path.exists() else None

checks = {}
for name in ['branches', 'customers', 'orders', 'order_details', 'categories']:
    fp = DATA_CLEAN / f'{name}_clean.parquet'
    df = safe_scan(fp)
    if df is None:
        print(f'Missing {fp.name}')
        continue
    head = df.head(5).collect()
    rows = df.select(pl.len()).collect().item()
    checks[name] = {'rows': rows, 'sample': head}

    print(f'— {name}: {rows} rows')
    print(head)

def check_unique(df_lazy: pl.LazyFrame, key: str):
    if df_lazy is None or key not in df_lazy.collect_schema().names():
        return None

    dup = (
        df_lazy
        .group_by(key)
        .agg(pl.len().alias("count"))
        .filter(pl.col("count") > 1)
        .limit(5)
        .collect()
    )
    return dup

for name, key in [('branches','branch_id'), ('customers','customer_id'), ('orders','order_id'), ('categories','category_id')]:
    fp = DATA_CLEAN / f'{name}_clean.parquet'
    df = safe_scan(fp)
    if df is None:
        continue
    dup = check_unique(df, key)
    if dup is not None and dup.height > 0:
        print(f'Duplicates in {name}.{key}:')
        print(dup)
    else:
        print(f'{name}.{key} appears unique')

print('Validation pass finished.')

— branches: 161 rows
shape: (5, 7)
┌───────────┬────────────┬───────────┬───────────────────┬───────────────────┬──────────┬──────────┐
│ branch_id ┆ region     ┆ city      ┆ town              ┆ branch_town       ┆ lat      ┆ lon      │
│ ---       ┆ ---        ┆ ---       ┆ ---               ┆ ---               ┆ ---      ┆ ---      │
│ str       ┆ str        ┆ str       ┆ str               ┆ str               ┆ f64      ┆ f64      │
╞═══════════╪════════════╪═══════════╪═══════════════════╪═══════════════════╪══════════╪══════════╡
│ 538-KA2   ┆ İç Anadolu ┆ Kayseri   ┆ DEVELİ            ┆ DEVELİ            ┆ 3.8385e9 ┆ 3.5500e9 │
│ 132-IS2   ┆ Akdeniz    ┆ Isparta   ┆ GELENDOST         ┆ YALVAÇ            ┆ 3.8124e9 ┆ 3.1011e9 │
│ 538-KA1   ┆ İç Anadolu ┆ Kayseri   ┆ AKKIŞLA           ┆ BÜNYAN            ┆ 3.9000e9 ┆ 3.6167e9 │
│ 717-ÇA1   ┆ Marmara    ┆ Çanakkale ┆ AYVACIK/ÇANAKKALE ┆ AYVACIK/ÇANAKKALE ┆ 3.9601e9 ┆ 2.6405e9 │
│ 310-BA1   ┆ Ege        ┆ Balıkesir ┆ BALIKESİR MERKEZ 

# **Tiền xử lý dữ liệu từng file**

In [11]:
# BRANCHES
DATA_CLEAN = Path(r"C:\Users\pitou\Desktop\Data Mining\data_clean")
branches_fp = DATA_CLEAN / "branches_clean.parquet"

if branches_fp.exists():
    df = pl.read_parquet(str(branches_fp))

    # Scale lat/lon
    df = df.with_columns([
        (pl.col("lat") / 1e8).alias("lat"),
        (pl.col("lon") / 1e8).alias("lon")
    ])

    # Chuẩn hóa chuỗi, thay null
    df = df.with_columns([
        pl.col("region").fill_null("Unknown").str.strip_chars(),
        pl.col("city").fill_null("Unknown").str.strip_chars(),
        pl.col("town").fill_null("Unknown").str.strip_chars(),
    ])

    # Loại dòng thiếu branch_id
    df = df.filter(pl.col("branch_id").is_not_null())

    # Ghi đè lại file clean
    df.write_parquet(str(branches_fp))
    print("Đã tiền xử lý xong branches")
else:
    print("Không tìm thấy branches_clean.parquet")


Đã tiền xử lý xong branches


In [12]:
# ORDERS
orders_fp = DATA_CLEAN / "orders_clean.parquet"

if orders_fp.exists():
    df = pl.read_parquet(str(orders_fp))

    # Loại order thiếu user_id hoặc branch_id
    df = df.filter(
        pl.col("user_id").is_not_null() & pl.col("branch_id").is_not_null()
    )

    # Loại order có total_basket <= 0 hoặc null
    if "total_basket" in df.columns:
        df = df.filter(pl.col("total_basket").is_not_null() & (pl.col("total_basket") > 0))

    if "order_date" in df.columns:
        df = df.with_columns([
            pl.col("order_date").dt.year().alias("order_year"),
            pl.col("order_date").dt.month().alias("order_month"),
            pl.col("order_date").dt.weekday().alias("order_weekday")
        ])

    df.write_parquet(str(orders_fp))
    print("Đã tiền xử lý xong orders")
else:
    print("Không tìm thấy orders_clean.parquet")

Đã tiền xử lý xong orders


In [13]:
# ORDER_DETAILS
details_fp = DATA_CLEAN / "order_details_clean.parquet"

if details_fp.exists():
    df = pl.read_parquet(str(details_fp))

    # Loại dòng amount <= 0 hoặc null
    df = df.filter(pl.col("amount").is_not_null() & (pl.col("amount") > 0))

    # Loại dòng unit_price hoặc total_price <= 0 hoặc null
    df = df.filter(
        pl.col("unit_price").is_not_null() & (pl.col("unit_price") > 0) &
        pl.col("total_price").is_not_null() & (pl.col("total_price") > 0)
    )

    # Kiểm tra sai lệch giữa total_price và amount*unit_price
    df = df.with_columns(
        (pl.col("amount") * pl.col("unit_price") - pl.col("total_price")).alias("price_diff")
    )
    df = df.filter(pl.col("price_diff").abs() < 1e-2)  # loại dòng sai số lớn

    df = df.drop("price_diff")

    df.write_parquet(str(details_fp))
    print("Đã tiền xử lý xong order_details")
else:
    print("Không tìm thấy order_details_clean.parquet")

Đã tiền xử lý xong order_details


In [14]:
# CATEGORIES
categories_fp = DATA_CLEAN / "categories_clean.parquet"

if categories_fp.exists():
    df = pl.read_parquet(str(categories_fp))

    # Loại null item_id
    df = df.filter(pl.col("item_id").is_not_null())

    # Chuẩn hóa category / brand / item_name
    for c in ["category1","category2","category3","category4","brand","item_name"]:
        if c in df.columns:
            df = df.with_columns(pl.col(c).fill_null("Unknown").str.strip_chars())

    df.write_parquet(str(categories_fp))
    print("Đã tiền xử lý xong categories")
else:
    print("Không tìm thấy categories_clean.parquet")

Đã tiền xử lý xong categories


In [15]:
DATA_CLEAN = Path(r"C:\Users\pitou\Desktop\Data Mining\data_clean")
od = (DATA_CLEAN / "order_details_clean.parquet").as_posix()
cat = (DATA_CLEAN / "categories_clean.parquet").as_posix()

con = duckdb.connect()

# Tỷ lệ dòng có unit_price = 0
res1 = con.execute(f"""
SELECT 
  COUNT(*) AS n_all,
  SUM(CASE WHEN unit_price = 0 THEN 1 ELSE 0 END) AS n_zero,
  100.0 * SUM(CASE WHEN unit_price = 0 THEN 1 ELSE 0 END) / COUNT(*) AS pct_zero
FROM parquet_scan('{od}');
""").fetchdf()
print(res1)

# unit_price = 0 nhưng total_price > 0
res2 = con.execute(f"""
SELECT 
  COUNT(*) AS n_sus,
  SUM(total_price) AS sum_total_price_sus
FROM parquet_scan('{od}')
WHERE unit_price = 0 AND total_price > 0;
""").fetchdf()
print(res2)

# unit_price = 0 và total_price = 0 (amount > 0)
res3 = con.execute(f"""
SELECT 
  COUNT(*) AS n_free,
  SUM(amount) AS qty_free
FROM parquet_scan('{od}')
WHERE unit_price = 0 AND total_price = 0 AND amount > 0;
""").fetchdf()
print(res3)

# Top mặt hàng có unit_price = 0
res4 = con.execute(f"""
SELECT d.item_id, c.item_name, c.brand, COUNT(*) AS cnt
FROM parquet_scan('{od}') d
LEFT JOIN parquet_scan('{cat}') c ON d.item_id = c.item_id
WHERE d.unit_price = 0
GROUP BY 1,2,3
ORDER BY cnt DESC
LIMIT 20;
""").fetchdf()
print(res4)

con.close()


   n_all  n_zero  pct_zero
0  69125     0.0       0.0
   n_sus  sum_total_price_sus
0      0                  NaN
   n_free  qty_free
0       0       NaN
Empty DataFrame
Columns: [item_id, item_name, brand, cnt]
Index: []
