# 01_EDA — Exploratory Data Analysis (Structured Features)

This notebook explores the structured dataset for HS-code classification.  
We **do not** use raw text. Inputs are:
- `tags` (comma-separated keywords)
- numeric: `price` (CAD), `weight` (kg)
- categorical: `origin`, `dest`, `gift` (0/1)
- label: `label_id` (maps to `hs_mapping.csv`)

Outputs in this notebook:
- sanity checks (schema, missing values)
- class balance per `label_id` (with human-readable HS titles)
- top tags
- price and weight distributions
- clean vs noisy comparisons
- simple cross-tabs (label × selected tags)


In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Locations used throughout the notebook: input CSVs and saved figures.
DATA = Path('../data')
FIGS = Path('./figs')
# Create the figure folder on first run; a no-op when it already exists.
FIGS.mkdir(parents=True, exist_ok=True)

# Keep rendered DataFrames compact in the notebook output.
pd.options.display.max_rows = 20
pd.options.display.max_colwidth = 80

In [None]:
# Load the clean and noisy sample sets plus the label_id -> HS title table.
clean = pd.read_csv(DATA/'samples_clean.csv')
noisy = pd.read_csv(DATA/'samples_noisy.csv')
mapping = pd.read_csv(DATA/'hs_mapping.csv')

# Basic schema checks: list every expected column that a file lacks.
required_cols = ['id','description','tags','price','weight','origin','dest','gift','label_id']
missing = [c for c in required_cols if c not in clean.columns]
print("Missing columns in clean:", missing)

missing_noisy = [c for c in required_cols if c not in noisy.columns]
print("Missing columns in noisy:", missing_noisy)

# Missing-value check: number of NaN cells per column in each file.
# The notebook intro lists this sanity check, so report it alongside the schema.
print("\nMissing values per column (clean):")
print(clean.isna().sum().to_string())
print("\nMissing values per column (noisy):")
print(noisy.isna().sum().to_string())

print("\nClean shape:", clean.shape, "Noisy shape:", noisy.shape)
clean.head()

In [None]:
# Preview the first rows of the label_id -> HS title lookup table.
print("HS Mapping (first 10 rows):")
mapping.iloc[:10]

In [None]:
# Lookup: label_id -> human-readable HS title.
label_to_title = mapping.set_index('label_id')['title'].to_dict()

# Attach the title to every sample; ids absent from the lookup map to NaN.
clean['label_title'] = clean['label_id'].map(label_to_title)
noisy['label_title'] = noisy['label_id'].map(label_to_title)

# Class balance (clean): samples per label_id, ordered by label_id.
cls_counts_clean = clean['label_id'].value_counts().sort_index()
cls_counts_clean_df = (
    cls_counts_clean
    .rename_axis('label_id')
    .reset_index(name='count')
    .assign(title=lambda counts: counts['label_id'].map(label_to_title))
)
cls_counts_clean_df

In [None]:
# Bar chart of samples per class in the clean set; shown and saved under FIGS.
fig, ax = plt.subplots()
ax.bar(cls_counts_clean_df['label_id'].astype(str), cls_counts_clean_df['count'])
ax.set(title='Class counts (clean)', xlabel='label_id', ylabel='count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
fig.savefig(FIGS/'class_counts_clean.png', dpi=120)
plt.show()

In [None]:
# Class balance (noisy): samples per label_id, ordered by label_id.
cls_counts_noisy = noisy['label_id'].value_counts().sort_index()
cls_counts_noisy_df = (
    cls_counts_noisy
    .rename_axis('label_id')
    .reset_index(name='count')
    .assign(title=lambda counts: counts['label_id'].map(label_to_title))
)
cls_counts_noisy_df

In [None]:
# Bar chart of samples per class in the noisy set; shown and saved under FIGS.
fig, ax = plt.subplots()
ax.bar(cls_counts_noisy_df['label_id'].astype(str), cls_counts_noisy_df['count'])
ax.set(title='Class counts (noisy)', xlabel='label_id', ylabel='count')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
fig.savefig(FIGS/'class_counts_noisy.png', dpi=120)
plt.show()

In [None]:
def explode_tags(df):
    """Return one row per (sample, tag) from the comma-separated ``tags`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``id``, ``label_id`` and ``tags``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, ``label_id`` and ``tag``. Each tag is stripped of
        surrounding whitespace and empty tags are dropped. Every output row
        keeps the index label of the source row it came from.
    """
    # Replace missing tags with "" *before* the string cast; otherwise a
    # missing value is stringified (e.g. to "nan") and counted as a real tag.
    tag_lists = df['tags'].fillna('').astype(str).str.split(',')

    df_exp = df[['id', 'label_id']].copy()
    # Direct assignment (same index as df) instead of a join, so a
    # non-unique index cannot multiply rows.
    df_exp['tag'] = tag_lists
    df_exp = df_exp.explode('tag')
    df_exp['tag'] = df_exp['tag'].str.strip()
    return df_exp[df_exp['tag'] != '']

# Long-format (id, label_id, tag) view of the clean set, then the 20 most
# frequent tags in it.
tags_clean = explode_tags(clean)
top_tags = tags_clean['tag'].value_counts().iloc[:20]
top_tags

In [None]:
# Bar chart of the most frequent tags in the clean set; shown and saved under FIGS.
fig, ax = plt.subplots()
ax.bar(top_tags.index.astype(str), top_tags.values)
ax.set(title='Top 20 tags (clean)', xlabel='tag', ylabel='count')
ax.tick_params(axis='x', labelrotation=60)
fig.tight_layout()
fig.savefig(FIGS/'top_tags_clean.png', dpi=120)
plt.show()

In [None]:
# Histogram (15 bins) of the non-missing prices in the clean set; shown and
# saved under FIGS.
fig, ax = plt.subplots()
clean['price'].dropna().plot.hist(bins=15, ax=ax)
ax.set(title='Price distribution (clean)', xlabel='price (CAD)', ylabel='freq')
fig.tight_layout()
fig.savefig(FIGS/'price_hist_clean.png', dpi=120)
plt.show()

In [None]:
# Histogram (15 bins) of the non-missing weights in the clean set; shown and
# saved under FIGS.
fig, ax = plt.subplots()
clean['weight'].dropna().plot.hist(bins=15, ax=ax)
ax.set(title='Weight distribution (clean)', xlabel='weight (kg)', ylabel='freq')
fig.tight_layout()
fig.savefig(FIGS/'weight_hist_clean.png', dpi=120)
plt.show()

In [None]:
# Cross-tab: for each label_id, how many clean samples carry each of the
# most common tags.
inspect_tags = list(top_tags.index[:8])

# Per-sample set of tags, normalised the same way the tag counts were built:
# split on commas, strip whitespace, missing value -> no tags.
tag_sets = (
    clean['tags'].fillna('').astype(str).str.split(',')
    .map(lambda parts: {p.strip() for p in parts})
)

# One row per *sample* (not per label), so each 0/1 indicator stays paired
# with that sample's own label_id before grouping.
wide = clean[['label_id']].copy()
for t in inspect_tags:
    # Exact set membership: no regex, so tags with special characters are safe.
    wide[t] = tag_sets.map(lambda tags, t=t: t in tags).astype(int)

summary = wide.groupby('label_id').sum()
summary.index.name = 'label_id'
summary

In [None]:
# One-row table comparing mean price and mean weight between the clean and
# noisy sample sets.
cmp = pd.DataFrame([{
    'avg_price_clean': clean['price'].mean(),
    'avg_price_noisy': noisy['price'].mean(),
    'avg_weight_clean': clean['weight'].mean(),
    'avg_weight_noisy': noisy['weight'].mean(),
}])
cmp