# T-ECD Dataset Analysis

This notebook loads, concatenates, and analyzes the T-ECD dataset from Hugging Face.
It handles partitioned datasets (e.g., daily event files) by loading a configurable number of partitions and concatenating them.
It also includes the **Payments** dataset from the 'full' partition, as it is missing from the 'small' partition.

In [None]:
# Install necessary libraries
!pip install huggingface_hub pandas pyarrow ipywidgets matplotlib seaborn

## Authentication
You need a Hugging Face token to access the dataset. 
1. Go to https://huggingface.co/settings/tokens
2. Create a new token (Read access is sufficient)
3. Paste it below when prompted.

In [None]:
from huggingface_hub import login

# Start the Hugging Face login flow; paste your access token when prompted.
login()

## Configuration & Helper Functions

We define global constants that control the data loading process, along with helper functions for downloading, concatenating, and analyzing the data.

In [None]:
from huggingface_hub import hf_hub_download, list_repo_files
import pandas as pd
import os
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict

# --- CONFIGURATION ---
# Hugging Face repository that hosts the data, and its repository type.
REPO_ID = "t-tech/T-ECD"
REPO_TYPE = "dataset"
CACHE_DIR = "dataset_cache"  # Local folder to store downloaded files

# Dataset Paths (directory prefixes inside the repository)
DATASET_PATH_SMALL = "dataset/small"
DATASET_PATH_FULL = "dataset/full" # Used for missing payments data

# GLOBAL CONSTANT: How many partitions to load for split datasets (e.g., events)
# Set this to a higher number (e.g., 50 or 100) to analyze more data.
# Set to None to load ALL available partitions (Warning: May run out of RAM in Colab)
# NOTE: this value is captured as the default of `limit` when
# load_dataframe_from_partitions is defined, so re-run this whole cell after
# changing it.
NUM_PARTITIONS_TO_LOAD = 5 

def load_remote_parquet(filename):
    """
    Fetch one parquet file from the HF repo and return it as a Pandas DataFrame.

    The file is downloaded into CACHE_DIR and then read with pandas. Any
    failure, whether while downloading or while parsing the file, is reported
    on stdout and signalled by returning None instead of raising.
    """
    print(f"Downloading {filename} to {CACHE_DIR}...")
    download_options = {
        "repo_id": REPO_ID,
        "filename": filename,
        "repo_type": REPO_TYPE,
        "local_dir": CACHE_DIR,
        "local_dir_use_symlinks": False,
    }
    try:
        # Both steps stay inside the try: a file that downloads fine but
        # cannot be parsed must also yield None.
        cached_file = hf_hub_download(**download_options)
        return pd.read_parquet(cached_file)
    except Exception as err:
        print(f"Error loading {filename}: {err}")
    return None

def load_dataframe_from_partitions(file_list, limit=NUM_PARTITIONS_TO_LOAD):
    """
    Loads multiple parquet files from a list and concatenates them into a single DataFrame.

    Args:
        file_list: Repo-relative paths of the partition files. They are sorted
            lexicographically before loading, so with a limit the first
            `limit` names in sorted order are taken.
        limit: Maximum number of partitions to load. None loads every
            partition. Must not be negative.

    Returns:
        The concatenated DataFrame (with a fresh integer index), or None when
        `file_list` is empty or no partition could be loaded.

    Raises:
        ValueError: If `limit` is negative.
    """
    if not file_list:
        print("No files provided to load.")
        return None

    # A negative bound would not fail: slicing would silently drop partitions
    # from the END of the list, which is never what the caller wants.
    if limit is not None and limit < 0:
        raise ValueError(f"limit must be None or a non-negative integer, got {limit!r}")

    # Sort files to ensure order (e.g., by date)
    sorted_files = sorted(file_list)

    # Apply limit
    if limit is not None:
        files_to_load = sorted_files[:limit]
        print(f"Loading {len(files_to_load)} partitions (out of {len(sorted_files)} available)...")
    else:
        files_to_load = sorted_files
        print(f"Loading ALL {len(files_to_load)} partitions...")

    dfs = []
    failed_files = []
    for f in files_to_load:
        df = load_remote_parquet(f)
        if df is not None:
            dfs.append(df)
        else:
            failed_files.append(f)

    # Make partial loads visible: the individual errors are printed by
    # load_remote_parquet, but they are easy to miss among many downloads.
    if failed_files:
        print(
            f"Warning: {len(failed_files)} of {len(files_to_load)} partitions "
            "failed to load; the result will be incomplete."
        )

    if not dfs:
        return None

    print("Concatenating partitions...")
    full_df = pd.concat(dfs, ignore_index=True)
    return full_df

def analyze_dataframe(df, name="DataFrame"):
    """
    Performs standard data analysis steps from the lecture notes,
    including statistical summary, missing values, duplicates, and visualizations.
    """
    if df is None:
        print(f"{name} is None, skipping analysis.")
        return

    print(f"\n" + "="*20 + f" ANALYZING: {name} " + "="*20)
    print(f"Shape: {df.shape}")
    
    print("\n1. Head (First 5 rows):")
    display(df.head())
    
    print("\n2. Info (Data Types & Non-Null Counts):")
    df.info()
    
    print("\n3. Describe (Statistical Summary for Numeric Columns):")
    display(df.describe())
    
    print("\n4. Missing Values (NaN Count):")
    print(df.isnull().sum())
    
    print("\n5. Duplicates Count:")
    try:
        print(df.duplicated().sum())
    except TypeError:
        print("  Warning: Unable to check for duplicates due to unhashable types (e.g., embeddings).")
        if 'embedding' in df.columns:
            print("  Retrying without 'embedding' column:")
            print(f"  {df.drop(columns=['embedding']).duplicated().sum()}")
    
    print("\n6. Column Value Counts (Top 5 unique values for object columns):")
    cat_cols = df.select_dtypes(include=['object', 'category']).columns
    for col in cat_cols:
        if col == "embedding":
            print(f"  Skipping value counts for '{col}' column (performance optimization).")
            continue
        print(f"\nColumn: {col}")
        print(df[col].value_counts().head())

    print("\n7. Visualizations:")
    
    # Sample for visualization to avoid timeouts on large datasets
    SAMPLE_SIZE = 10000
    if len(df) > SAMPLE_SIZE:
        print(f"  (Using a random sample of {SAMPLE_SIZE} rows for plotting to improve performance)")
        plot_df = df.sample(SAMPLE_SIZE)
    else:
        plot_df = df

    # Numeric Distributions & Timedelta
    num_cols = df.select_dtypes(include=['number', 'timedelta']).columns
    if len(num_cols) > 0:
        print(f"  - Plotting distributions for numeric/timedelta columns: {list(num_cols)}")
        for col in num_cols:
            # Skip specific columns that cause issues or are not useful distributions
            if "id" in col.lower() and df[col].nunique() > 1000:
                 print(f"    Skipping distribution plot for {col} (likely an ID with high cardinality).")
                 continue
            
            plt.figure(figsize=(8, 4))
            series_to_plot = plot_df[col].dropna()
            
            # Handle timedelta: Convert to total seconds
            if pd.api.types.is_timedelta64_dtype(series_to_plot):
                print(f"    Converting {col} to total seconds for plotting.")
                series_to_plot = series_to_plot.dt.total_seconds()
            
            sns.histplot(series_to_plot, kde=True, bins=30)
            plt.title(f"Distribution of {col}")
            plt.xlabel(col)
            plt.ylabel("Frequency")
            plt.show()
            
    # Categorical Counts (Top 10)
    if len(cat_cols) > 0:
        print(f"  - Plotting counts for categorical columns: {list(cat_cols)}")
        for col in cat_cols:
            if col == "embedding":
                continue # Skip embedding plots
            if df[col].nunique() > 50: # Skip if too many unique values
                print(f"    Skipping plot for {col} (too many unique values: {df[col].nunique()})")
                continue
            plt.figure(figsize=(10, 5))
            # Use full dataframe for determining the top 10 order to be accurate
            top_10_order = df[col].value_counts().iloc[:10].index
            # Plot using the sample (or full if small)
            sns.countplot(y=col, data=plot_df, order=top_10_order)
            plt.title(f"Top 10 Counts for {col}")
            plt.xlabel("Count")
            plt.ylabel(col)
            plt.show()
    
    print("\n" + "="*60 + "\n")

## 1. Explore Repository & Index Files
We list all files once and categorize them to avoid repeated API calls.

In [None]:
# Fetch the complete repository listing with a single API call.
all_files = list_repo_files(repo_id=REPO_ID, repo_type=REPO_TYPE)

# Index parquet files by the directory that contains them, e.g.
#   "dataset/small/retail/events" -> ["dataset/small/retail/events/01082.pq", ...]
dataset_files = defaultdict(list)

for repo_path in all_files:
    if not repo_path.endswith(".pq"):
        continue
    parent_dir = os.path.dirname(repo_path).replace("\\", "/")  # Normalize path separators
    dataset_files[parent_dir].append(repo_path)

print("File Index Created. Available Directories:")
for directory in sorted(dataset_files):
    print(f" - {directory} ({len(dataset_files[directory])} files)")

## 2. Static Data Analysis (Users & Brands)
These are single files found in the root of the dataset partition.

In [None]:
# --- USERS ---
# Single (non-partitioned) file in the 'small' dataset.
# df_users is None if the download or parse fails; the analysis is then skipped.
users_path = f"{DATASET_PATH_SMALL}/users.pq"
df_users = load_remote_parquet(users_path)
analyze_dataframe(df_users, "Users Data")

In [None]:
# --- BRANDS ---
brands_path = f"{DATASET_PATH_SMALL}/brands.pq"
df_brands = load_remote_parquet(brands_path)

# Handle broken brands file (empty embeddings)
# load_remote_parquet returns None when reading fails, even if the download
# itself succeeded. In that case, re-read the already cached file and request
# only the 'brand_id' column.
if df_brands is None:
    print("Attempting to load Brands without 'embedding' column due to schema error...")
    # Expected location of the downloaded file inside the local cache folder.
    local_path = f"{CACHE_DIR}/{brands_path}"
    if os.path.exists(local_path):
        try:
            df_brands = pd.read_parquet(local_path, columns=['brand_id'])
            print("Successfully loaded Brands data (excluding embeddings).")
        except Exception as e:
            print(f"Fallback load failed: {e}")

# If the fallback also failed, df_brands is still None and the analysis is skipped.
analyze_dataframe(df_brands, "Brands Data")

## 3. Retail Domain Analysis
Contains `items` (static) and `events` (partitioned).

In [None]:
# --- RETAIL ITEMS ---
# Single (non-partitioned) file; df_retail_items is None if loading fails.
retail_items_path = f"{DATASET_PATH_SMALL}/retail/items.pq"
df_retail_items = load_remote_parquet(retail_items_path)
analyze_dataframe(df_retail_items, "Retail Items")

In [None]:
# --- RETAIL EVENTS ---
# Look up the partition files in the index built earlier; an unknown directory
# yields an empty list, in which case nothing is loaded and the analysis is skipped.
retail_events_dir = f"{DATASET_PATH_SMALL}/retail/events"
retail_event_files = dataset_files.get(retail_events_dir, [])

# Uses the default partition limit of load_dataframe_from_partitions.
df_retail_events = load_dataframe_from_partitions(retail_event_files)
analyze_dataframe(df_retail_events, "Retail Events (Joined)")

## 4. Marketplace Domain Analysis
Contains `items` (static) and `events` (partitioned).

In [None]:
# --- MARKETPLACE ITEMS ---
# Single (non-partitioned) file; df_mp_items is None if loading fails.
mp_items_path = f"{DATASET_PATH_SMALL}/marketplace/items.pq"
df_mp_items = load_remote_parquet(mp_items_path)
analyze_dataframe(df_mp_items, "Marketplace Items")

In [None]:
# --- MARKETPLACE EVENTS ---
# Look up the partition files in the index built earlier; an unknown directory
# yields an empty list, in which case nothing is loaded and the analysis is skipped.
mp_events_dir = f"{DATASET_PATH_SMALL}/marketplace/events"
mp_event_files = dataset_files.get(mp_events_dir, [])

# Uses the default partition limit of load_dataframe_from_partitions.
df_mp_events = load_dataframe_from_partitions(mp_event_files)
analyze_dataframe(df_mp_events, "Marketplace Events (Joined)")

## 5. Offers Domain Analysis
Contains `items` (static) and `events` (partitioned).

In [None]:
# --- OFFERS ITEMS ---
# Single (non-partitioned) file; df_offers_items is None if loading fails.
offers_items_path = f"{DATASET_PATH_SMALL}/offers/items.pq"
df_offers_items = load_remote_parquet(offers_items_path)
analyze_dataframe(df_offers_items, "Offers Items")

In [None]:
# --- OFFERS EVENTS ---
# Look up the partition files in the index built earlier; an unknown directory
# yields an empty list, in which case nothing is loaded and the analysis is skipped.
offers_events_dir = f"{DATASET_PATH_SMALL}/offers/events"
offers_event_files = dataset_files.get(offers_events_dir, [])

# Uses the default partition limit of load_dataframe_from_partitions.
df_offers_events = load_dataframe_from_partitions(offers_event_files)
analyze_dataframe(df_offers_events, "Offers Events (Joined)")

## 6. Reviews Domain Analysis
Reviews are partitioned by day directly in the folder.

In [None]:
# --- REVIEWS ---
# The partition files are looked up directly under the reviews directory
# (there is no 'events' subfolder in this path). An unknown directory yields an
# empty list, in which case nothing is loaded and the analysis is skipped.
reviews_dir = f"{DATASET_PATH_SMALL}/reviews"
reviews_files = dataset_files.get(reviews_dir, [])

# Uses the default partition limit of load_dataframe_from_partitions.
df_reviews = load_dataframe_from_partitions(reviews_files)
analyze_dataframe(df_reviews, "Reviews (Joined)")

## 7. Payments Domain Analysis (From 'Full' Dataset)
**Note:** The `payments` data is missing from the `dataset/small` partition. 
We will load a sample from `dataset/full` to ensure we cover this domain in our analysis.

In [None]:
# --- PAYMENTS EVENTS ---
# Payments are read from the 'full' dataset path rather than the 'small' one.
pay_events_dir = f"{DATASET_PATH_FULL}/payments/events"
pay_event_files = dataset_files.get(pay_events_dir, [])

print(f"Found {len(pay_event_files)} payment event files in FULL dataset. Loading sample...")
# "Sample" here means the first partitions in sorted order, capped by the
# default partition limit of load_dataframe_from_partitions.
df_pay_events = load_dataframe_from_partitions(pay_event_files)
analyze_dataframe(df_pay_events, "Payments Events (Sample from Full)")

In [None]:
# --- PAYMENTS RECEIPTS ---
# Payments are read from the 'full' dataset path rather than the 'small' one.
pay_receipts_dir = f"{DATASET_PATH_FULL}/payments/receipts"
pay_receipts_files = dataset_files.get(pay_receipts_dir, [])

print(f"Found {len(pay_receipts_files)} payment receipt files in FULL dataset. Loading sample...")
# "Sample" here means the first partitions in sorted order, capped by the
# default partition limit of load_dataframe_from_partitions.
df_pay_receipts = load_dataframe_from_partitions(pay_receipts_files)
analyze_dataframe(df_pay_receipts, "Payments Receipts (Sample from Full)")