In [1]:
from collections import Counter

import numpy as np
import pandas as pd

from src.path import DataPaths
from tools.type_check import print_detailed_info


# Load the dataset from the parquet file referenced by DataPaths.file_parquet_clean.
df = pd.read_parquet(DataPaths.file_parquet_clean)
# Print the project helper's overview of the frame; judging by the output below
# it reports the row/column counts plus the actual Python type and a preview
# value for each column (helper is defined outside this notebook).
print_detailed_info(df)

Original dataset: 21,946 rows
Number of columns in the DataFrame: 31
Data columns (total 31 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   unspsc                               str             Sewing and stitchery and weaving equipme...
 1   root_domain                          str             studio-atcoat.com
 2   page_url                             str             https://studio-atcoat.com/1372696759/?id...
 3   product_title                        str             Glimakra Warping Board (8m)
 4   product_summary                      str             The Glimakra Warping Board is designed f...
 5   product_name                         str             Warping Board
 6   product_identifier                   ndarray(0,)     []
 7   brand                                str             CST
 8   intended_industries                  ndarray(1,)     [

In [2]:
# Columns whose cells hold flat arrays of scalar values.
# The trailing comment on each entry is the column's position in the DataFrame.
# NOTE: every entry needs its trailing comma -- adjacent string literals are
# silently concatenated by Python, which would merge two column names into one.
simple_arrays = [
    'product_identifier',                    # column 6
    'intended_industries',                   # column 8
    'applicability',                         # column 9
    'ethical_and_sustainability_practices',  # column 11
    'materials',                             # column 14
    'ingredients',                           # column 15
    'manufacturing_countries',               # column 16
    'manufacturing_type',                    # column 18
    'customization',                         # column 19
    'packaging_type',                        # column 20
    'form',                                  # column 21
    'quality_standards_and_certifications',  # column 28
    'miscellaneous_features',                # column 29
]

# Columns whose cells hold arrays of dictionaries rather than plain scalars.
# Each trailing comment records the column's position in the DataFrame.
dictionary_arrays = [
    "production_capacity",  # column 12
    "price",  # column 13
    "size",  # column 22
    "color",  # column 23
    "purity",  # column 24
    "energy_efficiency",  # column 25
    "pressure_rating",  # column 26
    "power_rating",  # column 27
]

In [3]:
# Helper + report function used to analyze a single dictionary-array column
def _dict_key_sets(value):
    """Return one set of keys for every dictionary found in ``value``.

    ``value`` may be a dictionary itself, or a list/tuple/ndarray whose items
    are inspected one by one (non-dictionary items are ignored). Any other
    kind of value yields an empty list.
    """
    if isinstance(value, dict):
        return [set(value.keys())]
    if isinstance(value, (list, tuple, np.ndarray)):
        return [set(item.keys()) for item in value if isinstance(item, dict)]
    return []


def analyze_dict_array_column(df, column_name):
    """Print a report on the dictionary keys used in one column of ``df``.

    Every cell of the column is scanned; ``None`` cells and cells with
    ``len(...) == 0`` are skipped. For the remaining ("non-empty") cells the
    keys of each contained dictionary are tallied, and the number of distinct
    keys per row is recorded.

    Args:
        df: Object indexable by ``column_name`` that yields a pandas Series
            (normally a DataFrame).
        column_name: Name of the column to analyze.

    Returns:
        None. All results are written to stdout. Any exception raised while
        analyzing the column is caught and reported as a printed message, so
        one bad column does not stop a loop over several columns.
    """
    print(f"\n{'='*80}")
    print(f"ANALYSIS FOR COLUMN: {column_name}")
    print(f"{'='*80}")

    try:
        column_data = df[column_name]
        print(f"Column type: {type(column_data)}")
        print(f"Total rows: {len(column_data)}")

        # Nothing to report for a zero-length column
        if column_data.empty:
            print(f"Column '{column_name}' is empty")
            return

        # Show what kind of object the cells contain
        sample = column_data.iloc[0]
        print(f"Sample value type: {type(sample)}")

        all_keys = Counter()      # key -> number of dictionaries containing it
        key_counts = Counter()    # distinct keys in a row -> number of such rows
        rows_with_errors = []     # (index label, error message) pairs
        rows_analyzed = 0
        non_empty_rows = 0

        for idx, array in column_data.items():
            rows_analyzed += 1

            # Skip None or empty arrays
            if array is None or (hasattr(array, '__len__') and len(array) == 0):
                continue

            non_empty_rows += 1
            try:
                row_keys = set()
                for item_keys in _dict_key_sets(array):
                    row_keys |= item_keys
                    all_keys.update(item_keys)

                # Only rows that actually contained dictionary keys are counted
                if row_keys:
                    key_counts[len(row_keys)] += 1

            except Exception as e:
                # Keep going: a single malformed row should not abort the report
                rows_with_errors.append((idx, str(e)))

        # Display results
        print(f"\n--- All Dictionary Keys Present in '{column_name}' ---")
        for key, count in sorted(all_keys.items()):
            print(f"'{key}': found in {count} dictionaries")

        print(f"\n--- Distribution of Key Counts in '{column_name}' ---")
        for count, frequency in sorted(key_counts.items()):
            percentage = (frequency / non_empty_rows) * 100 if non_empty_rows > 0 else 0
            print(f"Rows with {count} keys: {frequency} ({percentage:.2f}%)")

        if not key_counts:
            # Without this branch an all-empty column would be reported as
            # "all rows have the same number of keys", which is misleading.
            print(f"\nNo dictionary keys were found in '{column_name}'.")
        elif len(key_counts) > 1:
            print(f"\nThere are variations in the number of keys per row in '{column_name}'!")
        else:
            print(f"\nAll rows have the same number of keys in '{column_name}'.")

        print(f"\nTotal rows analyzed: {rows_analyzed}")
        print(f"Non-empty rows: {non_empty_rows}")
        print(f"Rows with errors: {len(rows_with_errors)}")
        # Show a few of the collected errors so they can be investigated
        for idx, message in rows_with_errors[:5]:
            print(f"  row {idx!r}: {message}")

    except Exception as e:
        print(f"Error analyzing column '{column_name}': {e}")

# Analyze all specified columns (detailed per-column report)
for column_name in dictionary_arrays:
    analyze_dict_array_column(df, column_name)

# Generate a summary table of all columns and their keys
print("\n\n")
print(f"{'='*120}")
print("SUMMARY OF ALL DICTIONARY KEYS ACROSS COLUMNS")
print(f"{'='*120}")

# One record per column: its name, sorted key names, number of rows that
# contained at least one dictionary, and the per-key occurrence counts.
summary_data = []

for column_name in dictionary_arrays:
    try:
        column_data = df[column_name]
        all_keys = {}
        rows_with_keys = 0
        rows_skipped = 0  # rows whose content raised while being inspected

        for array in column_data:
            # Skip None or empty arrays
            if array is None or (hasattr(array, '__len__') and len(array) == 0):
                continue

            row_has_keys = False
            try:
                if isinstance(array, (list, tuple, np.ndarray)):
                    for item in array:
                        if isinstance(item, dict):
                            for key in item.keys():
                                all_keys[key] = all_keys.get(key, 0) + 1
                            row_has_keys = True
                elif isinstance(array, dict):
                    for key in array.keys():
                        all_keys[key] = all_keys.get(key, 0) + 1
                    row_has_keys = True
            except Exception:
                # Best effort: keep summarizing, but do not hide the problem
                # (and do not swallow KeyboardInterrupt/SystemExit).
                rows_skipped += 1

            if row_has_keys:
                rows_with_keys += 1

        if rows_skipped:
            print(f"Warning: {rows_skipped} row(s) in '{column_name}' could not be fully inspected")

        summary_data.append({
            'column': column_name,
            'keys': sorted(all_keys.keys()),
            'rows_with_keys': rows_with_keys,
            'key_counts': all_keys
        })
    except Exception as e:
        print(f"Error summarizing column '{column_name}': {e}")

# Print the summary table
print(f"{'Column':<35} | {'Keys Present':<50} | {'Rows With Keys':<15}")
print(f"{'-'*35}-+-{'-'*50}-+-{'-'*15}")

for data in summary_data:
    keys_str = ", ".join(data['keys'])
    # Shorten long key lists so the table columns stay aligned
    if len(keys_str) > 47:
        keys_str = keys_str[:44] + "..."
    print(f"{data['column']:<35} | {keys_str:<50} | {data['rows_with_keys']:<15}")

print("\nDetailed key distribution by column:")
for data in summary_data:
    print(f"\n{data['column']}:")
    for key, count in sorted(data['key_counts'].items()):
        print(f"  '{key}': {count}")


ANALYSIS FOR COLUMN: production_capacity
Column type: <class 'pandas.core.series.Series'>
Total rows: 21946
Sample value type: <class 'numpy.ndarray'>

--- All Dictionary Keys Present in 'production_capacity' ---
'quantity': found in 36 dictionaries
'time_frame': found in 36 dictionaries
'type': found in 36 dictionaries
'unit': found in 36 dictionaries

--- Distribution of Key Counts in 'production_capacity' ---
Rows with 4 keys: 35 (100.00%)

All rows have the same number of keys in 'production_capacity'.

Total rows analyzed: 21946
Non-empty rows: 35
Rows with errors: 0

ANALYSIS FOR COLUMN: price
Column type: <class 'pandas.core.series.Series'>
Total rows: 21946
Sample value type: <class 'numpy.ndarray'>

--- All Dictionary Keys Present in 'price' ---
'amount': found in 8154 dictionaries
'currency': found in 8154 dictionaries
'type': found in 8154 dictionaries

--- Distribution of Key Counts in 'price' ---
Rows with 3 keys: 6522 (100.00%)

All rows have the same number of keys in '