In [1]:
import pandas as pd
import os
import time

from src.path import DataPaths
from src.deprecated_main import merge_by_url_and_title, merge_by_title_and_domain
from main import optimized_merge

# --- Inputs: the "original" and "final" parquet files exposed by DataPaths ---
original_path = DataPaths.file_parquet_original
final_path = DataPaths.file_parquet_final

# --- On-disk footprint, in bytes and in megabytes (1 MB = 1024 * 1024 bytes) ---
BYTES_PER_MB = 1024 * 1024
original_size_bytes = os.path.getsize(original_path)
final_size_bytes = os.path.getsize(final_path)
original_size_mb = original_size_bytes / BYTES_PER_MB
final_size_mb = final_size_bytes / BYTES_PER_MB

# --- Read both files into memory ---
print("Loading original file...")
original_df = pd.read_parquet(original_path)

print("Loading final file...")
final_df = pd.read_parquet(final_path)

# --- Row / column / size deltas ---
# Every difference is "final minus original"; every percentage is relative
# to the original value.
original_row_count, original_col_count = original_df.shape
final_row_count, final_col_count = final_df.shape

row_diff = final_row_count - original_row_count
row_diff_pct = (row_diff / original_row_count) * 100

col_diff = final_col_count - original_col_count
col_diff_pct = (col_diff / original_col_count) * 100

size_diff = final_size_mb - original_size_mb
size_diff_pct = (size_diff / original_size_mb) * 100

# --- Column names present in only one of the two frames ---
original_cols = set(original_df.columns)
final_cols = set(final_df.columns)

added_cols = final_cols.difference(original_cols)
removed_cols = original_cols.difference(final_cols)

# Benchmark the deprecated two-step merge against optimized_merge.
#
# Each run gets its own copy of original_df so the loaded frame is not
# modified by either approach.
#
# Intervals are taken with time.perf_counter() rather than time.time():
# perf_counter is a monotonic, high-resolution clock meant for measuring
# elapsed time, whereas time.time() is the wall clock, which can be adjusted
# while the cell runs and may have coarse resolution on some platforms.
print("Measuring processing speed of deprecated approach...")
deprecated_df = original_df.copy()
start_time = time.perf_counter()
# NOTE(review): the return values of both calls are discarded, so this
# presumably relies on the functions mutating deprecated_df in place --
# confirm against src.deprecated_main.
merge_by_url_and_title(deprecated_df)
merge_by_title_and_domain(deprecated_df)
deprecated_time = time.perf_counter() - start_time

print("Measuring processing speed of optimized approach...")
optimized_df = original_df.copy()
start_time = time.perf_counter()
optimized_df = optimized_merge(optimized_df)
optimized_time = time.perf_counter() - start_time

# Share of the deprecated runtime saved by the optimized approach, in percent.
# Guard the division so a zero-length baseline measurement yields NaN
# instead of raising ZeroDivisionError.
if deprecated_time > 0:
    speedup = (deprecated_time - optimized_time) / deprecated_time * 100
else:
    speedup = float("nan")

# Print the comparison table.
#
# Each cell is pre-formatted to a string and then left-padded to the column
# width. The percentage is formatted together with its "%" sign; padding the
# number first and appending "%" afterwards left the sign stranded many
# columns away from the value (e.g. "-13.63         %").
print("\n" + "="*60)
print("DATAFRAME COMPARISON")
print("="*60)
print(f"{'Metric':<20} {'Original':<15} {'Final':<15} {'Difference':<15} {'Percentage':<15}")
print("-"*80)

# (metric label, original value, final value, difference, percentage)
# For the processing-time row the "Original"/"Final" columns hold the
# deprecated and optimized runtimes, and the difference is deprecated minus
# optimized (time saved), unlike the other rows which are final minus original.
comparison_rows = [
    (
        "Rows",
        f"{len(original_df):,}",
        f"{len(final_df):,}",
        f"{row_diff:,}",
        row_diff_pct,
    ),
    (
        "Columns",
        str(len(original_df.columns)),
        str(len(final_df.columns)),
        str(col_diff),
        col_diff_pct,
    ),
    (
        "Size (MB)",
        f"{original_size_mb:.2f}",
        f"{final_size_mb:.2f}",
        f"{size_diff:.2f}",
        size_diff_pct,
    ),
    (
        "Processing Time (s)",
        f"{deprecated_time:.2f}",
        f"{optimized_time:.2f}",
        f"{deprecated_time - optimized_time:.2f}",
        speedup,
    ),
]

for metric, original_value, final_value, difference, percentage in comparison_rows:
    print(
        f"{metric:<20} {original_value:<15} {final_value:<15} "
        f"{difference:<15} {percentage:.2f}%"
    )

# Report which column names exist in only one of the two frames.
print("\n" + "="*60)
print("COLUMN CHANGES")
print("="*60)

# (heading, per-line marker, column names); a section is skipped when its
# set is empty. The "Removed" heading carries its own leading blank line.
change_sections = (
    ("Added columns", "  + ", added_cols),
    ("\nRemoved columns", "  - ", removed_cols),
)

for heading, marker, changed in change_sections:
    if not changed:
        continue
    print(f"{heading} ({len(changed)}):")
    # Sorted so the listing is stable regardless of set iteration order.
    for col in sorted(changed):
        print(f"{marker}{col}")

if not added_cols and not removed_cols:
    print("No column changes detected.")

Loading original file...
Loading final file...
Measuring processing speed of deprecated approach...
Logged 2 rows with merge errors to E:\veridion_deduplication\data\error\merge_errors.csv
Measuring processing speed of optimized approach...
Logged 2 rows with merge errors to E:\veridion_deduplication\data\error\merge_errors.csv

DATAFRAME COMPARISON
Metric               Original        Final           Difference      Percentage     
--------------------------------------------------------------------------------
Rows                 21,946          18,954          -2,992          -13.63         %
Columns              31              27              -4              -12.90         %
Size (MB)            10.72           7.24            -3.48           -32.47         %
Processing Time (s)  16.60           0.61            15.99           96.35          %

COLUMN CHANGES
Added columns (2):
  + components
  + product_description

Removed columns (6):
  - description
  - ingredients
  - manufa

In [2]:
# Per-column summary of final_df via the project helper print_detailed_info.
# NOTE(review): this import sits mid-notebook and the next cell reuses the
# name, so this cell must run first; consider moving the import to the
# top-level import cell.
# NOTE(review): the recorded output below is headed "Original dataset: ..."
# even though final_df is passed -- the label appears to be fixed inside the
# helper; confirm in tools.type_check.
from tools.type_check import print_detailed_info
print_detailed_info(final_df)


Original dataset: 18,954 rows
Number of columns in the DataFrame: 27
Data columns (total 27 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   product_description                  str             The #64 Elastic Bands 5LB is a versatile...
 1   unspsc                               str             Fastening supplies | Straps
 2   root_domain                          str             basspaper.ca
 3   page_url                             str             https://shop.basspaper.ca/Product?Catego...
 4   product_title                        str             #64 Elastic Bands 5LB
 5   brand                                str             Alpha Poster
 6   eco_friendly                         bool            True
 7   product_identifier                   ndarray(2,)     ['Product_Code: 819-0685N' 'Part_Number:...
 8   intended_industries                  nda

In [3]:
# Same per-column summary for original_df, for comparison with the final_df
# summary. Uses print_detailed_info as imported in the preceding cell.
print_detailed_info(original_df)

Original dataset: 21,946 rows
Number of columns in the DataFrame: 31
Data columns (total 31 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   unspsc                               str             Sewing and stitchery and weaving equipme...
 1   root_domain                          str             studio-atcoat.com
 2   page_url                             str             https://studio-atcoat.com/1372696759/?id...
 3   product_title                        str             Glimakra Warping Board (8m)
 4   product_summary                      str             The Glimakra Warping Board is designed f...
 5   product_name                         str             Warping Board
 6   product_identifier                   ndarray(0,)     []
 7   brand                                str             CST
 8   intended_industries                  ndarray(1,)     [