In [1]:
import pandas as pd

from src.path import DataPaths
from src.process_columns import clean_columns
from src.merge import merge_dataframe_rows

from tools.type_check import print_detailed_info
from tools.save_data import export_dataframe

# Load the original dataset from the parquet file whose location is configured
# on DataPaths; every later cell that uses `df` starts from this frame.
df = pd.read_parquet(DataPaths.file_parquet_original)


In [2]:
# Run the project's column-cleaning step and rebind `df` to its result.
# NOTE(review): because `df` is overwritten, re-running this cell feeds already
# cleaned data back into clean_columns — confirm that helper is idempotent.
df = clean_columns(df)
# Print a per-column summary (row count, column count, actual value type and a
# preview value for each column), as seen in the recorded output.
print_detailed_info(df)

Original dataset: 21,946 rows
Number of columns in the DataFrame: 27
Data columns (total 27 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   unspsc                               str             Sewing and stitchery and weaving equipme...
 1   root_domain                          str             studio-atcoat.com
 2   page_url                             str             https://studio-atcoat.com/1372696759/?id...
 3   product_title                        str             Glimakra Warping Board (8m)
 4   product_identifier                   ndarray(0,)     []
 5   brand                                str             CST
 6   intended_industries                  ndarray(1,)     ['Textile']
 7   applicability                        ndarray(1,)     ['use with floor looms']
 8   eco_friendly                         bool            True
 9   ethical_and_

In [3]:
# The merge of the main frame on `product_title` is disabled in this cell; the
# same merge is run on a freshly loaded copy (`df_test1`) in a later cell.
# df = merge_dataframe_rows(df, 'product_title')

In [4]:
from src.merge import merge_root_domain

# Build a small, independent fixture: the `root_domain` column of the first ten
# rows, copied so later changes to `test_df` cannot touch `df`.
test_df = df[['root_domain']].head(10).copy()

# Summarize the fixture, then display its rows as the cell output.
print_detailed_info(test_df)
test_df.head(10)

Original dataset: 10 rows
Number of columns in the DataFrame: 1
Data columns (total 1 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   root_domain                          str             studio-atcoat.com


Unnamed: 0,root_domain
0,studio-atcoat.com
1,worm-gears.net
2,customcarcoverco.com
3,plumbmaster.com
4,sogno.in
5,plumbmaster.com
6,advancedpressuresystems.ca
7,armstrongmedical.com
8,advancedpressuresystems.ca
9,workwonderly.com


In [5]:
# Merge the fixture's rows on `root_domain` and rebind `test_df` to the result.
# In the recorded run the 10 input rows (with two repeated domains) come back as
# 8 rows listed in alphabetical domain order.
test_df = merge_dataframe_rows(test_df, 'root_domain')
# Summarize the merged fixture, then display it as the cell output.
print_detailed_info(test_df)
test_df.head(10)

Original dataset: 8 rows
Number of columns in the DataFrame: 1
Data columns (total 1 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   root_domain                          str             advancedpressuresystems.ca


Unnamed: 0,root_domain
0,advancedpressuresystems.ca
1,armstrongmedical.com
2,customcarcoverco.com
3,plumbmaster.com
4,sogno.in
5,studio-atcoat.com
6,workwonderly.com
7,worm-gears.net


In [6]:
# Synthetic check of merge_dataframe_rows: two rows share the title
# 'Test Product' but come from different domains, a third row is unique.
import pandas as pd
from src.merge import merge_dataframe_rows

test_data = {
    'product_title': ['Test Product', 'Test Product', 'Another Product'],
    'root_domain': ['domain1.com', 'domain2.com', 'domain3.com'],
    'page_url': ['https://domain1.com/prod', 'https://domain2.com/prod', 'https://domain3.com/other'],
}
mini_df = pd.DataFrame(test_data)

print("Original test data:")
print(mini_df)

# Merge on the shared title; the rows with the same title should collapse.
result_df = merge_dataframe_rows(mini_df, 'product_title')

print("\nAfter merging:")
print(result_df)

# Walk the merged rows and show each value together with its Python type, so it
# is visible how the differing domain/URL values were combined.
print("\nDetailed result structure:")
separator = "-" * 40
for _, row in result_df.iterrows():
    domain, url = row['root_domain'], row['page_url']
    print(
        f"Product: {row['product_title']}",
        f"Domain: {domain} (type: {type(domain)})",
        f"URL: {url} (type: {type(url)})",
        separator,
        sep="\n",
    )

Original test data:
     product_title  root_domain                   page_url
0     Test Product  domain1.com   https://domain1.com/prod
1     Test Product  domain2.com   https://domain2.com/prod
2  Another Product  domain3.com  https://domain3.com/other

After merging:
     product_title                root_domain  \
0  Another Product                domain3.com   
1     Test Product  domain1.com | domain2.com   

                                            page_url  
0                          https://domain3.com/other  
1  https://domain1.com/prod | https://domain2.com...  

Detailed result structure:
Product: Another Product
Domain: domain3.com (type: <class 'str'>)
URL: https://domain3.com/other (type: <class 'str'>)
----------------------------------------
Product: Test Product
Domain: domain1.com | domain2.com (type: <class 'str'>)
URL: https://domain1.com/prod | https://domain2.com/prod (type: <class 'str'>)
----------------------------------------


In [7]:
# Full-dataset check: merge every row on `product_title`, time the merge, and
# show a few merged records that span more than one domain.
import time

# Reload and clean from disk so this cell does not depend on the state of `df`.
df_test1 = pd.read_parquet(DataPaths.file_parquet_original)
df_test1 = clean_columns(df_test1)

print(f"Original dataset: {len(df_test1):,} rows")

# keep=False flags every member of a duplicate group, not just the repeats, so
# this counts all rows that are candidates for merging.
duplicate_mask = df_test1.duplicated(subset=['product_title'], keep=False)
duplicate_count = duplicate_mask.sum()
print(f"Rows with duplicate product_title: {duplicate_count:,} ({duplicate_count/len(df_test1):.2%} of total)")

# Time only the merge itself. perf_counter is a monotonic, high-resolution
# clock, so the interval cannot be skewed by system clock adjustments.
start_time = time.perf_counter()
merged_df = merge_dataframe_rows(df_test1, 'product_title')
end_time = time.perf_counter()
processing_time = end_time - start_time

rows_removed = len(df_test1) - len(merged_df)
print(f"Merged dataset: {len(merged_df):,} rows")
print(f"Rows reduced: {rows_removed:,} ({rows_removed/len(df_test1):.2%} of original)")
print(f"Processing time: {processing_time:.2f} seconds")

print("\nSample of merged records with multiple domains:")
# Merged values are joined with ' | ', so a literal substring match finds them.
# regex=False avoids the invalid '\|' escape sequence the previous pattern
# needed; na=False keeps the mask strictly boolean if a domain is missing.
merged_domains_mask = merged_df['root_domain'].str.contains(' | ', regex=False, na=False)
merged_sample = merged_df[merged_domains_mask].head(3)

if merged_sample.empty:
    print("No records with merged domains found in sample.")
else:
    for _, row in merged_sample.iterrows():
        print(f"\nProduct: {row['product_title']}")
        print(f"Domains: {row['root_domain']}")
        print(f"URLs: {row['page_url']}")
        print("-" * 60)

  merged_domains_mask = merged_df['root_domain'].str.contains(' \| ', regex=True)


Original dataset: 21,946 rows
Rows with duplicate product_title: 4,422 (20.15% of total)
Logged 2 rows with merge errors to E:\veridion_deduplication\data\error\merge_errors.csv
Merged dataset: 18,954 rows
Rows reduced: 2,992 (13.63% of original)
Processing time: 5.95 seconds

Sample of merged records with multiple domains:

Product: 4x4 Tyres
Domains: laws-tyres.com | peterboroughmobiletyre.com
URLs: https://www.laws-tyres.com/ | https://www.peterboroughmobiletyre.com/4x4-tyres
------------------------------------------------------------

Product: Accessories
Domains: armytrucktom.nl | bothwellfarmsupplies.com | cdivalve.com | cutter-shop.com | frescowindows.com | gefranonline.net | hugoandtedltd.com | intellitecmv.com | lloydpans.com | millargb.com | plumbmaster.com
URLs: http://bothwellfarmsupplies.com/index.html | https://armytrucktom.nl/ | https://cutter-shop.com/shop/ | https://frescowindows.com/ | https://lloydpans.com/help/shipping | https://millargb.com/en/ | https://www.cdiva