In [1]:
import pandas as pd

from src.path import DataPaths
from src.merge import merge_dataframe_rows

from tools.type_check import print_detailed_info
from tools.save_data import export_dataframe



# Load the original dataset from the parquet file configured in DataPaths.
# `df` is the raw frame that the merge steps below start from.
df = pd.read_parquet(DataPaths.file_parquet_original)
# Print an overview of the frame: row/column counts plus, per column,
# the actual Python type and a short preview of a value.
print_detailed_info(df)

Original dataset: 21,946 rows
Number of columns in the DataFrame: 31
Data columns (total 31 columns):
 #   Column                               Actual type     Preview
---  ------------------------------------ ---------------- --------------------------------------------------
 0   unspsc                               str             Sewing and stitchery and weaving equipme...
 1   root_domain                          str             studio-atcoat.com
 2   page_url                             str             https://studio-atcoat.com/1372696759/?id...
 3   product_title                        str             Glimakra Warping Board (8m)
 4   product_summary                      str             The Glimakra Warping Board is designed f...
 5   product_name                         str             Warping Board
 6   product_identifier                   ndarray(0,)     []
 7   brand                                str             CST
 8   intended_industries                  ndarray(1,)     [

In [2]:
# Preview the first 10 rows of the raw frame before any merging is applied.
df.head(10)

Unnamed: 0,unspsc,root_domain,page_url,product_title,product_summary,product_name,product_identifier,brand,intended_industries,applicability,...,form,size,color,purity,energy_efficiency,pressure_rating,power_rating,quality_standards_and_certifications,miscellaneous_features,description
0,Sewing and stitchery and weaving equipment and...,studio-atcoat.com,https://studio-atcoat.com/1372696759/?idx=510,Glimakra Warping Board (8m),The Glimakra Warping Board is designed for use...,Warping Board,[],,[Textile],[use with floor looms],...,[],"[{'dimension': 'Length', 'qualitative': False,...",[],[],,[],[],[],[],"The ""Warping Board"" is designed for use with f..."
1,Electric alternating current AC motors,worm-gears.net,https://worm-gears.net/tag/worm-gear-box/,NMRV Worm Gearbox Motor,The NMRV Worm Gearbox Motor is a high-efficien...,Worm Gearbox Motor,[],,[Industrial],[industrial applications],...,[],[],"[{'original': 'Blue', 'simple': 'Blue'}, {'ori...",[],,[],"[{'qualitative': False, 'type': 'min', 'unit':...",[],"[Omnibearing installation, High radiation effi...","The ""Worm Gearbox Motor"" is a high-efficiency ..."
2,Vehicle trim and exterior covering,customcarcoverco.com,https://customcarcoverco.com/collections/vendo...,Nissan R33 GTR Car Cover,A custom car cover designed for the Nissan R33...,Car Cover,[],,[Automotive],[protecting vehicles from the elements],...,[],[],[],[],,[],[],[],"[Personalization with custom brand logos, grap...","The ""Car Cover"" is a custom-designed cover tai..."
3,Pipe connectors,plumbmaster.com,https://www.plumbmaster.com/search?q=wolverine...,Flexible Fittings,"Flexible fittings for plumbing applications, a...",Flexible Fittings,[],,[Plumbing],[plumbing installations],...,[],[],[],[],,[],[],[],"[allows for movement, flexibility in installat...","""Flexible Fittings"" are designed for plumbing ..."
4,Doors,sogno.in,http://www.sogno.in/product-detail-CST-HGD-331...,CST-HGD-33103 Hinged Closet Door,The CST-HGD-33103 Hinged Closet Door is a meti...,Hinged Closet Door,[],CST,"[Home Appliances, Construction]",[Closet Storage],...,[],[],[],[],,[],[],[],"[Italian craftsmanship, German engineering, Sm...","The ""Hinged Closet Door"" is a storage solution..."
5,Faucets or taps,plumbmaster.com,https://www.plumbmaster.com/search?q=wolverine...,Deep Faucets,"Faucets with a deep design, providing a secure...",Deep Faucets,[],,[Plumbing],[],...,[],[],[],[],,[],[],[],"[deep design, secure and stable connection]","""Deep Faucets"" are designed with a deep design..."
6,Dispensing tools,advancedpressuresystems.ca,https://advancedpressuresystems.ca/collections...,10K Dry Shut-Off Gun Handle Assembly,The 10K Dry Shut-Off Gun Handle Assembly is a ...,Dry Shut-Off Gun Handle Assembly,[],,[Manufacturing],[High-Pressure Water Blasting Operations],...,[],[],[],[],,"[{'qualitative': True, 'type': 'exact', 'unit'...",[],[],[],"The ""Dry Shut-Off Gun Handle Assembly"" is a co..."
7,Medical facility materials handling and distri...,armstrongmedical.com,https://www.armstrongmedical.com//cart-systems...,Cranberry Cart Systems,Cranberry Cart Systems are part of the Armstro...,Cranberry Cart Systems,[],Armstrong Medical,[Healthcare],[medical and healthcare applications],...,[],[],"[{'original': 'Cranberry', 'simple': 'Red'}]",[],,[],[],[],[vibrant and eye-catching look],"""Cranberry Cart Systems"" from the Armstrong Me..."
8,Pneumatic tools,advancedpressuresystems.ca,https://advancedpressuresystems.ca/collections...,10K Air Operated Control Gun,An air operated control gun designed for water...,Air Operated Control Gun,[],,[Manufacturing],[water blasting operations],...,[],"[{'dimension': None, 'qualitative': True, 'typ...",[],[],,[],[],[],[],"The ""Air Operated Control Gun"" is designed for..."
9,Tshirts,workwonderly.com,https://www.workwonderly.com/tags/Medicine/col...,5 THINGS YOU SHOULD KNOW ABOUT MY NURSE PRACTI...,A long sleeve tee with the message '5 THINGS Y...,Long Sleeve Tee,[],,"[Fashion, Retail]",[Wearing],...,[],[],"[{'original': 'Blue', 'simple': 'Blue'}, {'ori...",[],,[],[],[],[Long sleeve],"The ""Long Sleeve Tee"" is a long sleeve t-shirt..."


In [3]:
# Step 1: merge rows that share the same page_url and product_title.
#
# The composite key is built on a keyed copy (DataFrame.assign) instead of being
# written into `df`, so the raw frame is never mutated and this cell gives the
# same result no matter how often, or in which order, it is re-run.
# NOTE(review): if page_url or product_title can be missing, the concatenated key
# is NaN for that row and duplicated() treats NaN keys as equal to each other --
# confirm how merge_dataframe_rows handles such rows.
keyed_df = df.assign(key=df['page_url'] + '|' + df['product_title'])

# Print initial number of rows
initial_rows = len(keyed_df)
print(f"Initial number of rows: {initial_rows}")

# Find the duplicate rows based on the key column.
# keep=False flags every member of a duplicated group, not just the repeats.
duplicated_mask = keyed_df.duplicated(subset=['key'], keep=False)
duplicates_df = keyed_df[duplicated_mask].copy()
duplicate_count = len(duplicates_df)
# Guard the ratio so an empty input reports 0% instead of raising ZeroDivisionError.
duplicate_share = duplicate_count / initial_rows if initial_rows else 0.0
print(f"Number of duplicate rows identified: {duplicate_count} ({duplicate_share:.2%} of total)")

export_dataframe(duplicates_df, DataPaths.visualization_merge_url_title_dir, 'duplicates_before_merge', file_format='csv')

# Merge the dataframe rows based on the key
merged_df = merge_dataframe_rows(keyed_df, key_column='key')

# Before dropping the key column, pick out the merged rows whose key was
# duplicated in the input, so the before/after exports can be compared.
merged_duplicates = merged_df[merged_df['key'].isin(duplicates_df['key'].unique())].copy()

# Calculate how many rows were removed through merging
merged_rows = len(merged_df)
rows_removed = initial_rows - merged_rows
percent_removed = rows_removed / initial_rows if initial_rows else 0.0

print(f"Rows after merging: {merged_rows}")
print(f"Rows removed through merging: {rows_removed} ({percent_removed:.2%} of total)")

# Now it's safe to drop the helper key column from both result frames
merged_df = merged_df.drop(columns=['key'])
merged_duplicates = merged_duplicates.drop(columns=['key'])

print(f"Number of previously duplicate rows after merging: {len(merged_duplicates)}")

export_dataframe(merged_duplicates, DataPaths.visualization_merge_url_title_dir, 'duplicates_after_merge', file_format='csv')
export_dataframe(merged_df, DataPaths.visualization_merge_url_title_dir, 'merged_url_title', file_format='csv')
export_dataframe(merged_df, DataPaths.visualization_merge_url_title_dir, 'merged_url_title', file_format='parquet')

Initial number of rows: 21946
Exported data to: E:\veridion_deduplication\data\visualization\processed\2_merge_url_title\duplicates_before_merge.csv
Rows after merging: 21937
Rows removed through merging: 9 (0.04% of total)
Number of previously duplicate rows after merging: 9
Exported data to: E:\veridion_deduplication\data\visualization\processed\2_merge_url_title\duplicates_after_merge.csv
Exported data to: E:\veridion_deduplication\data\visualization\processed\2_merge_url_title\merged_url_title.csv
Exported data to: E:\veridion_deduplication\data\visualization\processed\2_merge_url_title\merged_url_title.snappy.parquet


WindowsPath('E:/veridion_deduplication/data/visualization/processed/2_merge_url_title/merged_url_title.snappy.parquet')

In [4]:
# Step 2: merge rows that share the same product_title and root_domain.
#
# NOTE(review): this step starts again from the raw `df`, not from the result of
# the page_url/product_title merge -- confirm that running the two merges
# independently (rather than chaining them) is intended.
print("\n" + "="*50)
print("STEP 2: Merging by product_title and root_domain")
print("="*50)

# Create the composite key on a keyed copy (DataFrame.assign) rather than writing
# it into `df`: the raw frame stays untouched and any `key` column left behind by
# another cell is replaced in the copy only, so the cell is safe to re-run.
keyed_df = df.assign(key=df['product_title'] + '|' + df['root_domain'])

# Print initial number of rows
initial_rows = len(keyed_df)
print(f"Initial number of rows: {initial_rows}")

# Find the duplicate rows based on the new key column.
# keep=False flags every member of a duplicated group, not just the repeats.
duplicated_mask = keyed_df.duplicated(subset=['key'], keep=False)
duplicates_df = keyed_df[duplicated_mask].copy()
duplicate_count = len(duplicates_df)
# Guard the ratio so an empty input reports 0% instead of raising ZeroDivisionError.
duplicate_share = duplicate_count / initial_rows if initial_rows else 0.0
print(f"Number of duplicate rows identified: {duplicate_count} ({duplicate_share:.2%} of total)")

export_dataframe(duplicates_df, DataPaths.visualization_merge_title_domain_dir, 'duplicates_before_merge', file_format='csv')

# Merge the dataframe rows based on the new key
merged_df = merge_dataframe_rows(keyed_df, key_column='key')

# Before dropping the key column, pick out the merged rows whose key was
# duplicated in the input, so the before/after exports can be compared.
merged_duplicates = merged_df[merged_df['key'].isin(duplicates_df['key'].unique())].copy()

# Calculate how many rows were removed through merging
merged_rows = len(merged_df)
rows_removed = initial_rows - merged_rows
percent_removed = rows_removed / initial_rows if initial_rows else 0.0

print(f"Rows after merging: {merged_rows}")
print(f"Rows removed through merging: {rows_removed} ({percent_removed:.2%} of total)")

# Now it's safe to drop the helper key column from both result frames
merged_df = merged_df.drop(columns=['key'])
merged_duplicates = merged_duplicates.drop(columns=['key'])

print(f"Number of previously duplicate rows after merging: {len(merged_duplicates)}")

export_dataframe(merged_duplicates, DataPaths.visualization_merge_title_domain_dir, 'duplicates_after_merge', file_format='csv')
export_dataframe(merged_df, DataPaths.visualization_merge_title_domain_dir, 'merged_title_domain', file_format='csv')
export_dataframe(merged_df, DataPaths.visualization_merge_title_domain_dir, 'merged_title_domain', file_format='parquet')


STEP 2: Merging by product_title and root_domain
Initial number of rows: 21946
Number of duplicate rows identified: 4257 (19.40% of total)
Exported data to: E:\veridion_deduplication\data\visualization\processed\3_merge_title_domain\duplicates_before_merge.csv
Logged 2 rows with merge errors to E:\veridion_deduplication\data\error\merge_errors.csv
Rows after merging: 19054
Rows removed through merging: 2892 (13.18% of total)
Number of previously duplicate rows after merging: 1365
Exported data to: E:\veridion_deduplication\data\visualization\processed\3_merge_title_domain\duplicates_after_merge.csv
Exported data to: E:\veridion_deduplication\data\visualization\processed\3_merge_title_domain\merged_title_domain.csv
Exported data to: E:\veridion_deduplication\data\visualization\processed\3_merge_title_domain\merged_title_domain.snappy.parquet


WindowsPath('E:/veridion_deduplication/data/visualization/processed/3_merge_title_domain/merged_title_domain.snappy.parquet')