In [1]:
import pandas as pd
from nltk import ngrams

In [2]:
# Load the cleaned product sheet into a DataFrame (first worksheet by default)
df2 = pd.read_excel(io='snacksAndConfectionary_cleaned.xlsx')

In [3]:
# Jaccard similarity between two product names, computed on character trigrams
def _as_items(value):
    """Turn one input into a tuple of items to build trigrams from.

    Iterables (strings included) are expanded item by item, so a string
    becomes a tuple of its characters. A non-iterable scalar is treated as
    empty when it is a missing value (None, NaN, ...), otherwise its ``str()``
    form is used.
    """
    try:
        return tuple(value)
    except TypeError:
        # Not iterable: a missing spreadsheet cell or a numeric cell
        return () if pd.isna(value) else tuple(str(value))


def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the trigram sets of two inputs.

    Parameters
    ----------
    str1, str2 : str
        The texts to compare. Missing values (None/NaN) are accepted and
        treated as empty text; other non-iterable scalars are compared by
        their ``str()`` representation.

    Returns
    -------
    float
        ``|A ∩ B| / |A ∪ B|`` where A and B are the sets of 3-item windows of
        each input, in the range 0.0 to 1.0. When neither input is long
        enough to contain a trigram, the result is 1.0 if both inputs are
        non-empty and identical, otherwise 0.0.
    """
    items1 = _as_items(str1)
    items2 = _as_items(str2)

    # Sliding windows of length 3; these are the same sets nltk's
    # ngrams(x, 3) would yield, without requiring the input to be iterable.
    set1 = {items1[k:k + 3] for k in range(len(items1) - 2)}
    set2 = {items2[k:k + 3] for k in range(len(items2) - 2)}

    intersection = len(set1 & set2)
    union = len(set1) + len(set2) - intersection

    if union == 0:
        # No trigrams on either side (inputs shorter than 3 items or missing).
        # Identical non-empty short names are still duplicates; two empty or
        # missing values are not.
        return 1.0 if items1 and items1 == items2 else 0.0
    return intersection / union

In [4]:
# Two product names count as duplicates when their similarity is strictly above this value
threshold = 0.8

In [5]:
# Groups of similar products: each entry holds the product titles and master codes of one group
similar_products = dict()

In [6]:
# Compare every unordered pair of rows once. Each row i that has at least one
# later row j scoring above `threshold` gets a group keyed by i, holding its
# own title/code followed by those of every matching later row.
# NOTE(review): a row can appear in several groups (as a member of an earlier
# group and as the key of its own), so groups may overlap.

# Start from an empty mapping so that re-running this cell does not append the
# same matches to the groups a second time.
similar_products.clear()

# Extract both columns once instead of doing a Series lookup for every pair;
# the lists are indexed by row position, independent of the frame's index labels.
product_titles = list(df2['product_name_english'])
master_codes = [str(code) for code in df2['encrypt_master_code']]

for i in range(len(product_titles)):
    for j in range(i + 1, len(product_titles)):
        similarity = jaccard_similarity(product_titles[i], product_titles[j])
        if similarity > threshold:
            group = similar_products.get(i)
            if group is None:
                # First match for row i: seed the group with row i itself
                group = {
                    'Product Titles': [product_titles[i]],
                    'Master Codes': [master_codes[i]]
                }
                similar_products[i] = group
            group['Product Titles'].append(product_titles[j])
            group['Master Codes'].append(master_codes[j])

In [7]:
# Flatten each group into one record with comma-separated titles and codes,
# ready for DataFrame creation. Items are passed through str() so that a
# non-string title (e.g. a numeric cell) cannot make join() raise TypeError;
# for values that are already strings this is a no-op.
similar_products_list = [
    {
        'Product Titles': ', '.join(map(str, group['Product Titles'])),
        'Master Codes': ', '.join(map(str, group['Master Codes'])),
    }
    for group in similar_products.values()
]


In [8]:
# Create a DataFrame from the list of similar products.
# The columns are named explicitly so the frame (and the exported sheet) keeps
# its two headers even when no similar products were found and the list is empty.
similar_products_df = pd.DataFrame(
    similar_products_list,
    columns=['Product Titles', 'Master Codes'],
)

In [9]:
# Write the grouped duplicates to an Excel workbook, leaving out the row index column
similar_products_df.to_excel(
    excel_writer='snacksAndConfectionary_duplicateTitles.xlsx',
    index=False,
)