### Importing pandas to manipulate the structured data, plus the NLTK and scikit-learn text-similarity helpers

In [42]:
import pandas as pd
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [43]:
# Load the product sheet; the later cells read its 'product_name_english'
# and 'encrypt_master_code' columns.
df2 = pd.read_excel(io='babyCare_cleaned.xlsx')

In [44]:
# Step 2: Define a function to calculate Jaccard similarity between the
# character n-gram sets of two strings
def jaccard_similarity(str1, str2, n=3):
    """Return the Jaccard similarity of the character n-gram sets of two strings.

    Each string is broken into its set of overlapping character n-grams
    (trigrams by default) and the score is |A & B| / |A | B|.

    Parameters
    ----------
    str1, str2 : str
        The strings to compare. Anything that is not a ``str`` (for example
        ``None`` or the float ``NaN`` pandas uses for a blank cell) is treated
        as "no information" and scores 0.0 instead of raising.
    n : int, default 3
        Length of the character n-grams.

    Returns
    -------
    float
        A value in [0.0, 1.0]. It is 0.0 when neither string yields any
        n-gram (both shorter than ``n``), which also avoids dividing by zero.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    # Missing / non-text values cannot be compared; report "not similar"
    # rather than crashing the caller's pairwise loop.
    if not isinstance(str1, str) or not isinstance(str2, str):
        return 0.0

    # Slicing yields the same n-grams as nltk.ngrams (without padding) does
    # for a string, so the score is unchanged for valid string inputs.
    set1 = {str1[i:i + n] for i in range(len(str1) - n + 1)}
    set2 = {str2[i:i + n] for i in range(len(str2) - n + 1)}

    union = len(set1 | set2)
    if union == 0:
        return 0.0  # Avoid division by zero
    return len(set1 & set2) / union

In [45]:
# Products whose similarity score is strictly above this value are
# considered duplicates of each other.
threshold = 0.8

In [46]:
# Maps each product_name_english to the set of Master_codes collected for it
# (its own and those of every product found similar to it).
product_master_dict = dict()

In [47]:
# Iterate over every unordered pair of rows and record the similar ones.
# The two columns are pulled out once, up front: looking them up through
# df2[...] on every iteration of this O(n^2) loop is loop-invariant work, and
# Series[i] is label-based, so it only works while the index is 0..n-1.
# Positional array access avoids both problems.
product_names = df2['product_name_english'].to_numpy()
master_codes = df2['encrypt_master_code'].to_numpy()
row_count = len(product_names)

for i in range(row_count):
    name_i = product_names[i]
    code_i = master_codes[i]
    for j in range(i + 1, row_count):
        name_j = product_names[j]
        similarity = jaccard_similarity(name_i, name_j)
        if similarity > threshold:
            # Store the product_name_english and corresponding Master_codes
            # in the dictionary: both products get both codes.
            pair_codes = (code_i, master_codes[j])
            product_master_dict.setdefault(name_i, set()).update(pair_codes)
            product_master_dict.setdefault(name_j, set()).update(pair_codes)



In [49]:
# Freeze each code set so it becomes hashable and can serve as a group key.
product_master_dict_frozen = dict(
    (product_name, frozenset(code_set))
    for product_name, code_set in product_master_dict.items()
)


In [50]:
# One row per product: its name and the frozen set of Master_codes tied to it.
df3 = pd.DataFrame(
    data=list(product_master_dict_frozen.items()),
    columns=['Product', 'Mastercode'],
)

In [51]:
# Collect, for every distinct set of Master_codes, the list of product names
# that share exactly that set.
grouped_df = (
    df3
    .groupby('Mastercode')['Product']
    .apply(list)
    .reset_index(name='Duplicated_Products')
)


In [52]:
# Turn the frozenset group keys back into plain sets
# (presumably for a cleaner repr in the exported sheet -- TODO confirm).
grouped_df = grouped_df.assign(Mastercode=grouped_df['Mastercode'].apply(set))


In [54]:
# Write the duplicate-title groups to a workbook, leaving out the row index.
grouped_df.to_excel(
    excel_writer='babyCare_duplicateTitles.xlsx',
    index=False,
)