### Importing the libraries: pandas to manipulate the structured data, plus NLTK and scikit-learn text-similarity helpers

In [1]:
import pandas as pd
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the cleaned product sheet; the path is relative to the notebook's working directory.
INPUT_PATH = 'babyCare_cleaned.xlsx'
df2 = pd.read_excel(INPUT_PATH)

# Fail fast if the sheet lacks the columns the duplicate search reads, so a
# schema mismatch surfaces here with a clear message instead of as a bare
# KeyError inside the comparison loop.
_missing_columns = {'product_name_english', 'RowNumber'} - set(df2.columns)
if _missing_columns:
    raise KeyError(f"{INPUT_PATH} is missing required columns: {sorted(_missing_columns)}")

In [3]:
# Jaccard similarity over character n-grams, used to flag near-duplicate product names.
def _ngram_set(text, n):
    """Return the set of contiguous length-``n`` windows of ``text`` as tuples.

    ``None`` and float NaN (how pandas typically represents an empty cell)
    yield an empty set instead of raising.
    """
    if text is None or (isinstance(text, float) and text != text):
        return set()
    # Slicing needs a real sequence; materialise any other iterable first.
    sequence = text if isinstance(text, (str, list, tuple)) else list(text)
    # Zipping n staggered slices yields every full window and stops before the
    # first incomplete one, so inputs shorter than n give an empty set.
    return set(zip(*(sequence[offset:] for offset in range(n))))


def jaccard_similarity(str1, str2, n=3):
    """Compute the Jaccard similarity of two strings over their character n-grams.

    Args:
        str1: First string (``None``/NaN is treated as having no n-grams).
        str2: Second string (``None``/NaN is treated as having no n-grams).
        n: Window length of the n-grams; defaults to 3 (trigrams).

    Returns:
        ``|A ∩ B| / |A ∪ B|`` as a float in ``[0.0, 1.0]``, where A and B are
        the n-gram sets of the inputs. Returns ``0.0`` when neither input has
        any n-gram (both missing or both shorter than ``n``), which also
        avoids a division by zero.

    Raises:
        ValueError: If ``n`` is smaller than 1.
    """
    if n < 1:
        raise ValueError("n must be a positive integer")

    grams1 = _ngram_set(str1, n)
    grams2 = _ngram_set(str2, n)

    shared = len(grams1 & grams2)
    # Inclusion-exclusion gives the union size without building the union set.
    total = len(grams1) + len(grams2) - shared
    return shared / total if total else 0.0

In [4]:
# Two product names count as duplicates when their similarity is strictly above this cut-off.
threshold = 0.8

In [5]:
# Maps each flagged product_name_english to the set of RowNumber values it was matched with.
product_master_dict = dict()

In [6]:
# Compare every unordered pair of product names once and, for each pair whose
# similarity exceeds the threshold, record both RowNumber values under both names.
#
# The two columns are pulled into plain lists up front: list access is
# positional (it does not depend on the DataFrame's index labels) and avoids
# a pandas Series lookup on every iteration of the quadratic loop. tolist()
# also yields built-in Python scalars for the stored row numbers.
product_names = df2['product_name_english'].tolist()
row_numbers = df2['RowNumber'].tolist()

for i, name_i in enumerate(product_names):
    for j in range(i + 1, len(product_names)):
        name_j = product_names[j]
        if jaccard_similarity(name_i, name_j) > threshold:
            matched_rows = (row_numbers[i], row_numbers[j])
            # Each name in the pair is associated with the RowNumber of both rows.
            product_master_dict.setdefault(name_i, set()).update(matched_rows)
            product_master_dict.setdefault(name_j, set()).update(matched_rows)



In [7]:
# One row per flagged product name, paired with its set of matched RowNumber values.
name_rows_pairs = list(product_master_dict.items())
df3 = pd.DataFrame(name_rows_pairs, columns=['Product', 'RowNumber'])

In [8]:
# Display the flagged product names alongside their matched RowNumber sets.
df3

Unnamed: 0,Product,RowNumber
0,pigeon feeding bottle flexible soft elastic 4m...,"{31563, 292}"
1,johnsons baby shampoo 750ml,"{13176, 32238}"
2,nestle nan-2 optipro 300gm,"{15201, 33076}"
3,nestle nan-2 optipro 300gm box,"{15201, 33076}"
4,philips avent baby soother ultra soft 0-6m -,"{31038, 31022}"
...,...,...
126,cool & cool baby wipes ultra soft and gentle 8...,"{34529, 35268}"
127,philips avent teats flow natural 0m+ 2 pc,"{34946, 34558}"
128,philips avent teats flow natural 1m+ 2 pc -,"{34946, 34558}"
129,johnsons baby cologne brisa 100 ml -,"{35740, 35453}"


In [9]:
# Freeze each RowNumber set so it becomes hashable and can serve as a grouping key.
product_master_dict_frozen = {
    product_name: frozenset(matched_rows)
    for product_name, matched_rows in product_master_dict.items()
}


In [10]:
# Rebuild the name / row-set table, this time with the hashable frozensets in 'Values'.
frozen_pairs = list(product_master_dict_frozen.items())
df3 = pd.DataFrame(frozen_pairs, columns=['Product', 'Values'])

In [11]:
# For every distinct set of matched row numbers, collect the product names that share it.
grouped_df = (
    df3.groupby('Values')['Product']
    .apply(list)
    .reset_index(name='Grouped_Products')
)


In [12]:
# Turn the frozenset group keys back into ordinary sets.
values_as_sets = grouped_df['Values'].apply(set)
grouped_df['Values'] = values_as_sets


In [13]:
# Leave the frame as the cell's last expression so the notebook renders it as a
# rich table instead of plain printed text.
grouped_df

                   Values                                   Grouped_Products
0            {31563, 292}  [pigeon feeding bottle flexible soft elastic 4...
1          {32683, 34383}  [aveeno baby body wash daily care sensitive sk...
2          {32969, 32677}                            [pigeon baby bottle  -]
3          {32617, 35638}                     [pigeon baby feeding bottle -]
4          {32558, 34415}  [pigeon baby liquid soap 200ml, pigeon baby li...
..                    ...                                                ...
76  {32080, 34450, 31630}  [philips avent baby feeding bottle natural 260...
77         {31620, 35373}  [johnsons blossom baby powder 200gm thai, john...
78         {34220, 31614}              [nestle nan grow-3 optipro 600gm box]
79         {32803, 32269}  [nestle lactogrow  3 milk powder gentle grow 4...
80         {35740, 35453}  [johnsons baby cologne brisa 100 ml -, johnson...

[81 rows x 2 columns]


In [14]:
# Write the grouped duplicate titles to an Excel workbook, leaving out the DataFrame index.
output_path = 'babyCare_duplicateTitles.xlsx'
grouped_df.to_excel(output_path, index=False)