### Importing pandas and the supporting libraries (NLTK, scikit-learn) used to manipulate the structured data

In [9]:
import nltk

# Download the NLTK stopwords corpus (when it is already installed the call
# only prints an "up-to-date" status message and returns True).
# NOTE(review): none of the cells visible in this notebook read the stopwords
# corpus afterwards — confirm this download is still needed.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/haniyakhan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [10]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
# Load the product catalog workbook into a DataFrame.
df2 = pd.read_excel(io='catalog.xlsx')

In [12]:
# Pull the English product-name column out as a plain Python list,
# keeping the catalog's row order.
product_names = df2.loc[:, 'product_name_english'].to_list()

In [13]:
# Master codes as a plain list, positionally aligned with the catalog rows.
master_codes = df2.loc[:, 'encrypt_master_code'].to_list()

In [14]:
# Initialize the TF-IDF vectorizer with its default settings: no arguments are
# passed, so in particular no stop-word list is supplied here.
vectorizer = TfidfVectorizer()

In [15]:
# Learn the vocabulary from the product names and vectorize them in one pass.
tfidf_matrix = vectorizer.fit_transform(raw_documents=product_names)

In [16]:
# Pairwise cosine similarity of every product-name vector against every other.
# With a single argument, cosine_similarity compares the matrix with itself.
# NOTE(review): cosine_sim is not referenced by any later cell visible here.
cosine_sim = cosine_similarity(tfidf_matrix)

In [17]:
# Group catalog rows by exact product title. Each title maps to the row
# positions where it occurs and the master codes found on those rows.
duplicate_indices = {}
for row, title in enumerate(product_names):
    # setdefault creates the bucket on the first occurrence of a title and
    # returns the existing bucket on every later occurrence.
    bucket = duplicate_indices.setdefault(title, {'indices': [], 'master_codes': []})
    bucket['indices'].append(row)
    bucket['master_codes'].append(master_codes[row])


In [18]:
# One report row per title that occurs more than once: the repeated titles,
# their master codes (stringified) and the number of occurrences.
output_data = [
    {
        'Duplicates': ', '.join(product_names[pos] for pos in group['indices']),
        'Master Codes': ', '.join(map(str, group['master_codes'])),
        'Count': len(group['indices']),
    }
    for group in duplicate_indices.values()
    if len(group['indices']) > 1  # titles seen only once are not duplicates
]

In [19]:
# Build the report DataFrame from the collected rows.
# The columns are listed explicitly so that an empty `output_data` (no
# duplicate titles found) still yields a frame with the expected headers
# rather than a column-less one. For non-empty input the columns and their
# order are the same as the dictionary keys used when the rows were built.
output_df = pd.DataFrame(output_data, columns=['Duplicates', 'Master Codes', 'Count'])

In [20]:
# Write the duplicate report to an Excel workbook, omitting the row index.
output_df.to_excel(
    excel_writer='duplicate_products_with_master_codes_and_count.xlsx',
    index=False,
)