### Importing pandas for structured-data manipulation, plus the NLTK and scikit-learn utilities for n-grams, TF-IDF and cosine similarity

In [18]:
import pandas as pd
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Loading the data into a dataframe

In [19]:
# Read the catalog workbook into a DataFrame; the path is relative to the notebook's working directory
original_df = pd.read_excel(io='catalog.xlsx')

### Extracting all rows belonging to a single category and storing their original catalog indexes

In [20]:
# Boolean mask marking the catalog rows that belong to the target category
is_baby_care = original_df['category'] == 'Baby Care 👶🏻🍼'

# Independent copy of the matching rows, so later edits never touch original_df
df2 = original_df.loc[is_baby_care].copy()

# Keep each row's original catalog index as the first column; it is captured
# here, before the index is renumbered, so it still refers to original_df
df2.insert(0, 'RowNumber', df2.index.tolist())

# Renumber the category dataframe 0..n-1 and discard the old index
df2 = df2.reset_index(drop=True)

In [21]:
# Column holding the English product names normalised by this cell
NAME_COLUMN = 'product_name_english'

# Character-level rules, applied right after lowercasing (order matters).
CHARACTER_RULES = [
    # '&' wedged directly between two word characters becomes the word 'and';
    # it is padded with spaces so 'a&b' yields 'a and b' rather than 'aandb'
    (r'\b&\b', ' and '),
    # Remove characters other than letters, digits, '&', '%', '-', '+' and space
    (r'[^a-zA-Z0-9&%\-+ ]', ''),
]

# Token-level rules, applied after the trailing-digit code has been removed
# (order matters: later rules see the output of earlier ones).
TOKEN_RULES = [
    # Remove a final word that ends in digits
    (r'\b\w+\d+$', ''),
    # Replace 'pcs' with 'pieces'
    (r'pcs\b', 'pieces'),
    # Replace 'mo' with 'months' when it stands alone or follows a digit
    # ('6mo', '6 mo'); the look-behind stops words that merely end in 'mo'
    # (e.g. 'elmo', 'promo') from being turned into 'elmonths', 'promonths'
    (r'(?<![a-z])mo\b', 'months'),
    # Remove 'imp' or 'basic' only when they are whole words
    (r'\bimp\b|\bbasic\b', ''),
    # Remove standalone runs of four or more digits (digits inside
    # alpha-numeric combinations are left alone)
    (r'\b\d{4,}\b', ''),
    # Remove words made of 'scf' followed by zero or more digits
    (r'\bscf\d*\b', ''),
    # Remove words of two letters + two or more digits + optional one more word character
    (r'\b[A-Za-z]{2}\d{2,}\w?\b', ''),
    # Remove words of one letter + two or more digits + optional one more word character
    (r'\b[A-Za-z]\d{2,}\w?\b', ''),
    # Replace digits followed by 's' with the same digits and 'pieces'
    (r'(\d+)s\b', r'\1 pieces'),
    # Replace 'disp' with 'disposable'
    (r'disp\b', 'disposable'),
    # Replace 'swmpnts' with 'swimpants'
    (r'swmpnts\b', 'swimpants'),
    # Remove 'nf' / 'bf' at the end of a word.
    # NOTE(review): these have no left boundary, so they also trim words that
    # merely end in 'nf'/'bf' - confirm against the data whether that is intended
    (r'nf\b', ''),
    (r'bf\b', ''),
    # Remove the literal fragment 's - t '
    (r's - t ', ''),
]


def clean_product_names(names: pd.Series) -> pd.Series:
    """Return a normalised copy of a Series of product names.

    The names are lowercased, passed through CHARACTER_RULES, have a trailing
    whitespace-separated number removed, are passed through TOKEN_RULES, and
    finally have runs of whitespace collapsed and the ends trimmed.

    Parameters
    ----------
    names : pd.Series
        Product names; entries that are missing stay missing because the
        pandas ``.str`` methods propagate them.

    Returns
    -------
    pd.Series
        A new Series with the same index; the input is not modified.
    """
    # Lowercase first so every later pattern only has to handle lowercase text
    cleaned = names.str.lower()

    for pattern, replacement in CHARACTER_RULES:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    # Trim before the '$'-anchored rule: otherwise a name that ends in
    # whitespace would keep its trailing number
    cleaned = cleaned.str.strip()

    # Remove trailing digits preceded by whitespace, then trim again so the
    # '$'-anchored rule at the head of TOKEN_RULES sees the real last word
    cleaned = cleaned.str.replace(r'\s\d+$', '', regex=True).str.strip()

    for pattern, replacement in TOKEN_RULES:
        cleaned = cleaned.str.replace(pattern, replacement, regex=True)

    # The removals above leave gaps behind; collapse them and trim the ends
    return cleaned.str.replace(r'\s+', ' ', regex=True).str.strip()


# Overwrites the column in place, so re-running this cell re-applies the rules
# to already-cleaned names
df2[NAME_COLUMN] = clean_product_names(df2[NAME_COLUMN])



In [22]:
# Write the cleaned category dataframe to a workbook, leaving out the dataframe index
df2.to_excel(excel_writer='babyCare_cleaned.xlsx', index=False)