### Importing pandas to manipulate the structured data, plus NLTK n-grams and scikit-learn TF-IDF / cosine-similarity utilities

In [1]:
import pandas as pd
from nltk import ngrams
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

### Loading the data into a dataframe

In [2]:
# Read the catalog spreadsheet into a dataframe
original_df = pd.read_excel(io='catalog.xlsx')

### Extracting all rows belonging to a single category and storing the original catalog indexes of those entries

In [4]:
# Boolean mask selecting the catalog rows that belong to the chosen category
category_mask = original_df['category'] == 'Snacks & Confectionery'

# Independent copy of the matching rows; at this point it still carries the
# original catalog index labels
df2 = original_df.loc[category_mask].copy()

# Remember those original catalog index labels before they are discarded
catalog_rows = df2.index.tolist()

# Renumber the subset 0..n-1, then expose the original catalog positions
# as the first column
df2 = df2.reset_index(drop=True)
df2.insert(0, 'RowNumber', catalog_rows)

In [12]:


    # Normalise the 'product_name_english' column of `df2` in place.
    # NOTE(review): the `def` line and the `return` of this cleaning routine are
    # not visible in this cell; the statements below assume `df2` is in scope --
    # confirm against the full function.

    # Ordered (pattern, replacement) rules. Order matters: e.g. the "<digits>s"
    # rule has to run before digits are split from the letters that follow them.
    cleaning_rules = [
        # Drop everything from the first '-00' to the end of the title
        (r'-00.*', ''),
        # Remove non-alphanumeric characters, except whitespace, percent and ampersand
        (r'[^\w\s%&]', ''),
        # '<digits>s' means a piece count; the trailing \b keeps words that merely
        # start with 's' (e.g. '6sachets') from being rewritten to '6 piecesachets'
        (r'(\d+)s\b', r'\1 pieces'),
        # Add a space between a digit and the alphabetic character that follows it
        (r'(\d)(?=[a-zA-Z])', r'\1 '),
        # Remove stand-alone numbers made of five or more digits
        (r'\b\d{5,}\b', ''),
        # Expand the abbreviations 'pcs' / 'pc' / 'choc' when they end a word
        (r'pcs\b', 'pieces'),
        (r'pc\b', 'pieces'),
        (r'choc\b', 'chocolate'),
        # Expand 'xtra' to 'extra'; the lookbehind skips words that already read
        # 'extra', which would otherwise become 'eextra'
        (r'(?<!e)xtra\b', 'extra'),
        # Remove the whole words 'imp' and 'basic'
        (r'\bimp\b|\bbasic\b', ''),
        # Collapse any run of whitespace into a single space
        (r'\s+', ' '),
    ]

    # Lowercase first so every rule above only has to deal with lowercase text
    cleaned_names = df2['product_name_english'].str.lower()
    for pattern, replacement in cleaning_rules:
        cleaned_names = cleaned_names.str.replace(pattern, replacement, regex=True)

    # Strip last: the removals above can leave a leading or trailing space behind
    df2['product_name_english'] = cleaned_names.str.strip()

    

### Calling the data cleaning function to store the relevant information from the filtered dataframe

In [13]:
# Run the cleaning routine and unpack the three values it returns.
# NOTE(review): `filtered_df` is not assigned in any visible cell (the category
# subset built earlier is named `df2`), and the `def` line of dataCleaningSandC
# is not visible either -- confirm both so the notebook survives Restart & Run All.
resultDf, catalogIndex, originalNames = dataCleaningSandC(filtered_df)

### Writing the cleaned titles to a text file, one per line, in a format suitable for manual annotation
#### Refer to the README file in the GitHub repo for further details on the annotation

In [14]:
# Pull the cleaned product titles out of the result frame
column_data = resultDf['product_name_english']

# Dump the titles to a text file, one cleaned entry per line
with open('outputSandC.txt', 'w') as out_file:
    out_file.writelines(str(title) + '\n' for title in column_data)

### Importing the shuffling functions used to randomize the order of the cleaned titles, so that the data is varied and has no ordering pattern

In [15]:
from shuffleEntries import shuffleData, writeShuffleData

### This will take the unshuffled file and store all the shuffled entries in a new file

In [16]:
# Pass the path of the unshuffled titles file to shuffleData and keep its result
shuffledData = shuffleData("outputSandC.txt")
# Hand that result to writeShuffleData; presumably it writes the shuffled entries
# to a new file (per the heading above) -- the destination is not visible here
writeShuffleData(shuffledData)