In [1]:
#################################################################
# This script is for generating training dataset from 
# original .csv file by standardizing product_type and 
# labelling the first class label by algorithms.
#
# Please make sure the input file path and type is correct
# Also, make sure product_to_all.json and main_to_num.json
# in the same directory(or same path.) Since these two files
# are the standardize information we use and the product label 
# map to number we predefined.
# 
# Author: Luis Lin
# Date: June 27, 2022
#################################################################

In [2]:
import json
import spacy 
from difflib import SequenceMatcher
from typing import List
import pandas as pd
from string_grouper import match_strings, match_most_similar
import numpy as np
import text_cleaner as tc
import re
#the main_categories with all sub_categories dict
products_to_all = {}
#the main_categories relate to numbers dict
main_categories_map_to_num = {}
#all categories of product including the name of main_cat
all_cat = set()
#each specific item maps to the main_categories_number
specific_products_map_to_num = {}

'''
Input: two separate strings
Function:
    We provide another statistic model (ML) from spacy.similarity to compare two strings similarity firstly.
    if spacy similarities doesn't not exit like 0.0, then we compare two strings similarity by "gestalt pattern matching" 
    not_Statistic model(ML)  It is a character-based matcher. 
Output: Similarity [0, 1] between those two strings
'''
def similar(a: str, b: str) -> float:
    nlp = spacy.load("en_core_web_md")
    doc1 = nlp(a)
    doc2 = nlp(b)
    statistical_method_score = doc1.similarity(doc2)
    non_statistical_method_score = SequenceMatcher(None, a, b).ratio()
    if statistical_method_score < 0.1:
        return non_statistical_method_score
    return statistical_method_score

'''
Input: string
Function: remove any plural format and wired suffix, for example, dresses -> ['dress']
Output: list 
'''
def lemma_string(original_string:str) -> List:
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(original_string.lower())
    temp = []
    for token in doc:
        if token.text == '&' or token.text == 'and' or token.text == '/' or token.text == ',':
            continue
        if token.text == "glasses" or token.text == 'booty' or token.text == "sunglasses" or token.text == "earrings": #some of them should be represented in plural format
            temp.append(token.text)
        else:
            temp.append(token.lemma_)
    return temp
'''
input: original string
function: remove pre_suffix of the original 'product_type' to match more accurate items
output: modified string
'''
def modify_product_type(original:str):
    if len(original) <= 1:
        return original
    removable_words = ["clothing", "sale", "man", "men","mens", "men's", "woman", "women", "womens", "women's",  "unisex","girl", "girls", "girl's", "lady", "ladies", "snow","ladies'", "active","boy", "boys", "boy's", "graphic", "premium", "cozy", "designer","comfort", "athletic","casual", 'youth', 'adult']
    original = original.replace("+", " ").replace("-", " ").replace("&", " ").replace("/", " ").replace(",", " ")
    original = re.compile(r"\s+").sub(" ", original).strip()
    ori_list = original.split(" ")
    
    acc = ['acc', 'accessory', 'acessories','accesssories','accessories', "jewelry","polarized", "non polar"]
    shoes = ['footwear']
    homeware = ["home", "homeware"]
    tops = ["top", "tops"]
    main_cat = ''
    sub_cat = []

    res = []
    
    for e in ori_list:
        e = e.strip().lower()
        if e in removable_words:
            continue
        elif not res.__contains__(e):
            res.append(e)
        if e in acc:
            main_cat = "accessories"
        elif e in shoes:
            main_cat = "shoes"
        elif e in tops:
            main_cat = "tops"
        elif e == "bottoms":
            main_cat = "bottoms"
        elif e in homeware:
            main_cat = "homeware"
        elif e == "beauty":
            main_cat = "beauty"
        else:
            sub_cat.append(e)
    #if main_cat doesn't exist but sub_cat exists, we can try to map directly by specific_product_map
    sub_cat = " ".join(sub_cat)
    
    return " ".join(res), main_cat, sub_cat


'''
Input: None
Function: Read the .json file to initilize the dictionaries and sets
Output: None
'''
def initiailize_containers() -> None:
    with open("json_files/product_to_all.json") as f1:
        global products_to_all 
        products_to_all = json.load(f1)
    with open("json_files/main_categories_to_num.json") as f2: 
        global main_categories_map_to_num
        main_categories_map_to_num = json.load(f2)
    with open("json_files/specific_product_map_to_num.json") as f3:
        global specific_products_map_to_num
        specific_products_map_to_num = json.load(f3)
    for key in specific_products_map_to_num.keys():
        all_cat.add(key)
'''
Input: dataframe
Function: to calculate some statistics data after pre-processing.
'''
def summary_of_the_new_df(df:pd.DataFrame)->None:
    unknown = 0
    totalnum = df.shape[0]
    shoes = 0
    other_clothing = 0
    tops = 0
    bottoms = 0
    beauty = 0
    home = 0
    acc = 0
    other = 0
    for row in range(df.shape[0]):
        n = df.loc[row, "label_1st"]
        if n == 0:
            unknown += 1
        elif n == 1:
            shoes += 1
        elif n == 2:
            tops += 1
        elif n == 3:
            bottoms += 1
        elif n == 4:
            other_clothing += 1
        elif n == 5:
            beauty += 1
        elif n == 6:
            acc += 1
        elif n == 7:
            home += 1
        elif n == 8:
            other += 1
        
    print("unknown", unknown, "\t\tratio of all", unknown/totalnum)
    total = totalnum - unknown
    print("\nshoes", shoes,"\t\tratio", "{:10.2f}".format(shoes/total))
    print("tops", tops, "\t\tratio", "{:10.2f}".format(tops/total))
    print("bottoms", bottoms, "\t\tratio", "{:10.2f}".format(bottoms/total))
    print("other_clothing", other_clothing,"\tratio", "{:10.2f}".format(other_clothing/total))
    print("beauty", beauty, "\t\tratio", "{:10.2f}".format(beauty/total))
    print("accessories", acc, "\tratio", "{:10.2f}".format(acc/total))
    print("homeware", home, "\t\tratio", "{:10.2f}".format(home/total))
    print("other", other, "\t\tratio", "{:10.2f}".format(other/total))
#iniyilize all maps and sets.
initiailize_containers()

Init Plugin
Init Graph Optimizer
Init Kernel


In [3]:
original_file_path = "/Users/luis/Downloads/products-June-28th.csv"
assert original_file_path != None
columns = ["id", "title", "tags", "images", "gender","product_type",  "colors", "buckets", "url", "body_html"]
df = pd.DataFrame()
try:
    df = pd.read_csv(original_file_path,  usecols=columns).reset_index()
except:
    print("The path seems incorrect")
#insert new columns to this df which are useful and processed information 

df.insert(df.columns.get_loc("gender"), "product_type(modified)", "", allow_duplicates=True)
df.insert(df.columns.get_loc("gender"), "main_category", "", allow_duplicates=True)
df.insert(df.columns.get_loc("gender"), "sub_category", "", allow_duplicates=True)
df.insert(df.columns.get_loc("gender"), "match_most_similar_>80%_string", "", allow_duplicates=True)
df.insert(df.columns.get_loc("gender"), "match_most_similar_>60%_string", "", allow_duplicates=True)
df.insert(df.columns.get_loc("gender"), "label_1st", 0, allow_duplicates=True)
df.insert(df.columns.get_loc("gender"), "label_2nd", 0, allow_duplicates=True)
df.insert(df.columns.get_loc("gender"), "label_3rd", 0, allow_duplicates=True)
df.insert(df.columns.get_loc("buckets"), "buckets_num", 0, allow_duplicates=True)
df.insert(df.columns.get_loc("url"), "color_num", 0, allow_duplicates=True)
df = df[["index","id", "title", "tags", "images", "gender","product_type", "product_type(modified)", "main_category", "sub_category", "match_most_similar_>80%_string", "match_most_similar_>60%_string", "label_1st", "label_2nd","label_3rd", "buckets_num", "buckets", "color_num", "colors", "url", "body_html"]]
df.insert(df.columns.get_loc("body_html"), "raw_text", "", allow_duplicates=True)
for i in range(df.shape[0]):
    
    ori_word = df.loc[i, 'product_type']
    title = df.loc[i, 'title']
    tags = df.loc[i, 'tags']
    buckets = df.loc[i, 'buckets']
    body_html = df.loc[i, 'body_html']
    
    if not isinstance(ori_word, str):
        df.loc[i,['product_type']] = "unknown"
        df.loc[i, ['product_type(modified)']] = "unknown"
        continue
    #get the modify_product_type (remove)
    try:
        product_type_new, main_cat, sub_cat= modify_product_type(ori_word)
        df.loc[i, ['product_type(modified)']] = product_type_new
        df.loc[i, ["main_category"]] = main_cat
        df.loc[i, ["sub_category"]] = sub_cat
        
        if (main_cat != ""):
            #label the 1st, label if we already know exact main_catgories. 
            df.loc[i,["label_1st"]] = main_categories_map_to_num.get(main_cat)
    except:
        print(i, ori_word)
    try:
        #clean body_html as well. 
        df.loc[i, 'raw_text'] = tc.cleanHtml(body_html)
    except:
        print("Something wrong with clean html: ", i)
del(df["body_html"])

In [4]:

#build data series from all categories.
pre_defiened_labels = pd.Series(list(all_cat), name="pre_defined_label")
#get 80% most_similar_mathes(dataframe) by using package string-matcher
most_similar_matches = match_most_similar( pre_defiened_labels, df["sub_category"],\
    min_similarity = 0.80, ignore_index=False, replace_na=False)
most_similar_matches = pd.concat([df['index'],df["sub_category"], most_similar_matches], axis=1)
#get 60% most_similar_matches(dataframe) 
less_similar_matches = match_strings(pre_defiened_labels, df["sub_category"],\
    min_similarity = 0.65, ignore_index = False, replace_na = False)
#fill up 80% most_similar_80% column
empty_cells = 0
for row in range(most_similar_matches.shape[0]):
    index = most_similar_matches.loc[row, "index"]
    most_similar_pre_defined_label = most_similar_matches.loc[row, 'most_similar_pre_defined_label']
    most_similar_index = most_similar_matches.loc[row, ['most_similar_index']].item()
    
    if not np.isnan(most_similar_index):
        df.loc[df['index'] == index, ['match_most_similar_>80%_string']] = most_similar_pre_defined_label
    else:
        empty_cells += 1
#fill up 60% less_similar_60% column
index_similarity_map = {}
index_to_pre_map = {}
for row in range(less_similar_matches.shape[0]):
    current_index = less_similar_matches.loc[row, 'right_index']
    label = less_similar_matches.loc[row, 'left_pre_defined_label']
    similarity = less_similar_matches.loc[row, 'similarity']
    
    if index_similarity_map.get(current_index) == None:
        index_similarity_map.update({current_index:similarity})
        index_to_pre_map.update({current_index:label})
    elif index_similarity_map.get(current_index) >= similarity:
        index_similarity_map.update({current_index:similarity})
        index_to_pre_map.update({current_index:label})
    else: # similarity < then current, ignore
        continue
for key in index_to_pre_map.keys():
    df.loc[df['index'] == key, ['match_most_similar_>60%_string']] = index_to_pre_map.get(key)

In [5]:
#Algorithm 2: if both 60% and 80% don't exist, just ignore, 
#             if the 80% string doesn't exist, compare the 60% string with 80% string similarity by diff() and spacy(), 
#             let spacy.similarity to decide whether above 60% to fill the 80%. if spacy similarity does not exist, fill 
#             fill up 80% by similar() algorithm. 
#     speed : 500 data around 1 mins based on M1 pro.
double_check_pairs_pt_word60_dic = {}
#see the accurancy 

def is_nan_string(string):
    return len(string) == 0
#w = df.loc[66, ["match_most_similar_>80%_string"]].item()
#print(w)
#is_nan_string(w)
for row in range(df.shape[0]):
    word_80 = df.loc[row, "match_most_similar_>80%_string"]
    word_60 = df.loc[row, "match_most_similar_>60%_string"]
    #print(is_nan_string(word_80), "   ",word_60)
    if is_nan_string(word_80) and is_nan_string(word_60):
        #found nothing
        continue    
    elif is_nan_string(word_80) and not is_nan_string(word_60):
        pt_modified = df.loc[row, "sub_category"]
        pair = (pt_modified, word_60)
        #print(pair)
        if pair not in double_check_pairs_pt_word60_dic.keys():
            
            double_check_pairs_pt_word60_dic.update({pair: similar(word_60, pt_modified)})
        #print("product_type(modified) ", pt_modified, " words(60%): ", word_60, " Similarity: ", similar(word_60, pt_modified))
        if similar(word_60, pt_modified) > 0.0:
            df.loc[row, "match_most_similar_>80%_string"] = word_60

  statistical_method_score = doc1.similarity(doc2)


In [6]:
double_check_pairs_pt_word60_dic

{('blanket towel', 'blanket'): 0.7409514024113583,
 ('dresses sun dresses', 'dress'): 0.8874078855687398,
 ('dresses', 'dress'): 1.0000000571365557,
 ('denim', 'denim short'): 0.8061123004322047,
 ('swim wear one piece', 'one piece'): 0.7537377263726195,
 ('hats', 'hat'): 1.0000000929979673,
 ('sunglasses sunglasses acetateframe non polar',
  'sunglasses'): 0.8361596445974069,
 ('masks', 'mask'): 0.9999999509638685,
 ('clogs', 'clog'): 0.8888888888888888,
 ('sunglasses sunglasses metal frame', 'sunglasses'): 0.8023430109343745,
 ('sunglasses sunglasses acetate frame', 'sunglasses'): 0.7872220237976908,
 ('leather bags', 'leather'): 0.7887546956095955,
 ('bags', 'bag'): 0.9999999574708032,
 ('bottom', 'bikini bottom'): 0.7010592044148047,
 ('jeans', 'jean'): 0.18565857977597314,
 ('shoe', 'shoes'): 1.0000000267699771,
 ('pullover', 'pullover hoodie'): 0.8388999417205346,
 ('sunglasses sunglasses acetate frame non polar',
  'sunglasses'): 0.6967328885980686,
 ('jumpsuits rompers', 'jumps

In [7]:
#From previous we finished match similar products. We start to label the "label_1st" column
for row in range(df.shape[0]):
    key = df.loc[row, 'match_most_similar_>80%_string']
    if key != None and specific_products_map_to_num.get(key) != None:
        df.loc[row, ['label_1st']] = specific_products_map_to_num.get(key)
    


In [8]:
#df.to_csv("look.csv", index=False)

In [9]:
summary_of_the_new_df(df)

unknown 7945 		ratio of all 0.2794976430028847

shoes 1303 		ratio       0.06
tops 6033 		ratio       0.29
bottoms 3764 		ratio       0.18
other_clothing 1251 	ratio       0.06
beauty 403 		ratio       0.02
accessories 5985 	ratio       0.29
homeware 691 		ratio       0.03
other 1051 		ratio       0.05


In [10]:
df.to_csv("processed_products_from_June28.csv", index=False)