## This notebook creates training samples for the NER test.

In [2]:
import random
from spacy import displacy
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher
from text_cleaner import cleanHtml as html_cleaner
from text_cleaner import raw_content
import re
import pandas as pd
import sys
import json

Init Plugin
Init Graph Optimizer
Init Kernel


In [3]:
# Load the three JSON lookup tables used by the rest of the notebook.
# Each file is opened in a `with` block so the handle is closed as soon as
# its content has been parsed (the names f1/f2/f3 remain bound afterwards,
# as closed file objects).
with open("json_files/product_to_all.json") as f1:
    # dictionary of the form {"shoes": list[all categories], ...}
    product_to_all = json.load(f1)
with open("json_files/main_categories_to_num.json") as f2:
    main_cat_to_num = json.load(f2)
with open("json_files/labels_v2.json") as f3:
    original_labels_v2 = json.load(f3)

#### 1. Create entity rules for all entities.

In [4]:
# PATTERN vocabulary: one entry per line, lower-cased, trailing newline removed.
patterns_words = set()
with open("patterns_files/patterns_for_clothes.txt") as pa:
    patterns_words = {raw_line.lower().removesuffix('\n') for raw_line in pa}

# COLOR vocabulary: entries from the text file plus the "color" labels from
# labels_v2.json (lower-cased and stripped of surrounding whitespace).
colors_words = set()
with open("patterns_files/colors.txt") as colors_file:
    colors_words = {raw_line.lower().removesuffix('\n') for raw_line in colors_file}
colors_words.update(
    label.lower().strip() for label in original_labels_v2.get('color')
)

# SIZE vocabulary: double quotes, a trailing ",\n" and any remaining newline
# characters are removed; the original casing is kept.
sizes_words = set()
with open("patterns_files/sizes.txt") as sizes_file:
    sizes_words = {
        raw_line.replace('"', '').removesuffix(",\n").replace("\n","")
        for raw_line in sizes_file
    }

In [241]:
# List all known vocabulary entries we need to detect, one list per product
# category, each extended with a generic keyword for that category.
#
# New lists are built instead of calling .append() on the lists stored in
# `product_to_all`: appending in place mutated the loaded JSON data, so
# re-running this cell kept adding duplicate keywords.
shoes_words = [*product_to_all.get("shoes"), "shoes"]
tops_words = [*product_to_all.get("tops"), "tops"]
bottoms_words = [*product_to_all.get("bottoms"), "bottoms"]
# other_clothing has no extra generic keyword.
other_clothing_words = [*product_to_all.get("other_clothing")]
beauty_words = [*product_to_all.get("beauty"), "beauty"]
accessories_words = [*product_to_all.get("accessories"), "accessory"]
homeware_words = [*product_to_all.get("homeware"), "home"]
# The JSON key is "other" while the added keyword is "others".
others_words = [*product_to_all.get("other"), "others"]

In [5]:
# Gender labels come from labels_v2.json: lower-cased, then stripped.
genders_words = [
    gender_label.lower().strip()
    for gender_label in original_labels_v2.get("gender")
]
# Main category names are the keys of the category -> number mapping.
main_cats_words = [*main_cat_to_num.keys()]
# The vocabularies were collected as sets; convert them to lists.
colors_words = [*colors_words]
patterns_words = [*patterns_words]
sizes_words = [*sizes_words]

In [27]:
# Start from the small English pipeline and replace its statistical NER
# with a rule-based EntityRuler.
nlp = spacy.load("en_core_web_sm")
# Pattern validation is a setting of the EntityRuler itself and has to be
# passed through `config`. The `validate` keyword of `add_pipe` only checks
# the component config against the factory signature, so the previous call
# never validated the patterns added in the cells below (whose
# `except ValueError` handlers rely on invalid patterns raising).
entity_rulers = nlp.add_pipe("entity_ruler", config={"validate": True})
nlp.remove_pipe("ner")

('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2917259e0>)

In [7]:

from typing import List
def check_puct(word: str) -> List:
    """Build LEMMA-based token patterns for one whitespace-free word.

    input : one word of a product type, e.g. "t-shirt" or "shirt"
    function: split the word on "-" and emit one {"LEMMA": part} entry per
              part, with an optional punctuation token between parts
    output: e.g. [{"LEMMA": "t"}, {"IS_PUNCT": True, "OP": "?"}, {"LEMMA": "shirt"}]
            A word without "-" yields a single {"LEMMA": word} entry.
    """
    res = []
    for position, fragment in enumerate(word.split("-")):
        if position > 0:
            # Optional punctuation token between two hyphen-separated parts.
            res.append({"IS_PUNCT": True, "OP": "?"})
        res.append({"LEMMA": fragment})
    return res

In [8]:
def check_puct_and_lower_pattern(word: str) -> List:
    """Build LOWER-based token patterns for one whitespace-free word.

    input : one word of a product type, e.g. "t-shirt" or "shirt"
    function: split the word on "-" and emit one {"LOWER": part} entry per
              part, with an optional punctuation token between parts
    output: e.g. [{"LOWER": "t"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": "shirt"}]
            A word without "-" yields a single {"LOWER": word} entry.
    """
    # str.split always returns at least one element, so `head` is defined.
    head, *tail = word.split("-")
    res = [{"LOWER": head}]
    for fragment in tail:
        # Each further part is preceded by an optional punctuation token.
        res += [{"IS_PUNCT": True, "OP": "?"}, {"LOWER": fragment}]
    return res


#### Add Patterns to EntityRuler pipe.

In [246]:
# TOPS rules: every vocabulary entry gets a LEMMA-based and a LOWER-based
# token pattern; both variants use the entry itself as "id".
tops_phrases_words = [entry for entry in tops_words if len(entry.split()) > 1]
tops_single_words = [entry for entry in tops_words if len(entry.split()) <= 1]

tops_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in tops_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        tops_patterns.append({"label": "TOPS", "pattern": token_specs, "id": phrase})
    for single in tops_single_words:
        tops_patterns.append({"label": "TOPS", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in tops_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [247]:
# SHOES rules: every vocabulary entry gets a LEMMA-based and a LOWER-based
# token pattern; both variants use the entry itself as "id".
shoes_phrases_words = [entry for entry in shoes_words if len(entry.split()) > 1]
shoes_single_words = [entry for entry in shoes_words if len(entry.split()) <= 1]

shoes_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in shoes_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        shoes_patterns.append({"label": "SHOES", "pattern": token_specs, "id": phrase})
    for single in shoes_single_words:
        shoes_patterns.append({"label": "SHOES", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in shoes_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [248]:
# BOTTOMS rules: every vocabulary entry gets a LEMMA-based and a LOWER-based
# token pattern; both variants use the entry itself as "id".
bottoms_phrases_words = [entry for entry in bottoms_words if len(entry.split()) > 1]
bottoms_single_words = [entry for entry in bottoms_words if len(entry.split()) <= 1]

bottoms_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in bottoms_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        bottoms_patterns.append({"label": "BOTTOMS", "pattern": token_specs, "id": phrase})
    for single in bottoms_single_words:
        bottoms_patterns.append({"label": "BOTTOMS", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in bottoms_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [249]:
# OTHER_CLOTHING rules: every vocabulary entry gets a LEMMA-based and a
# LOWER-based token pattern; both variants use the entry itself as "id".
other_clothing_phrases_words = [
    entry for entry in other_clothing_words if len(entry.split()) > 1
]
other_clothing_single_words = [
    entry for entry in other_clothing_words if len(entry.split()) <= 1
]

other_clothing_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in other_clothing_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        other_clothing_patterns.append({"label": "OTHER_CLOTHING", "pattern": token_specs, "id": phrase})
    for single in other_clothing_single_words:
        other_clothing_patterns.append({"label": "OTHER_CLOTHING", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in other_clothing_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [250]:
# BEAUTY rules: every vocabulary entry gets a LEMMA-based and a LOWER-based
# token pattern; both variants use the entry itself as "id".
beauty_phrases_words = [entry for entry in beauty_words if len(entry.split()) > 1]
beauty_single_words = [entry for entry in beauty_words if len(entry.split()) <= 1]

beauty_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in beauty_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        beauty_patterns.append({"label": "BEAUTY", "pattern": token_specs, "id": phrase})
    for single in beauty_single_words:
        beauty_patterns.append({"label": "BEAUTY", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in beauty_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [251]:
# HOME rules (vocabulary: homeware_words): every entry gets a LEMMA-based and
# a LOWER-based token pattern; both variants use the entry itself as "id".
home_phrases_words = [entry for entry in homeware_words if len(entry.split()) > 1]
home_single_words = [entry for entry in homeware_words if len(entry.split()) <= 1]

home_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in home_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        home_patterns.append({"label": "HOME", "pattern": token_specs, "id": phrase})
    for single in home_single_words:
        home_patterns.append({"label": "HOME", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in home_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [252]:
# ACCESSORIES rules: every vocabulary entry gets a LEMMA-based pattern whose
# "id" is the entry itself and a LOWER-based pattern whose "id" is the entry
# with "v2" appended.
accessories_phrases_words = [
    entry for entry in accessories_words if len(entry.split()) > 1
]
accessories_single_words = [
    entry for entry in accessories_words if len(entry.split()) <= 1
]

accessories_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens, id_suffix in ((check_puct, ""), (check_puct_and_lower_pattern, "v2")):
    for phrase in accessories_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        accessories_patterns.append({"label": "ACCESSORIES", "pattern": token_specs, "id": phrase + id_suffix})
    for single in accessories_single_words:
        accessories_patterns.append({"label": "ACCESSORIES", "pattern": list(build_tokens(single)), "id": single + id_suffix})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in accessories_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [253]:
# OTHERS rules: every vocabulary entry gets a LEMMA-based pattern whose "id"
# is the entry itself and a LOWER-based pattern whose "id" is the entry with
# "v2" appended.
others_phrases_words = [entry for entry in others_words if len(entry.split()) > 1]
others_single_words = [entry for entry in others_words if len(entry.split()) <= 1]

others_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens, id_suffix in ((check_puct, ""), (check_puct_and_lower_pattern, "v2")):
    for phrase in others_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        others_patterns.append({"label": "OTHERS", "pattern": token_specs, "id": phrase + id_suffix})
    for single in others_single_words:
        others_patterns.append({"label": "OTHERS", "pattern": list(build_tokens(single)), "id": single + id_suffix})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in others_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [10]:
# GENDER rules: every vocabulary entry gets a LEMMA-based and a LOWER-based
# token pattern; both variants use the entry itself as "id".
genders_phrases_words = [entry for entry in genders_words if len(entry.split()) > 1]
genders_single_words = [entry for entry in genders_words if len(entry.split()) <= 1]

genders_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in genders_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        genders_patterns.append({"label": "GENDER", "pattern": token_specs, "id": phrase})
    for single in genders_single_words:
        genders_patterns.append({"label": "GENDER", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in genders_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [11]:
# PATTERN rules: every vocabulary entry gets a LEMMA-based pattern whose "id"
# is the entry itself and a LOWER-based pattern whose "id" is the entry with
# "v2" appended.
patterns_phrases_words = [entry for entry in patterns_words if len(entry.split()) > 1]
patterns_single_words = [entry for entry in patterns_words if len(entry.split()) <= 1]

patterns_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens, id_suffix in ((check_puct, ""), (check_puct_and_lower_pattern, "v2")):
    for phrase in patterns_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        patterns_patterns.append({"label": "PATTERN", "pattern": token_specs, "id": phrase + id_suffix})
    for single in patterns_single_words:
        patterns_patterns.append({"label": "PATTERN", "pattern": list(build_tokens(single)), "id": single + id_suffix})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in patterns_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [12]:
# COLOR rules: every vocabulary entry gets a LEMMA-based and a LOWER-based
# token pattern; both variants use the entry itself as "id".
colors_phrases_words = [entry for entry in colors_words if len(entry.split()) > 1]
colors_single_words = [entry for entry in colors_words if len(entry.split()) <= 1]

colors_patterns = []
# Resulting order: LEMMA phrases, LEMMA singles, LOWER phrases, LOWER singles.
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in colors_phrases_words:
        token_specs = []
        for part in phrase.split():
            token_specs.extend(build_tokens(part))
        colors_patterns.append({"label": "COLOR", "pattern": token_specs, "id": phrase})
    for single in colors_single_words:
        colors_patterns.append({"label": "COLOR", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rule rejected with ValueError
# is printed and skipped without aborting the remaining ones.
for rule in colors_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [24]:
# SIZE rules built from the size vocabulary plus one regex-based rule.
sizes_patterns = []
for word in sizes_words:
    if word in ('S', 'M', 'L'):
        # Single-letter sizes stay exact-text phrase patterns so that only
        # the upper-case letter is tagged.
        t = {"label": "SIZE", "pattern": word, "id": word}
    else:
        # The LOWER attribute is compared against the lower-cased token text,
        # so the pattern value itself must be lower-case (sizes_words keeps
        # the file's original casing). Multi-word entries become one token
        # spec per word, since a single token never contains a space.
        t = {
            "label": "SIZE",
            "pattern": [{"LOWER": part} for part in word.lower().split()],
            "id": word,
        }
    sizes_patterns.append(t)
#regex_expression = r"(\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x) ?(\d+(?:\.|\/|)\d+|\d+) ?(?:mm|MM|cm|CM|[Ii]nches|in|\"|)|(\d+(?:\.|\'|/|)\d+ ?(?:cm|CM|mm|MM|[Ii]nches|inch|in|\"))|(\d*(?:X{1,3}[SL])(?:$|\s+))|(?:[Ss]ize) ?\d+(?:\.|\/|\d+)"
regex_expression_1 = r"(\d*(?:X{1,3}[SL])(?:$|\s+))"
# The outer group was missing its closing parenthesis, which made this
# expression impossible to compile.
regex_expression_2 = r"(\d+(?:\.|\'|/|)\d+ ?(?:cm|CM|mm|MM|[Ii]nches|inch|in|))"
regex_expression_3 = r"(?:cm|CM|mm|MM|[Ii]nches|inch|in)"
p1 = [{"TEXT": {"REGEX": regex_expression_1}}]
# p2 and p3 are defined for experimentation; only p1 is registered below.
p2 = [{"TEXT": {"REGEX": regex_expression_2}}]
p3 = [{"TEXT": {'REGEX': regex_expression_3}}]

sizes_patterns.append({"label":"SIZE", "pattern":p1, "id": "number size"})
#sizes_patterns.append({"label":"SIZE", "pattern":p3})
#sizes_patterns.append({"label":"SIZE", "pattern":[ [{"ORTH":"size"}, {"ORTH": ":", "OP": "?"}, {{"TEXT":  {"REGEX": r"\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x) ?(\d+(?:\.|\/|)\d+|\d+) ?(?:mm|MM|cm|CM|[Ii]nches|in|\"|"}}} ]]})

# Register the rules one at a time; only ValueError is caught (same as the
# other pattern cells) so unrelated failures and KeyboardInterrupt are no
# longer silently swallowed.
for item in sizes_patterns:
    try:
        entity_rulers.add_patterns([item])
    except ValueError:
        print(item)

    

In [19]:
# Load the preprocessed product table from a machine-specific absolute path.
df = pd.read_csv("/Users/luis/Documents/GitHub/2022Summer/AI-Oriented-Recommendation-System/PreprocessingData/data/processed_products_from_June28.csv")
# entity_options is the module that defines the display colors for each entity
# label. It is only needed for colored displaCy output and can be ignored
# otherwise. Its parent directory is added to sys.path so the import resolves.
sys.path.append("/Users/luis/Documents/GitHub/2022Summer/AI-Oriented-Recommendation-System/PreprocessingData/")
from spaCy import entity_options as e

In [26]:
# Randomly choose one original text and render the entities found by our
# pattern rules.
# random.randint includes its upper bound, so randint(0, df.shape[0]) could
# return df.shape[0], one past the last row label; randrange excludes it.
# NOTE(review): .loc is label-based, so this presumes df has the default
# 0..n-1 index — confirm if the CSV is ever loaded with another index.
sample = df.loc[random.randrange(df.shape[0]), "raw_text"]
# Rows whose raw_text is not a string (e.g. missing values) are skipped.
if isinstance(sample, str):
    doc = nlp(sample)
    displacy.render(doc, style="ent", options=e.get_entity_options())

In [23]:
# This cell is for debugging the pattern rules on a hand-written sentence that
# mixes colors, genders, product types, sizes and measurements.
example2 = "I have a Men Black/black WHITE white watch and a lot of pink watches. Also I have a zebra plaid top tops. This size XXL T-shirt is amazing. Clogs are good.\
    Earrings, earing and earring are different. Bikini top. This jumpsuits size is 3XL which is size 24 - 25mm. cotton is 100%  the Size is 23 x 23 cm. What is 21 CM 5.54  "

# Run the rule-based pipeline on the example.
doc = nlp(example2)
# (text, label) pairs of every entity found, kept for inspection.
ents = [(ent.text, ent.label_) for ent in doc.ents]
# Render the entities inline with the colors from entity_options.
displacy.render(doc, style="ent", options=e.get_entity_options())

In [52]:
# Stand-alone Matcher experiment for size expressions, run on a sizing-guide
# text; it does not modify the EntityRuler.
matcher = Matcher(nlp.vocab, validate=True)
#sizes_pattern = [{"LABEL": "SIZE", "PATTERN": [ {"TEXT": {"REGEX": "(\d*(?:M|X{0,2}[SL]))"}} ]}]
#entity_rulers.add_patterns(sizes_pattern)

# One single-token pattern; the regex is tested against each token's text.
p = [[{"TEXT": {"REGEX": r"(\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x|X) ?(\d+(?:\.|\/|)\d+|\d+) ?(mm|MM|cm|CM|[Ii]nches|in|\"|)|(\d+(?:\.|\'|/|)\d+ ?(?:cm|CM|mm|MM|[Ii]nches|inch|in|\"))|(\d*(?:X{1,3}[SL])(?:$|\s+))|\d+(?:\.|\/|\d+)"}}]]
#add s, m, l, small medium, large, manually...
matcher.add("size", p)
size_text = "Hat Sizing Guide Adult Sizes 3XL XS S M L XL XXL One Size (OS)* CM \
    54cm 55.5 57 59 61 63 55-61 Inches 211⁄4 215⁄8 221⁄2 231⁄4 24 247⁄8 215⁄8 - 24.5 \
    US Sizes 63⁄4 67⁄8 71⁄8 73⁄8 75⁄8 77⁄8 67⁄8 - 75⁄8 UK Sizes 65⁄8 63⁄4 7 71⁄4 71⁄2 73⁄4 63⁄4 - 71⁄2 . size 21 21 cm"
doc = nlp(size_text)
# Print each matched span with its [start, end) token offsets. The match id
# is unused and named `_match_id` so the builtin `id` is not shadowed for
# the rest of the notebook session.
for _match_id, s, end in matcher(doc):
    print(doc[s:end], [s, end])

3XL [5, 6]
XS [6, 7]
XL [10, 11]
XXL [11, 12]
54 [20, 21]
55.5 [22, 23]
57 [23, 24]
59 [24, 25]
61 [25, 26]
63 [26, 27]
55 [27, 28]
61 [29, 30]
211⁄4 [31, 32]
215⁄8 [32, 33]
221⁄2 [33, 34]
231⁄4 [34, 35]
24 [35, 36]
247⁄8 [36, 37]
215⁄8 [37, 38]
24.5 [39, 40]
63⁄4 [43, 44]
67⁄8 [44, 45]
71⁄8 [45, 46]
73⁄8 [46, 47]
75⁄8 [47, 48]
77⁄8 [48, 49]
67⁄8 [49, 50]
75⁄8 [51, 52]
65⁄8 [54, 55]
63⁄4 [55, 56]
71⁄4 [57, 58]
71⁄2 [58, 59]
73⁄4 [59, 60]
63⁄4 [60, 61]
71⁄2 [62, 63]
21 [65, 66]
21 [66, 67]


In [263]:
# Disabled maintenance snippet, kept inside a bare string literal so it does
# not execute. It recomputes the 'raw_text' column with raw_content() from
# each row's title, tags and body_html, then writes the table to a new CSV.
'''
#IF we revise the algorithm of clean body_html and then we need to Rebuild new 'raw_text' column
for row in range(df.shape[0]):
    df.loc[row, 'raw_text'] = raw_content(df.loc[row, "title"], df.loc[row, 'tags'], df.loc[row, 'body_html'])

df.to_csv("(v1.4)cleaned_dataset.csv", index=False)
'''

'\n#IF we revise the algorithm of clean body_html and then we need to Rebuild new \'raw_text\' column\nfor row in range(df.shape[0]):\n    df.loc[row, \'raw_text\'] = raw_content(df.loc[row, "title"], df.loc[row, \'tags\'], df.loc[row, \'body_html\'])\n\ndf.to_csv("(v1.4)cleaned_dataset.csv", index=False)\n'