## This notebook creates training samples for the NER test.

In [76]:
import random
from spacy import displacy
import spacy
from spacy.lang.en import English
from spacy.pipeline import EntityRuler
from spacy.matcher import Matcher
from text_cleaner import cleanHtml as html_cleaner
from text_cleaner import raw_content
import re
import pandas as pd
import sys
import json
import unicodedata
from spacy.tokens import DocBin

In [77]:
# Load the original JSON resources. Context managers make sure every file
# handle is closed as soon as its content has been parsed.
# product_to_all maps a main category (e.g. "shoes") to the list of all its
# product terms.
with open("json_files/product_to_all.json") as f1:
    product_to_all = json.load(f1)
with open("json_files/main_categories_to_num.json") as f2:
    main_cat_to_num = json.load(f2)
with open("json_files/labels_v2.json") as f3:
    original_labels_v2 = json.load(f3)

#### 1. Create entity rules for all entities.

In [78]:
# Clothing pattern words: one lowercased entry per line of the file.
patterns_words = set()
with open("patterns_files/patterns_for_clothes.txt") as patterns_file:
    for raw_line in patterns_file:
        patterns_words.add(raw_line.lower().removesuffix('\n'))

# Colour words: the file content plus the colour labels from labels_v2.
colors_words = set()
with open("patterns_files/colors.txt") as colors_file:
    for raw_line in colors_file:
        colors_words.add(raw_line.lower().removesuffix('\n'))
for color_label in original_labels_v2.get('color'):
    colors_words.add(color_label.lower().strip())

# Size words: drop the double quotes, the trailing ",\n" and any newline left.
sizes_words = set()
with open("patterns_files/sizes.txt") as sizes_file:
    for raw_line in sizes_file:
        unquoted = raw_line.replace('"', '')
        sizes_words.add(unquoted.removesuffix(",\n").replace("\n", ""))

In [79]:
#list all known patterns we need to detect.

# Pull the term list of every main category out of product_to_all.
shoes_words = product_to_all.get("shoes")
tops_words = product_to_all.get("tops")
bottoms_words = product_to_all.get("bottoms")
other_clothing_words = product_to_all.get("other_clothing")
beauty_words = product_to_all.get("beauty")
accessories_words = product_to_all.get("accessories")
homeware_words = product_to_all.get("homeware")
others_words = product_to_all.get("others")

# Add a generic category term to each list (other_clothing gets none).
for category_terms, generic_term in (
    (shoes_words, "shoes"),
    (tops_words, "tops"),
    (bottoms_words, "bottoms"),
    (others_words, "others"),
    (beauty_words, "beauty"),
    (accessories_words, "accessory"),
    (homeware_words, "home"),
):
    category_terms.append(generic_term)

In [80]:
# Normalise the gender labels and turn every word collection into a list.
genders_words = []
for gender_label in original_labels_v2.get("gender"):
    genders_words.append(gender_label.lower().strip())
main_cats_words = [*main_cat_to_num.keys()]
colors_words = [*colors_words]
patterns_words = [*patterns_words]
sizes_words = [*sizes_words]

In [81]:
# Load the large English pipeline.
nlp = spacy.load("en_core_web_lg")
# Add an entity ruler (with pattern validation enabled); all patterns built
# below are registered on this component.
entity_rulers = nlp.add_pipe("entity_ruler", validate=True)
# Drop the pipeline's own "ner" component so that entities come from the
# entity ruler instead.
nlp.remove_pipe("ner")

('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2915e8dd0>)

In [82]:
from typing import List


def check_puct(word: str) -> List:
    """Build LEMMA-based token patterns for one word, splitting on hyphens.

    Args:
        word: A single product term such as "t-shirt" or "shirt".

    Returns:
        One ``{"LEMMA": part}`` dict per hyphen-separated part, with an
        optional punctuation token ``{"IS_PUNCT": True, "OP": "?"}`` between
        consecutive parts. For example "t-shirt" gives
        ``[{"LEMMA": "t"}, {"IS_PUNCT": True, "OP": "?"}, {"LEMMA": "shirt"}]``
        and "shirt" gives ``[{"LEMMA": "shirt"}]``.
    """
    pattern = []
    for position, part in enumerate(word.split("-")):
        if position > 0:
            # The hyphen may or may not be present in the text, hence OP "?".
            pattern.append({"IS_PUNCT": True, "OP": "?"})
        pattern.append({"LEMMA": part})
    return pattern

In [83]:
def check_puct_and_lower_pattern(word: str) -> List:
    """Build LOWER-based token patterns for one word, splitting on hyphens.

    Args:
        word: A single product term such as "t-shirt" or "shirt".

    Returns:
        One ``{"LOWER": part}`` dict per hyphen-separated part, with an
        optional punctuation token ``{"IS_PUNCT": True, "OP": "?"}`` between
        consecutive parts. For example "t-shirt" gives
        ``[{"LOWER": "t"}, {"IS_PUNCT": True, "OP": "?"}, {"LOWER": "shirt"}]``.
    """
    first, *remaining = word.split("-")
    tokens = [{"LOWER": first}]
    for part in remaining:
        # Each further part is preceded by an optional punctuation token.
        tokens.append({"IS_PUNCT": True, "OP": "?"})
        tokens.append({"LOWER": part})
    return tokens


#### Add Patterns to EntityRuler pipe.

In [84]:
# TOPS rules: multi-word terms and single words, each registered once with
# LEMMA-based tokens and once with LOWER-based tokens.
tops_phrases_words = [term for term in tops_words if len(term.split()) > 1]
tops_single_words = [term for term in tops_words if len(term.split()) <= 1]

tops_patterns = []
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in tops_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        tops_patterns.append({"label": "TOPS", "pattern": tokens, "id": phrase})
    for single in tops_single_words:
        tops_patterns.append({"label": "TOPS", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in tops_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [85]:
# SHOES rules: multi-word terms and single words, each registered once with
# LEMMA-based tokens and once with LOWER-based tokens.
shoes_phrases_words = [term for term in shoes_words if len(term.split()) > 1]
shoes_single_words = [term for term in shoes_words if len(term.split()) <= 1]

shoes_patterns = []
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in shoes_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        shoes_patterns.append({"label": "SHOES", "pattern": tokens, "id": phrase})
    for single in shoes_single_words:
        shoes_patterns.append({"label": "SHOES", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in shoes_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [86]:
# BOTTOMS rules: multi-word terms and single words, each registered once with
# LEMMA-based tokens and once with LOWER-based tokens.
bottoms_phrases_words = [term for term in bottoms_words if len(term.split()) > 1]
bottoms_single_words = [term for term in bottoms_words if len(term.split()) <= 1]

bottoms_patterns = []
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in bottoms_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        bottoms_patterns.append({"label": "BOTTOMS", "pattern": tokens, "id": phrase})
    for single in bottoms_single_words:
        bottoms_patterns.append({"label": "BOTTOMS", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in bottoms_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [87]:
# OTHER_CLOTHING rules: multi-word terms and single words, each registered
# once with LEMMA-based tokens and once with LOWER-based tokens.
other_clothing_phrases_words = [term for term in other_clothing_words if len(term.split()) > 1]
other_clothing_single_words = [term for term in other_clothing_words if len(term.split()) <= 1]

other_clothing_patterns = []
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in other_clothing_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        other_clothing_patterns.append({"label": "OTHER_CLOTHING", "pattern": tokens, "id": phrase})
    for single in other_clothing_single_words:
        other_clothing_patterns.append({"label": "OTHER_CLOTHING", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in other_clothing_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [88]:
# BEAUTY rules: multi-word terms and single words, each registered once with
# LEMMA-based tokens and once with LOWER-based tokens.
beauty_phrases_words = [term for term in beauty_words if len(term.split()) > 1]
beauty_single_words = [term for term in beauty_words if len(term.split()) <= 1]

beauty_patterns = []
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in beauty_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        beauty_patterns.append({"label": "BEAUTY", "pattern": tokens, "id": phrase})
    for single in beauty_single_words:
        beauty_patterns.append({"label": "BEAUTY", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in beauty_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [89]:
# HOME rules: multi-word terms and single words from homeware_words, each
# registered once with LEMMA-based tokens and once with LOWER-based tokens.
home_phrases_words = [term for term in homeware_words if len(term.split()) > 1]
home_single_words = [term for term in homeware_words if len(term.split()) <= 1]

home_patterns = []
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in home_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        home_patterns.append({"label": "HOME", "pattern": tokens, "id": phrase})
    for single in home_single_words:
        home_patterns.append({"label": "HOME", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in home_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [90]:
# ACCESSORIES rules: multi-word terms and single words. The LEMMA-based rules
# use the term as id, the LOWER-based rules use the term followed by "v2".
accessories_phrases_words = [term for term in accessories_words if len(term.split()) > 1]
accessories_single_words = [term for term in accessories_words if len(term.split()) <= 1]

accessories_patterns = []
for build_tokens, id_suffix in ((check_puct, ""), (check_puct_and_lower_pattern, "v2")):
    for phrase in accessories_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        accessories_patterns.append({"label": "ACCESSORIES", "pattern": tokens, "id": phrase + id_suffix})
    for single in accessories_single_words:
        accessories_patterns.append({"label": "ACCESSORIES", "pattern": list(build_tokens(single)), "id": single + id_suffix})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in accessories_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [91]:
# OTHERS rules: multi-word terms and single words. The LEMMA-based rules use
# the term as id, the LOWER-based rules use the term followed by "v2".
others_phrases_words = [term for term in others_words if len(term.split()) > 1]
others_single_words = [term for term in others_words if len(term.split()) <= 1]

others_patterns = []
for build_tokens, id_suffix in ((check_puct, ""), (check_puct_and_lower_pattern, "v2")):
    for phrase in others_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        others_patterns.append({"label": "OTHERS", "pattern": tokens, "id": phrase + id_suffix})
    for single in others_single_words:
        others_patterns.append({"label": "OTHERS", "pattern": list(build_tokens(single)), "id": single + id_suffix})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in others_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [92]:
# GENDER rules: multi-word terms and single words, each registered once with
# LEMMA-based tokens and once with LOWER-based tokens.
genders_phrases_words = [term for term in genders_words if len(term.split()) > 1]
genders_single_words = [term for term in genders_words if len(term.split()) <= 1]

genders_patterns = []
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in genders_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        genders_patterns.append({"label": "GENDER", "pattern": tokens, "id": phrase})
    for single in genders_single_words:
        genders_patterns.append({"label": "GENDER", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in genders_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [93]:
# PATTERN rules: multi-word terms and single words. The LEMMA-based rules use
# the term as id, the LOWER-based rules use the term followed by "v2".
patterns_phrases_words = [term for term in patterns_words if len(term.split()) > 1]
patterns_single_words = [term for term in patterns_words if len(term.split()) <= 1]

patterns_patterns = []
for build_tokens, id_suffix in ((check_puct, ""), (check_puct_and_lower_pattern, "v2")):
    for phrase in patterns_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        patterns_patterns.append({"label": "PATTERN", "pattern": tokens, "id": phrase + id_suffix})
    for single in patterns_single_words:
        patterns_patterns.append({"label": "PATTERN", "pattern": list(build_tokens(single)), "id": single + id_suffix})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in patterns_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [94]:
# COLOR rules: multi-word terms and single words, each registered once with
# LEMMA-based tokens and once with LOWER-based tokens.
colors_phrases_words = [term for term in colors_words if len(term.split()) > 1]
colors_single_words = [term for term in colors_words if len(term.split()) <= 1]

colors_patterns = []
for build_tokens in (check_puct, check_puct_and_lower_pattern):
    for phrase in colors_phrases_words:
        tokens = [tok for piece in phrase.split() for tok in build_tokens(piece)]
        colors_patterns.append({"label": "COLOR", "pattern": tokens, "id": phrase})
    for single in colors_single_words:
        colors_patterns.append({"label": "COLOR", "pattern": list(build_tokens(single)), "id": single})

# Register the rules one at a time so that a rejected rule can be printed.
for rule in colors_patterns:
    try:
        entity_rulers.add_patterns([rule])
    except ValueError:
        print(rule)

In [95]:
sizes_patterns = []
# Only plain size words (s, m, l, large, small, medium) belong in sizes_words.
for word in sizes_words:
    if word in ('S', 'M', 'L'):
        # Single capital letters are registered as a string (phrase) pattern
        # rather than a LOWER token pattern.
        t = {"label": "SIZE", "pattern": word, "id": word}
    else:
        # LOWER is compared against the lowercased token text, so the pattern
        # value has to be lowercase as well; a value containing an uppercase
        # letter could never match.
        t = {"label": "SIZE", "pattern": [{"LOWER": word.lower()}], "id": word}
    sizes_patterns.append(t)
#regex_expression = r"(\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x) ?(\d+(?:\.|\/|)\d+|\d+) ?(?:mm|MM|cm|CM|[Ii]nches|in|\"|)|(\d+(?:\.|\'|/|)\d+ ?(?:cm|CM|mm|MM|[Ii]nches|inch|in|\"))|(\d*(?:X{1,3}[SL])(?:$|\s+))|(?:[Ss]ize) ?\d+(?:\.|\/|\d+)"
#regex_expression_1 = r"(\d*(?:X{1,3}[SL])(?:$|\s+))"
#regex_expression_2 = r"(\d+(?:\.|\'|/|)\d+ ?(?:cm|CM|mm|MM|[Ii]nches|inch|in|)"
#regex_expression_3 = r"(?:cm|CM|mm|MM|[Ii]nches|inch|in)"
#p1 = [{"TEXT": {"REGEX": regex_expression_1}}]
#p2 = [{"TEXT": {"REGEX": regex_expression_2}}]
#p3 = [{"TEXT": {'REGEX': regex_expression_3}}]

#sizes_patterns.append({"label":"SIZE", "pattern":p1, "id": "number size"})
#sizes_patterns.append({"label":"SIZE", "pattern":p3})
#sizes_patterns.append({"label":"SIZE", "pattern":[ [{"ORTH":"size"}, {"ORTH": ":", "OP": "?"}, {{"TEXT":  {"REGEX": r"\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x) ?(\d+(?:\.|\/|)\d+|\d+) ?(?:mm|MM|cm|CM|[Ii]nches|in|\"|"}}} ]]})

# Register the size rules one at a time and print any rule the ruler rejects.
# Only ValueError is caught (as for the other labels) so that unrelated errors
# and KeyboardInterrupt are no longer swallowed by a bare except.
for item in sizes_patterns:
    try:
        entity_rulers.add_patterns([item])
    except ValueError:
        print(item)

    

In [96]:
# Load the product data set (absolute path on the original author's machine).
df = pd.read_csv("/Users/luis/Documents/GitHub/2022Summer/AI-Oriented-Recommendation-System/PreprocessingData/data/(V1.5)all_products_data_set.csv")
# entity_options is the module that provides the colors for the entity labels.
# It is only needed for colorful displaCy output and can be ignored otherwise.
sys.path.append("/Users/luis/Documents/GitHub/2022Summer/AI-Oriented-Recommendation-System/PreprocessingData/")
from spaCy import entity_options as e

In [98]:
#Randomly choose from original texts and test our patterns rules.
# Pick one random row and render the entities found by the rule pipeline.
# random.randrange excludes its upper bound, whereas random.randint(0, n)
# could return n itself, which is one past the last row.
sample = df.loc[random.randrange(df.shape[0]), "raw_text"]
# Rows without text are not strings and are skipped.
if isinstance(sample, str):
    doc = nlp(sample)
    displacy.render(doc, style="ent", options=e.get_entity_options())



In [99]:
#This scope is FOR DEBUGGING the pattern functionality.
example2 = "I have a Men Black/black WHITE white watch and a lot of pink watches. Also I have a S zebra plaid top tops. This size XXL T-shirt is amazing. Clogs are good.\
    Earrings, earing and earring are different. Bikini top. This jumpsuits size is 3XL which is size 24 - 25mm. cotton is 100%  the Size is 23 x 23 cm. What is 21 CM 5.54  "

# Run the rule pipeline on the hand-written sample and render what it tags.
doc = nlp(example2)
ents = [(span.text, span.label_) for span in doc.ents]
displacy.render(doc, style="ent", options=e.get_entity_options())

# Character offsets, surface text and label of every detected entity.
for span in doc.ents:
    print(span.start_char, span.end_char, span.text, span.label_)
    

9 12 Men GENDER
13 18 Black COLOR
19 24 black COLOR
25 30 WHITE COLOR
31 36 white COLOR
37 42 watch ACCESSORIES
56 60 pink COLOR
61 68 watches ACCESSORIES
84 85 S SIZE
86 91 zebra PATTERN
92 97 plaid PATTERN
102 106 tops TOPS
122 129 T-shirt TOPS
142 147 Clogs SHOES
161 169 Earrings ACCESSORIES
205 215 Bikini top OTHER_CLOTHING
222 231 jumpsuits OTHER_CLOTHING


In [100]:
#matcher = Matcher(nlp.vocab, validate=True)
#sizes_pattern = [{"LABEL": "SIZE", "PATTERN": [ {"TEXT": {"REGEX": "(\d*(?:M|X{0,2}[SL]))"}} ]}]
#entity_rulers.add_patterns(sizes_pattern)

#regex_string = r"(\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x) ?(\d+(?:\.|\/|)\d+|\d+) ?(?:mm|MM|cm|CM|[Ii]nches|in|\"|)|(\d+(?:\.|\'|\/|)\d* ?(?:\%|cm|CM|mm|MM|[Ii]nches|inch|in|\"))|(\d*(?:X{1,3}[SL])(?:$|\s+))|\d+(?:\.|\/|\d+)"
#matches = regex.findall(regex_string, size_text, overlapped=True)
#m = set([m for m in matches])
#matches2 = re.findall(regex_string, size_text)
#m2 = set([m for m in matches2])   
#print(len(m), len(m2), m.difference(m2))
#p = re.compile(r"(\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x) ?(\d+(?:\.|\/|)\d+|\d+) ?(?:mm|MM|cm|CM|[Ii]nches|in|\"|)|(\d+(?:\.|\'|\/|)\d+ ?(?:\%|cm|CM|mm|MM|[Ii]nches|inch|in|\"))|(\d*(?:X{1,3}[SL])(?:$|\s+))|\d+(?:\.|\/|\d+) ?\d+")


#regex_string = r"(\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x) ?(\d+(?:\.|\/|)\d+|\d+) ?(?:mm|MM|cm|CM|[Ii]nches|in|\"|)|(\d+(?:\.|\'|\/|)\d+ ?(?:\%|cm|CM|mm|MM|[Ii]nches|inch|in|\"))|(\d*(?:X{1,3}[SL])(?:$|\s+))|\d+(?:\.|\/|\d+) ?\d+"
#size_text = "I have a Men Black/black WHITE white watch and a lot of pink watches. Also I have a zebra plaid top tops. This size XXL T-shirt is amazing. Clogs are good.\
#    Earrings, earing and earring are different. Bikini top. This jumpsuits size is 3XL which is size 24 - 25mm. cotton is 100%  the Size is 23 x 23 cm. What is 21 CM 5.54 "

#matches3 = re.finditer(regex_string, example2)
#for match in matches3:
#    print("[({start_char}, {end_char}, '{ENT}')]".format(start_char = match.span()[0], end_char = match.span()[1], ENT = "SIZE"))
#'''

In [56]:

# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility


#regex_string = r"(\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x) ?(\d+(?:\.|\/|)\d+|\d+) ?(?:mm|MM|cm|CM|[Ii]nches|in|\"|)|(\d+(?:\.|\'|\/|)\d+ ?(?:\%|cm|CM|mm|MM|[Ii]nches|inch|in|\"))|(\d*(?:X{1,3}[SL])(?:$|\s+))|\d+(?:\.|\/|\d+) ?\d+"

#test_str = "3XL I have a Men GENDER Black 3.33 cm COLOR / black COLOR WHITE COLOR white COLOR watch and a lot of pink COLOR watches. Also I have a zebra plaid top. This size XXL SIZE T-shirt is amazing. Clogs are good. Earrings, earing and earring are different. Bikini top. This jumpsuits size is 3XL which is size 24 - 25 mm. cotton is 100 % the size is 23.3 x 23 cm. What is 21 CM 7/45 x 13/45 cm  5.45 inches 7234/177"

#matches = re.finditer(regex_string, test_str)

#for matchNum, match in enumerate(matches, start=0):
    
#    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    
# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.



In [57]:
#Check the 5000 training data with size, color, and gender.


In [58]:
def parse_train_data(text):
    """Annotate ``text`` with the pipeline's entities plus regex size matches.

    Returns a ``(text, annotations)`` tuple. ``annotations`` is a list of
    ``(start_char, end_char, label)`` triples sorted by start offset; it holds
    every entity found by ``nlp`` and every match of the size regex, the
    latter labelled ``'SIZE'``. Overlapping triples are not removed.
    """
    size_regex = r"(\d+(?:\.|\/|)\d+|\d+) ?(?:\-|x) ?(\d+(?:\.|\/|)\d+|\d+) ?(?:mm|MM|cm|CM|[Ii]nches|in|\"|)|(\d+(?:\.|\'|\/|)\d+ ?(?:\%|cm|CM|mm|MM|[Ii]nches|inch|in|\"))|(\d*(?:X{1,3}[SL])(?:$|\s+))|\d+(?:$|\.|\/|\d+) ?\d+"

    parsed = nlp(text)
    spans = [(found.start_char, found.end_char, found.label_) for found in parsed.ents]
    # Regex hits are appended after the pipeline entities; the stable sort
    # keeps that relative order for spans starting at the same offset.
    spans.extend((hit.start(), hit.end(), 'SIZE') for hit in re.finditer(size_regex, text))
    return (parsed.text, sorted(spans, key=lambda span: span[0]))

In [1]:
# Quick manual check of parse_train_data. The sample gets its own name because
# `e` is the alias of the entity_options module that the displacy.render calls
# rely on; rebinding it to a string would break e.get_entity_options().
size_example = "XXL is the biggest size of my jacket"
parse_train_data(size_example)

NameError: name 'parse_train_data' is not defined

In [59]:
class Training_sample:
    """One annotated text together with a per-label annotation count.

    Attributes are set per instance in ``__init__``; no mutable state is
    shared at class level.
    """

    # text: the raw text of the sample.
    text: str
    # annotations: list of (start_char, end_char, label) triples.
    annotations: list
    # annotations_count: maps each label to its number of occurrences.
    annotations_count: dict

    def __init__(self, text, annotations):
        """Store the sample and count how often each label occurs.

        Args:
            text: The raw text.
            annotations: Iterable of ``(start, end, label)`` triples; may be
                empty.
        """
        self.text = text
        self.annotations = annotations
        self.annotations_count = {}
        for _start, _end, label in annotations:
            self.annotations_count[label] = self.annotations_count.get(label, 0) + 1

    def get_text(self) -> str:
        """Return the raw text."""
        return self.text

    def get_annotations(self) -> List:
        """Return the ``(start, end, label)`` triples."""
        return self.annotations

    def get_annotations_count(self) -> dict:
        """Return the label -> occurrence count mapping."""
        return self.annotations_count

    def get_labels(self) -> set:
        """Return the set of labels that occur at least once."""
        return set(self.annotations_count)

    def get_format(self) -> tuple:
        """Return the sample as a ``(text, annotations)`` tuple.

        Reads the instance attributes; the previous version returned the
        module-level names ``text`` and ``annotations`` and therefore only
        worked when globals with those names happened to hold the same data.
        """
        return (self.text, self.annotations)

In [60]:

should_have_labels = {'COLOR', 'SIZE', 'GENDER', 'PATTERN'}
empty = 0
empty_rows = []
empty_annotation = 0
annotations_count = {}

# training_data_N collects the samples containing exactly N of the labels in
# should_have_labels (N = 1..4); training_data_5 takes every other sample.
training_data_1, training_data_2, training_data_3 = [], [], []
training_data_4, training_data_5 = [], []
buckets_by_label_count = {
    1: training_data_1,
    2: training_data_2,
    3: training_data_3,
    4: training_data_4,
}

for text in df['raw_text']:
    # Rows whose raw_text is not a string (i.e. empty) are ignored.
    if not isinstance(text, str):
        continue
    text, annotations = parse_train_data(text)
    ts = Training_sample(text, annotations)
    intersection_set = ts.get_labels() & should_have_labels
    target_bucket = buckets_by_label_count.get(len(intersection_set), training_data_5)
    target_bucket.append(ts.get_format())
    



In [61]:
# Size of each bucket, from one required label up to the catch-all bucket.
for bucket in (training_data_1, training_data_2, training_data_3,
               training_data_4, training_data_5):
    print(len(bucket))

10868
9645
2586
182
2704


In [62]:
# Hand-picked product description kept as a sample text for manual checks.
example3 = "Boxy fit top with small square sleeves, a classic curved collar, and open front placket. Easy to dress up or down. Details: 55% Linen, 45% Viscose Made In New York City, USA Lola is 57, bust: 34 waist: 26 hips: 35 wearing a size XS About the brand: Caron Callahan is a collection created for women who love beautiful clothes but don’t need to make a fuss about them. The company ethos is rooted in well-made, feminine clothes, while keeping in mind that they should be playful, purposeful and wearable. The label is owner and designer, Caron Callahan, constantly inspired by her friends and neighbours, sets out to make clothes for a creative, spirited woman who loves fashion, but doesn’t necessarily follow it."

In [63]:
def create_training(dataset: List[tuple]) -> DocBin:
    """Convert ``(text, annotations)`` samples into a spaCy ``DocBin``.

    Args:
        dataset: List of ``(text, annotations)`` tuples, where ``annotations``
            is a list of ``(start_char, end_char, label)`` triples.

    Returns:
        A ``DocBin`` holding one ``Doc`` per sample. The number of texts whose
        annotations could not be applied is printed as "unpassed".
    """
    db = DocBin()
    unpassed_text = []
    for text, annotations in dataset:
        doc = nlp(text)
        ents = []
        for start, end, label in annotations:
            # "expand" snaps the character offsets outwards to token boundaries.
            span = doc.char_span(start, end, label=label, alignment_mode="expand")
            if span is not None:
                ents.append(span)
        try:
            doc.ents = ents
        except ValueError:
            # Assigning conflicting (e.g. overlapping) spans raises ValueError.
            # The doc then keeps the entities set by the pipeline itself and is
            # still added below; only ValueError is caught so that unrelated
            # failures are not hidden.
            unpassed_text.append(text)
        db.add(doc)
    print("unpassed: ", len(unpassed_text))
    return db

In [129]:
# For the 2-, 3- and 4-label buckets: 60 % mark, 80 % mark and total size.
for heading, bucket in (
    ("train_len: ", training_data_2),
    ("test_len: ", training_data_3),
    ("evaluate_len: ", training_data_4),
):
    total = len(bucket)
    print(heading, total * 0.6, "valid: ", total * 0.8, "unseen: ", total)


train_len:  5787.0 valid:  7716.0 unseen:  9645
test_len:  1551.6 valid:  2068.8 unseen:  2586
evaluate_len:  109.2 valid:  145.6 unseen:  182


In [65]:
#the data the machine never seen, not (train/valid)
# Tail of each bucket, i.e. the samples after the train/valid slices.
unseen = [
    *training_data_2[7716:],
    *training_data_3[2070:],
    *training_data_4[146:],
]

In [114]:

# Training split: the leading slice of the 2-, 3- and 4-label buckets.
train_samples = training_data_2[0:5787] + training_data_3[0:1552] + training_data_4[0:109]
train_data = create_training(train_samples)
#train_data.to_disk("./train.spacy")

# Validation split: the slice that follows the training slice in each bucket.
valid_samples = training_data_2[5787:7716] + training_data_3[1552:2070] + training_data_4[109:146]
valid_data = create_training(valid_samples)
#valid_data.to_disk("./valid.spacy")

# Evaluation split: the samples held back in `unseen`.
evaluate_data = create_training(unseen)
#evaluate_data.to_disk("./evaluate.spacy")

unpassed:  293
unpassed:  47
unpassed:  10


In [133]:
# Number of docs stored in the evaluation DocBin.
len(evaluate_data)

2481

In [138]:
# Load the trained model and render its predictions on one random unseen text.
statistical = spacy.load("output/model-best")
# random.randrange excludes the upper bound; random.randint(0, len(unseen))
# could return len(unseen) and make unseen[r] raise IndexError.
r = random.randrange(len(unseen))
print("The randon row is : ", r)
doc1 = statistical(unseen[r][0])
displacy.render(doc1, style="ent", options=e.get_entity_options())

In [118]:
# Show one random sample; random.choice never indexes past the end, unlike
# random.randint(0, len(...)), whose upper bound is inclusive.
random.choice(training_data_1)

('Upper: 100% Bovine leather Lining: Leather Inner Sole: Leather Sole: Rubber Fit Comments: Regular: True to Size. Half sizes should size up if regular-wide width or size down if narrow width.',
 [(7, 11, 'SIZE'),
  (19, 26, 'ACCESSORIES'),
  (35, 42, 'ACCESSORIES'),
  (55, 62, 'ACCESSORIES')])

In [125]:
# Show one random sample; random.choice never indexes past the end, unlike
# random.randint(0, len(...)), whose upper bound is inclusive.
random.choice(training_data_2)

('Description Soft, breathable Linen fabric in a tailored shirt style. A soft line on the collar opens the neckline, and buttons down the front allow for easy dressing and versatile styling. The Samantha is great paired with capri pants or layered under a cardigan on cooler days. You can even wear it unbuttoned, for a cool shirt-jacket look. Features - Collared neckline - Buttons down the front - Princess seams from shoulder to hem Blue Sky fit guide - true to size. Fabric - 100% Woven Bamboo',
 [(29, 34, 'COLOR'),
  (56, 61, 'TOPS'),
  (88, 94, 'ACCESSORIES'),
  (223, 228, 'BOTTOMS'),
  (229, 234, 'BOTTOMS'),
  (254, 262, 'TOPS'),
  (323, 328, 'TOPS'),
  (329, 335, 'TOPS'),
  (434, 438, 'COLOR'),
  (478, 482, 'SIZE')])

In [127]:
# Show one random sample; random.choice never indexes past the end, unlike
# random.randint(0, len(...)), whose upper bound is inclusive.
random.choice(training_data_3)

('We are thrilled to introduce Seeking Splendor as part of the 2022 SweetLegs patterned cropped leggings collection. SweetLegs Crops feature the same signature legging fit and style that you love, but with a shorter inseam that’s cropped above the ankle and perfect for warmer weather! Seeking Splendor is a stunning, large-scale floral print in shades of pink, lavender, grey and emerald, all set on a dark grey background. Seeking Splendor looks incredible when paired with a Swing Tank in Lavender and your go-to summer sandals. This print is perfect for easy Saturday’s at the Farmer’s Market or evenings spent in the garden.',
 [(61, 65, 'SIZE'),
  (94, 102, 'BOTTOMS'),
  (158, 165, 'BOTTOMS'),
  (206, 213, 'BOTTOMS'),
  (316, 321, 'SIZE'),
  (322, 327, 'PATTERN'),
  (354, 358, 'COLOR'),
  (360, 368, 'COLOR'),
  (370, 374, 'COLOR'),
  (406, 410, 'COLOR'),
  (482, 486, 'TOPS'),
  (490, 498, 'COLOR'),
  (521, 528, 'SHOES')])

In [121]:
# Show one random sample; random.choice never indexes past the end, unlike
# random.randint(0, len(...)), whose upper bound is inclusive.
random.choice(training_data_4)

('CLUB JERSEY (GENDER NEUTRAL) A SOFT JERSEY WITH 3-STRIPES. A jersey-style t-shirt from adidas Skateboarding with 3-Stripes on the sleeves. It manages moisture to keep you dry, and it is made of recycled polyester doubleknit fabric with a ribbed collar and cuffs for added comfort. A small Trefoil pops on the chest. SPECIFICATIONS Regular fit Crewneck 100% recycled polyester doubleknit Soft feel adidas Skateboarding jersey Cuffed sleeves Moisture-absorbing AEROREADY Doubleknit Imported Product colour: White / Black',
 [(5, 11, 'TOPS'),
  (20, 27, 'GENDER'),
  (36, 42, 'TOPS'),
  (61, 67, 'TOPS'),
  (74, 81, 'TOPS'),
  (130, 137, 'ACCESSORIES'),
  (245, 251, 'ACCESSORIES'),
  (283, 288, 'SIZE'),
  (289, 296, 'PATTERN'),
  (352, 356, 'SIZE'),
  (418, 424, 'TOPS'),
  (432, 439, 'ACCESSORIES'),
  (505, 510, 'COLOR'),
  (513, 518, 'COLOR')])

In [123]:
# Show one random sample; random.choice never indexes past the end, unlike
# random.randint(0, len(...)), whose upper bound is inclusive.
random.choice(training_data_5)

('Hunter rubber buffer. The best way to clean your Hunter Boots.', [])