In [1]:
import spacy
# Load spaCy's small English pipeline. `nlp` parses text in the cells below and
# supplies the vocab that the rule-based TOPS matcher is built on.
nlp = spacy.load("en_core_web_sm")

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
import common as c

# Hand-picked example used to try out the cleaning helpers in `common`.
raw_tag = "{what :?}"
raw_title = 'Gemi Top, Red Plaid, Bamboo'
raw_product_type = 'top'
# line 31 example
# The description contains inch marks written as two apostrophes (20''), so the
# literal must be double-quoted: inside a single-quoted literal each '' closes and
# reopens the string, and the inch marks silently vanish from the value.
# The backslash continuations (and the indentation that follows them) are part of
# the string, exactly as before.
raw_product_description = "You may find yourself in a sticky situation. \
    You may need to secure that mushroom you came across in the woods. \
    You may need a tablecloth for a very small table. \
    You may have a runny nose or cold neck or be having a bad hair day and just want to cover it up. \
    You may ask yourself why you left home without a square of fabric. \
    You may have just found the solution — 20 square inches of pure possibility.\
    Details:Set of threeMeasures 20'' H x 20'' W100% Organic Cotton\
    About the Brand:Founded in 2007, BAGGU set out to create a reusable bag that was as functional as it was adorable. \
    Today their goal is to make every bag you need for your every day life. \
    They have stuck with their mission of creating useful products that are made with you and the planet in mind. \
    BAGGU is manufactured ethically and environmentally responsibly in China."
# Pre-process the title / product_type / tags triple.
content_1 = c.clean_tags_text(raw_title, raw_product_type, raw_tag)
# Pre-process the product description.
content_2 = c.clean_product_description(raw_product_description)


In [3]:
# Smoke-test clean_tags_text on one (title, product_type, tags) triple and print
# the cleaned, combined text.
content_test = c.clean_tags_text("Pearl Top, Secret Garden, Bamboo", "top", "{20,Bamboo,Navy,sale,Short Sleeve,Sunny,Top,White}")
print(content_test)


pearl top  secret garden  bamboo, top, 20 bamboo navy sale short sleeve sunny top white


Create a rule-based matcher for the TOPS product type --> `tops_matcher`

In [4]:
from spacy.matcher import Matcher

# Matcher returned by the project helper; in this notebook it is only referenced
# from commented-out code.
type_matcher = c.create_patterns_matcher()

# Rule-based matcher that only detects "TOPS": the token patterns come from the
# project helper and are registered under the single key "TOPS_TYPE".
tops_patterns = c.create_tops_patterns()
tops_matcher = Matcher(nlp.vocab)
tops_matcher.add("TOPS_TYPE", tops_patterns)


In [5]:
# Run the TOPS matcher on one hand-written example. The cell output is a list of
# (match_id, start_token, end_token) tuples.
doc = nlp("womens tops. Jackson Rowe Cholla Henley, womens top, 22Mar22, cholla, dollar, henley, jackson, new arrival, rowe, sand, top, womens")
tops_matcher(doc)


[(7479416363363656451, 1, 2)]

In [6]:
from spacy import displacy
# Visualise the example `doc`; no `style` is passed, so displaCy's default
# visualiser is used.
displacy.render(doc)

#### 1. First, we train a model to detect all products belonging to 'TOPS'
(TODO: handle overlapping types in the matcher, e.g. "t-shirt" matches both 'shirt' and 't-shirt')

(Solved) Overlap, duplicates, named entities

In [7]:
def parse_train_data(text):
    """Turn one text into a spaCy NER training tuple labelled with 'TOPS' spans.

    The text is parsed with the global `nlp`, every hit of the global
    `tops_matcher` becomes a candidate span, and `spacy.util.filter_spans`
    removes duplicate and overlapping candidates.

    Returns:
        (text, {'entities': [(start_char, end_char, 'TOPS'), ...]})
    """
    parsed = nlp(text)

    # The general type_matcher is deliberately not used for now; only TOPS matter here.
    candidates = []
    for _match_id, token_start, token_end in tops_matcher(parsed):
        candidates.append(parsed[token_start:token_end])

    # Entity offsets are character positions, not token indices.
    entities = []
    for kept in spacy.util.filter_spans(candidates):
        entities.append((kept.start_char, kept.end_char, 'TOPS'))

    return (parsed.text, {'entities': entities})

#parse_train_data("top") #testing, which should show the entities location
#parse_train_data("These camisole and T-shirt and are so good. I did have similar pattern jacket which is so fancy. They all belong to tops. Gemi top, The top blue top is cloak")

#### 2. Next step: filter the sample dataset down to the rows with 'product_type_number == 2' to build the training data for the 'TOPS' classifier


In [8]:
import pandas as pd

In [9]:
# Load the tops-only dataset. The head() output further down shows an
# "Unnamed: 0" column, so the CSV presumably carries a saved index column.
tops_df = pd.read_csv('tops.csv')

For the rule-based matcher, we can use the tops_total.csv data directly to see how well the matcher works.

In [10]:

# Earlier approach, kept for reference: slice the tops rows out of the full sample.
#text_type = pd.read_csv('sample_v1.csv', usecols=["title", "tags", "product_type", "body_html", "product_type_number"])
#tops_df = text_type.loc[text_type['product_type_number'] == '2'].reset_index()

# Fill all empty cells (rebinding instead of inplace=True keeps the step explicit).
tops_df = tops_df.fillna("Not mention")

# Bookkeeping columns and their initial values, in the order they are appended.
# Plain column assignment replaces insert(..., allow_duplicates=True): the first
# run appends the columns at the end exactly as before, but re-running the cell
# resets them instead of silently adding duplicate column names.
FLAG_COLUMN_DEFAULTS = {
    # every row of this dataset is a top, so the ground-truth label is always 1
    'label': 1,
    # prediction(mixed_all) is based on prediction_1 + prediction_2 (all words)
    'prediction(mixed_all)': 0,
    # prediction_1 is based on title + product_type + tags
    'prediction_1(title+pt+tags)': 0,
    # prediction_2 is based on the product description (body_html)
    'prediction_2(body_html)': 0,
    # per-source flags
    'based_on_title': 0,
    'based_on_tags': 0,
    'based_on_product_type': 0,
}
for column_name, default_value in FLAG_COLUMN_DEFAULTS.items():
    tops_df[column_name] = default_value

In [11]:
# Preview the frame with the label / prediction / based_on_* columns added above.
tops_df.head()

Unnamed: 0,title,product_type,tags,body_html,label,prediction(mixed_all),prediction_1(title+pt+tags),prediction_2(body_html),based_on_title,based_on_tags,based_on_product_type
0,"Mia Top, Ice Pinecones, Bamboo",top,"{""3/4 Sleeve"",333,50%,Bamboo,fw2020,fw2020repo...",DescriptionFlattering whether worn loose or bo...,1,0,0,0,0,0,0
1,Ladybug Long Sleeve,tops,"{_tab1_free-people-sizing,_tab2_atb-free-peopl...",Sweet corset-inspired long sleeve top featured...,1,0,0,0,0,0,0
2,"Pearl Top, Secret Garden, Bamboo",top,"{20,Bamboo,Navy,sale,""Short Sleeve"",Sunny,Top,...",Cross over top for all shapes. The dolman slee...,1,0,0,0,0,0,0
3,CHROMA SLEEVELESS HOODIE,premium sleeveless hoodie,"{Nu,Psychedelic,""Sacred Geometry""}","Vibrant all over front, back &amp; hood design...",1,0,0,0,0,0,0
4,Rib Hacci Vagabond Tank,tops,"{_tab1_zsupply-sizing,_tab2_atb-zsupply,_tab3_...",We updated our popular Vagabond tank! The Rib ...,1,0,0,0,0,0,0


In [12]:
# For every row, clean each text source (title, tags, product_type, their
# combination, body_html, and everything together), run the TOPS matcher on it,
# and set the corresponding flag column to 1 when at least one pattern matches.

# errorcount counts the rows where nothing matched in the fully combined text.
errorcount = 0
for i in range(len(tops_df)):
    # Reset the per-row texts so a failure part-way through cleaning leaves ''
    # for everything that was not cleaned yet.
    content_2_raw = ''
    content_1_raw = ''
    content_raw = ''
    titles_raw = ''
    tags_raw = ''
    ptype_raw = ''
    try:
        content_1_raw = c.clean_tags_text(tops_df.loc[i, 'title'], tops_df.loc[i, 'product_type'], tops_df.loc[i, 'tags'])
        titles_raw = c.clean_tags_text(tops_df.loc[i, 'title'], None, None)
        tags_raw = c.clean_tags_text(None, None, tops_df.loc[i, 'tags'])
        ptype_raw = c.clean_tags_text(None, tops_df.loc[i, 'product_type'], None)
        content_2_raw = c.clean_product_description(tops_df.loc[i, 'body_html'])
        content_raw = content_1_raw + ', ' + content_2_raw
    except Exception as err:
        # Best effort: report the row and continue with whatever was cleaned so far.
        # `Exception` instead of a bare except lets Ctrl-C still stop the loop, and
        # the cause is printed so the failure is no longer anonymous.
        print("line type error " + str(i) + ' '  + content_1_raw + '\n' + titles_raw + '\n' + tags_raw + '\n' + ptype_raw + '\n')
        print("caused by: " + repr(err))

    content = nlp(content_raw)
    content_1 = nlp(content_1_raw)
    content_2 = nlp(content_2_raw)
    titles = nlp(titles_raw)
    tags = nlp(tags_raw)
    ptype = nlp(ptype_raw)

    # (flag column, parsed text the flag is derived from)
    flag_sources = [
        ('based_on_title', titles),
        ('based_on_tags', tags),
        ('based_on_product_type', ptype),
        ('prediction_1(title+pt+tags)', content_1),
        ('prediction_2(body_html)', content_2),
    ]
    for flag_column, parsed in flag_sources:
        if len(tops_matcher(parsed)) > 0:
            tops_df.loc[i, flag_column] = 1

    if len(tops_matcher(content)) > 0:
        tops_df.loc[i, 'prediction(mixed_all)'] = 1
    else:
        # No TOPS pattern anywhere in the row: print both texts for inspection.
        errorcount += 1
        print(errorcount, content_1_raw)
        print(errorcount, content_2_raw)
        


1 gemi top  red plaid  bamboo, top, 
1 flattering notched detail at neckline. flat pleat cb for easy clearance across the back. in addition, a slightly wider back panel controls fabric fall, for a trimly finished front and generous finish at back. touch of a-line, small vents. falls to low hip.blue sky fit guide - true to size; great fit for all shapes, but exceptionally easy to wear for women who are wide in the back and/or full in the bust. (4c5)fabric - 95% bamboo, 5% lycra


#### 3. After running tops_matcher on every text column, we update the corresponding 'prediction' columns.


In [13]:
#tops_df.rename(columns={"index": "index_in_original_sample"}, inplace=True)
# Persist the per-row matcher flags so they can be inspected outside the notebook.
# NOTE(review): the file name looks like a typo of "matcher_result.csv"; it is left
# unchanged here in case other code reads this exact path.
tops_df.to_csv('matcher_reault.csv')

In [14]:
# Rows where no TOPS pattern was found in the fully combined text. Every row is
# labelled 1, so these are the matcher's false negatives.
tops_df.loc[tops_df['prediction(mixed_all)'] == 0]

Unnamed: 0,title,product_type,tags,body_html,label,prediction(mixed_all),prediction_1(title+pt+tags),prediction_2(body_html),based_on_title,based_on_tags,based_on_product_type
56,"Gemi Top, Red Plaid, Bamboo",top,{},Flattering notched detail at neckline. Flat pl...,1,0,1,0,0,0,1


In [15]:

# Report, for each text source, the share of rows in which the TOPS matcher fired.
# Every row is labelled as a top, so the share of flagged rows is the hit rate.
rows_count = tops_df[tops_df.columns[0]].count()


def _flagged_share(flag_column):
    """Return the fraction of rows whose `flag_column` equals 1."""
    flagged_rows = tops_df.loc[tops_df[flag_column] == 1]
    return flagged_rows.shape[0] / rows_count


all_mixed_prediction_correctness = _flagged_share('prediction(mixed_all)')
based_on_title_correctness = _flagged_share('based_on_title')
based_on_tags_correctness = _flagged_share('based_on_tags')
based_on_product_type_correctness = _flagged_share('based_on_product_type')
based_on_TagsTitleTags_correctness = _flagged_share('prediction_1(title+pt+tags)')
based_on_body_html_correctness = _flagged_share('prediction_2(body_html)')

# Combined sources first, then each individual source.
for caption, share in (
    ("all_mixed_prediction_correctness : ", all_mixed_prediction_correctness),
    ("based_on_TagsTitleProductType_correctness", based_on_TagsTitleTags_correctness),
    ("based_on_title_correctness : ", based_on_title_correctness),
    ("based_on_tags_correctness : ", based_on_tags_correctness),
    ("based_on_product_type_correctness : ", based_on_product_type_correctness),
    ("based_on_body_html_correctness : ", based_on_body_html_correctness),
):
    print(caption, share)


all_mixed_prediction_correctness :  0.995475113122172
based_on_TagsTitleProductType_correctness 0.995475113122172
based_on_title_correctness :  0.8099547511312217
based_on_tags_correctness :  0.6380090497737556
based_on_product_type_correctness :  0.8959276018099548
based_on_body_html_correctness :  0.5701357466063348


#### 4. Build TRAIN_DATA for 'tops'

In [33]:
# The first training set combines title + product_type + tags into one cleaned
# string per row.

# .copy() makes this an independent frame: without it the writes below go to a
# slice of tops_df, which pandas may warn about (SettingWithCopyWarning) and is
# not guaranteed to apply.
title_tags_type_df = tops_df[['title', 'product_type', 'tags']].copy()
title_tags_type_df['raw_combined_text'] = ''
for i in range(len(title_tags_type_df)):
    try:
        raw_combined_text = c.clean_tags_text(
            title_tags_type_df.loc[i, 'title'],
            title_tags_type_df.loc[i, 'product_type'],
            title_tags_type_df.loc[i, 'tags'],
        )
        title_tags_type_df.loc[i, 'raw_combined_text'] = raw_combined_text
    except Exception as err:
        # Best effort: a row that cannot be cleaned keeps '' as its combined text.
        # `Exception` (not a bare except) so Ctrl-C still works; the cause is shown.
        print("something wrong in line# : ", i, repr(err))
    



In [17]:
# Preview the cleaned "title, product_type, tags" text that becomes the training input.
title_tags_type_df.loc[:,'raw_combined_text']

0      mia top  ice pinecones  bamboo, top, 3/4 sleev...
1      ladybug long sleeve, tops, _tab1_free-people-s...
2      pearl top  secret garden  bamboo, top, 20 bamb...
3      chroma sleeveless hoodie, premium sleeveless h...
4      rib hacci vagabond tank, tops, _tab1_zsupply-s...
                             ...                        
216    charisse tank  white  bamboo, tank, basic carr...
217    peace top  lagoon  bamboo, top, 20 bamboo pink...
218    the clash sold out! merch tee, tops, _tab1_day...
219    tara blouse  fresh sprig, blouse, 20 3/4 sleev...
220    cotopaxi women's teca half-zip windbreaker pos...
Name: raw_combined_text, Length: 221, dtype: object

In [18]:
# Build the NER training set from the combined title + product_type + tags text.
# parse_train_data() runs `nlp` on its argument itself, so the raw strings are
# passed directly; pre-parsing them with nlp.pipe() only made the whole pipeline
# run twice on every text.
TRAIN_DATA = [parse_train_data(text) for text in title_tags_type_df.loc[:, 'raw_combined_text']]



In [19]:
# Inspect a few training examples: (text, {'entities': [(start_char, end_char, label), ...]}).
TRAIN_DATA [5:8]

[('big trippin hoodie, premium hoodie, ',
  {'entities': [(12, 18, 'TOPS'), (28, 34, 'TOPS')]}),
 ('tessa tee  white/black small stripe  bamboo, top, 1019 bamboo basic basic2022 regular stripe sunny top',
  {'entities': [(6, 9, 'TOPS'), (99, 102, 'TOPS')]}),
 ('michaelangelo portal t, premium tee, galaxy nu psychedelic space',
  {'entities': [(21, 22, 'TOPS'), (32, 35, 'TOPS')]})]

TRAINING LOOP

In [20]:
def creat_blank_nlp(train_data):
    """Create a blank English pipeline whose NER component knows every training label.

    Args:
        train_data: iterable of (text, annotations) pairs, where `annotations` is a
            dict whose "entities" entry is a list of (start_char, end_char, label)
            tuples. A missing or empty "entities" entry contributes no labels.

    Returns:
        The new spaCy pipeline; "ner" is its only component.
    """
    # Local name chosen so it does not shadow the notebook-level `nlp`.
    blank_nlp = spacy.blank("en")
    # add_pipe() creates the component and returns it, so the separate
    # create_pipe() / get_pipe() calls are not needed.
    ner = blank_nlp.add_pipe("ner", last=True)
    for _, annotations in train_data:
        for ent in annotations.get("entities") or []:
            try:
                # ent is (start_char, end_char, label); only the label is registered.
                ner.add_label(ent[2])
            except Exception as err:
                # Best effort: report the offending annotation and keep going.
                print(ent, repr(err))
    return blank_nlp



In [26]:
import random
import datetime as dt
from spacy.training import Example

N_ITERATIONS = 50  # number of passes over TRAIN_DATA
SEED = 0

# Fix the random seeds so the shuffling and weight initialisation are repeatable
# across "Restart & Run All".
spacy.util.fix_random_seed(SEED)

# NOTE: this rebinds the notebook-level `nlp` from the pretrained pipeline to the
# blank NER-only model. Cells above that rely on the pretrained pipeline
# (e.g. parse_train_data) need the first cell re-run before they are run again.
nlp = creat_blank_nlp(TRAIN_DATA)
# initialize() is the spaCy v3 replacement for the deprecated begin_training();
# it returns the optimizer.
optimizer = nlp.initialize()
for i in range(N_ITERATIONS):
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotation in TRAIN_DATA:
        try:
            example = Example.from_dict(nlp.make_doc(text), annotation)
            nlp.update([example], sgd=optimizer, losses=losses)
        except Exception as err:
            # Best effort: skip examples spaCy rejects, but show why.
            # `Exception` (not a bare except) so Ctrl-C can stop the training.
            print("Error happens on : ", text, annotation, repr(err))
    print(f"Losses at iteration {i} - {dt.datetime.now()}", losses)
        

Losses at iteration 0 - 2022-05-31 11:31:18.665522 {'ner': 371.52938328947914}
Losses at iteration 1 - 2022-05-31 11:31:21.219976 {'ner': 82.7123791451673}
Losses at iteration 2 - 2022-05-31 11:31:23.726439 {'ner': 63.99003619251098}
Losses at iteration 3 - 2022-05-31 11:31:26.272012 {'ner': 44.789298574258325}
Losses at iteration 4 - 2022-05-31 11:31:28.761222 {'ner': 27.487731222996736}
Losses at iteration 5 - 2022-05-31 11:31:31.225592 {'ner': 31.95491197605409}
Losses at iteration 6 - 2022-05-31 11:31:33.738162 {'ner': 16.056736924777248}
Losses at iteration 7 - 2022-05-31 11:31:36.272449 {'ner': 16.825693582887567}
Losses at iteration 8 - 2022-05-31 11:31:38.883422 {'ner': 12.725863313068448}
Losses at iteration 9 - 2022-05-31 11:31:41.300773 {'ner': 19.08417900768569}
Losses at iteration 10 - 2022-05-31 11:31:43.762317 {'ner': 27.079436022965652}
Losses at iteration 11 - 2022-05-31 11:31:46.255669 {'ner': 26.636459189015433}
Losses at iteration 12 - 2022-05-31 11:31:48.731639 {'n

In [27]:

from spacy import displacy

# Try the trained NER model on a hand-written paragraph that mentions many kinds of tops.
# (Original author's note: the description is not accurate.)
# `nlp` is the NER-only pipeline trained above, so only its entities are rendered.
s = "I wear a fancy T-SHirt and I got another button-down wonderful crop tee . \
    Long shirt and coat are necessary for keeping warm in winter. \
    Sweater and blouse are important for people living in the north. \
    UA students have their own hoodies. The Tank Top is new stylish top-clothes. \
    What about trying our new camisole which is fantastic? "    
doc2 = nlp(s)
displacy.render(doc2, style='ent')



In [28]:
# Second check: a product-description style paragraph about a "boxy top".
s = "There are certain pieces that will always bring a boho style aesthetic to mind, and this boxy top is one of them. \
    It's crafted from an open floral crochet with a scalloped hem and short sleeves. \
    We're showcasing the circle crochet trim along the round neckline with a turquoise necklace to really knock it out of the park. "
doc3 = nlp(s)
displacy.render(doc3, style='ent')

In [29]:
# Third check: a long hoodie description, including run-together words such as
# "thumbholesWelt" and "pocketHoodedActive".
s4 = "The Tempo Hoodie is the UPF 50+ activewear you've been looking for! \
    It has thumbholes, a kangaroo pocket, and a hood for when the sun is too hot or you forgot your hat. \
    Our Fitness Hoodie is made out of our Active Athlon fabric with the added bonus of our Cooltect™ technology. \
    You can be active in this fitted Fitness Hoodie without getting uncomfortably hot. \
    So go ahead and enjoy sun-safe biking, walking, running and so much more!Highlights:UPF 50+Raglan long sleeves with thumbholesWelt \
    kangaroo pocketHoodedActive Athlon™ fabric: Lightweight and breathable with moisture wicking for quick dry performanceCooltect™ technology \
    accelerates moisture wicking to keep you cooler and more comfortable"
doc4 = nlp(s4)
displacy.render(doc4, style='ent')

In [36]:
# Fourth check: a dress description. Presumably a negative example, where ideally
# nothing is tagged as TOPS (TODO confirm intent).
s5 = "Style Deals - When temps start to rise, slip into this supremely cute babydoll dress outfitted with a vibrant floral print! Its warm-weather details include short sleeves, round neckline, and a slightly shorter length. Team this with your sandals, straw hat, and of course, your oversized sunnies. "
doc5 = nlp(s5)
displacy.render(doc5, style="ent")

In [30]:
nlp.pipeline  # the model was built from spacy.blank("en"), so "ner" is its only component

[('ner', <spacy.pipeline.ner.EntityRecognizer at 0x1682285f0>)]

In [34]:
# Save the trained NER pipeline to the "ML_based_model" directory.
nlp.to_disk("ML_based_model")

#### 5. Next step: I will try adding more pipeline components, such as a tagger, parser, and entity ruler, alongside the NER.
label 
        ---> model ----> rules
        
data   

Entity Ruler