In [85]:
import spacy
nlp = spacy.load("en_core_web_md")

In [86]:
import common as c

# This cell and the next one only set up one specific product for testing the
# cleaning helpers and the matcher by hand.
raw_tag = '{BAGGU,groupbycolor,"home goods",Textiles}'
raw_title = 'Reusable Cloth Set / Backyard Fruit'
raw_product_type = 'textiles'
# line 31 example
# The description is assembled from adjacent double-quoted literals instead of
# one single-quoted literal with backslash continuations, for two reasons:
#   * inside '...', the inch marks in "20'' H x 20'' W" closed and reopened the
#     literal, so they silently vanished from the text;
#   * the indentation of every continued line became part of the string.
raw_product_description = (
    "You may find yourself in a sticky situation. "
    "You may need to secure that mushroom you came across in the woods. "
    "You may need a tablecloth for a very small table. "
    "You may have a runny nose or cold neck or be having a bad hair day and just want to cover it up. "
    "You may ask yourself why you left home without a square of fabric. "
    "You may have just found the solution — 20 square inches of pure possibility. "
    "Details:Set of threeMeasures 20'' H x 20'' W100% Organic Cotton "
    "About the Brand:Founded in 2007, BAGGU set out to create a reusable bag that was as functional as it was adorable. "
    "Today their goal is to make every bag you need for your every day life. "
    "They have stuck with their mission of creating useful products that are made with you and the planet in mind. "
    "BAGGU is manufactured ethically and environmentally responsibly in China."
)
# pre-process the tag/title/product_type
content_1 = c.clean_tags_text(raw_tag, raw_title, raw_product_type)
# pre-process the product description
content_2 = c.clean_product_description(raw_product_description)


In [171]:
# Build the pattern matcher from the project's `common` module. Other cells
# call it on a spaCy Doc and unpack (id, start, end) triples from the result.
type_matcher = c.create_patterns_matcher()


#### 1. First, we train a model to detect all products belonging to 'TOPS'
(TODO: handle overlapping types in the matcher, e.g. "t-shirt" matches both 'shirt' and 't-shirt')

(Solved) Overlap, duplicates, named entities

In [88]:
def parse_train_data(text, label='TOPS'):
    """Turn one text into a spaCy NER training tuple.

    Every span reported by the global ``type_matcher`` becomes one entity
    annotation carrying ``label``.

    Args:
        text: A raw string, or a ``Doc`` that has already been run through the
            pipeline (for example one yielded by ``nlp.pipe``). Strings are
            processed with the global ``nlp``; a ``Doc`` is used as-is rather
            than being pushed through the pipeline a second time.
        label: Entity label attached to every detection. Defaults to 'TOPS'.

    Returns:
        ``(text, {'entities': [(start_char, end_char, label), ...]})`` where
        the offsets are character positions in the returned text.
    """
    doc = text if isinstance(text, spacy.tokens.Doc) else nlp(text)

    # The matcher may report the same tokens more than once (e.g. 'shirt'
    # inside 't-shirt'); filter_spans removes duplicates and overlaps.
    spans = [doc[start:end] for _, start, end in type_matcher(doc)]
    detections = [
        (span.start_char, span.end_char, label)
        for span in spacy.util.filter_spans(spans)
    ]

    return (doc.text, {'entities': detections})

# Sanity check: the recorded output marks 'camisole', 'T-shirt' and 'jacket'.
# A shorter alternative input is kept here for quick experiments:
#parse_train_data("I like my jacket")
parse_train_data("These camisole and T-shirt and are so good. I did have similar pattern jacket which is so fancy")

('These camisole and T-shirt and are so good. I did have similar pattern jacket which is so fancy',
 {'entities': [(6, 14, 'TOPS'), (19, 26, 'TOPS'), (71, 77, 'TOPS')]})

#### 2. Next step: filter the sample dataset down to the rows with 'product_type_number == 2' to build the training data for the 'TOPS' classifier


In [89]:
import pandas as pd
# Read only the columns this experiment uses.
text_type = pd.read_csv(
    'sample_v1.csv',
    usecols=["title", "tags", "product_type", "body_html", "product_type_number"],
)

# Keep the rows whose product_type_number is '2' and add the bookkeeping columns:
#   label                        -> 1 for every row
#   prediction(mixed_all)        -> matcher result on all text combined
#   prediction_1(title+pt+tags)  -> matcher result on tags + title + product_type
#   prediction_2(body_html)      -> matcher result on the product description
#   based_on_*                   -> matcher result on each single field
# Empty cells are filled with "Not mention" before the based_on_* flags are added.
tops_df = (
    text_type[text_type['product_type_number'] == '2']
    .reset_index()
    .assign(**{
        'label': 1,
        'prediction(mixed_all)': 0,
        'prediction_1(title+pt+tags)': 0,
        'prediction_2(body_html)': 0,
    })
    .fillna("Not mention")
    .assign(based_on_title=0, based_on_tags=0, based_on_product_type=0)
)

In [90]:
# For every row: clean each text field, run the pattern matcher on it and store
# 1/0 in the corresponding flag column depending on whether any pattern fired.
type_matcher = c.create_patterns_matcher()
errorcount = 0  # despite the name: number of rows whose combined text had no match
for i in range(len(tops_df)):
    # Start from empty strings so that a failure while cleaning leaves the
    # remaining fields blank instead of reusing the previous row's text.
    content_2_raw = ''
    content_1_raw = ''
    content_raw = ''
    titles_raw = ''
    tags_raw = ''
    ptype_raw = ''
    try:
        content_1_raw = c.clean_tags_text(tops_df.loc[i, 'tags'], tops_df.loc[i, 'title'], tops_df.loc[i, 'product_type'])

        titles_raw = c.clean_tags_text(None, tops_df.loc[i, 'title'], None)

        tags_raw = c.clean_tags_text(tops_df.loc[i, 'tags'], None, None)

        ptype_raw = c.clean_tags_text(None, None, tops_df.loc[i, 'product_type'])

        content_2_raw = c.clean_product_description(tops_df.loc[i, 'body_html'])

        content_raw = content_1_raw + ', ' + content_2_raw

    except Exception as err:
        # Best effort: report the row and the cause, then carry on with
        # whatever was cleaned so far. Unlike a bare `except:`, this does not
        # swallow KeyboardInterrupt, so the loop can still be interrupted.
        print("line type error ", i, repr(err))

    content = nlp(content_raw)
    content_1 = nlp(content_1_raw)
    content_2 = nlp(content_2_raw)
    titles = nlp(titles_raw)
    tags = nlp(tags_raw)
    ptype = nlp(ptype_raw)

    # Write the flag in both the hit and the miss case, so that re-running the
    # cell (e.g. after changing the patterns) cannot leave a stale 1 behind.
    for column, field_doc in (
        ('based_on_title', titles),
        ('based_on_tags', tags),
        ('based_on_product_type', ptype),
        ('prediction_1(title+pt+tags)', content_1),
        ('prediction_2(body_html)', content_2),
        ('prediction(mixed_all)', content),
    ):
        tops_df.loc[i, column] = int(len(type_matcher(field_doc)) > 0)

    # Show the cleaned texts of every row that no pattern matched at all.
    if tops_df.loc[i, 'prediction(mixed_all)'] == 0:
        errorcount += 1
        print(errorcount, content_1_raw)
        print(errorcount, content_2_raw)
        


#### 3. After running type_matcher on every column, we update the 'prediction' columns.


In [91]:
# The 'index' column was created by reset_index(); give it a descriptive name
# and write the per-row results to disk.
tops_df.rename({"index": "index_in_original_sample"}, axis="columns", inplace=True)
tops_df.to_csv(path_or_buf='TOPS_result.csv')

In [92]:
# Rows that the combined matcher did not flag.
tops_df[tops_df['prediction(mixed_all)'].eq(0)]

Unnamed: 0,index_in_original_sample,title,product_type,tags,product_type_number,body_html,label,prediction(mixed_all),prediction_1(title+pt+tags),prediction_2(body_html),based_on_title,based_on_tags,based_on_product_type


In [93]:

# Report, for each matcher variant, the share of rows it flagged with 1.
rows_count = tops_df[tops_df.columns[0]].count()


def _share_of_hits(column):
    """Return the fraction of rows whose value in `column` equals 1."""
    return tops_df.loc[tops_df[column] == 1].shape[0] / rows_count


all_mixed_prediction_correctness = _share_of_hits('prediction(mixed_all)')
based_on_title_correctness = _share_of_hits('based_on_title')
based_on_tags_correctness = _share_of_hits('based_on_tags')
based_on_product_type_correctness = _share_of_hits('based_on_product_type')
based_on_TagsTitleTags_correctness = _share_of_hits('prediction_1(title+pt+tags)')
based_on_body_html_correctness = _share_of_hits('prediction_2(body_html)')

# Combined variants first, then each single field.
print("all_mixed_prediction_correctness : ", all_mixed_prediction_correctness)
print("based_on_TagsTitleProductType_correctness", based_on_TagsTitleTags_correctness)

print("based_on_title_correctness : ", based_on_title_correctness)
print("based_on_tags_correctness : ", based_on_tags_correctness)
print("based_on_product_type_correctness : ", based_on_product_type_correctness)
print("based_on_body_html_correctness : ", based_on_body_html_correctness)


all_mixed_prediction_correctness :  1.0
based_on_TagsTitleProductType_correctness 1.0
based_on_title_correctness :  0.8048780487804879
based_on_tags_correctness :  0.5487804878048781
based_on_product_type_correctness :  0.6463414634146342
based_on_body_html_correctness :  0.6097560975609756


#### 4. Build TRAIN_DATA for 'tops'

In [94]:
# First training set: one text per product, built by cleaning and combining
# its tags, title and product_type.
# .copy() yields an independent frame, so adding a column below neither touches
# tops_df nor runs into pandas' SettingWithCopyWarning.
title_tags_type_df = tops_df[['title', 'tags', 'product_type']].copy()

combined_texts = []
for i, row in enumerate(title_tags_type_df.itertuples(index=False)):
    try:
        combined_texts.append(c.clean_tags_text(row.tags, row.title, row.product_type))
    except Exception as err:
        # Best effort: report the failing row with its cause and keep an empty
        # text for it, without swallowing KeyboardInterrupt like a bare except.
        print("something wrong in line# : ", i, repr(err))
        combined_texts.append('')

# One assignment instead of a cell-by-cell .loc write per row.
title_tags_type_df['raw_combined_text'] = combined_texts
    



In [95]:
# Preview the combined, cleaned text of each product.
title_tags_type_df['raw_combined_text']

0     nu psychedelic, word is world sports bra, spor...
1     free people tanks tops womens, scoop me up rac...
2     mens national standard s/s t tops white, tribl...
3     nu psychedelic, abstract waves black string bi...
4     ready to ship, bees sweatshirt (clearance), re...
                            ...                        
77    basic carryover new regular sleeveless ss2021 ...
78    20 bamboo pink sale ss2020 sunny top, peace to...
79    _tab1_daydreamer-sizing _tab2_atb-daydreamer _...
80    20 3/4 sleeve blouse carryover long sleeve ray...
81    cotopaxi outerwear womens, cotopaxi women's te...
Name: raw_combined_text, Length: 82, dtype: object

In [96]:
# Build the NER training set from the combined title + tags + product_type text.
TRAIN_DATA = list(map(parse_train_data, nlp.pipe(title_tags_type_df['raw_combined_text'])))



In [97]:
# Inspect a few training examples.
TRAIN_DATA[5:8]

[('animals nu psychedelic, acid gorilla hoodie, premium hoodie',
  {'entities': [(37, 43, 'TOPS'), (53, 59, 'TOPS')]}),
 ('groupbycolor stateside tops, lounge l/s boat neck " lightweight rib" / black, tops',
  {'entities': [(23, 27, 'TOPS'), (78, 82, 'TOPS')]}),
 ('_tab1_free-people-sizing _tab2_atb-free-people _tab3_care-free-people blouse clothing free-people tops, check on it wrap top, tops',
  {'entities': [(70, 76, 'TOPS'),
    (77, 85, 'TOPS'),
    (98, 102, 'TOPS'),
    (121, 124, 'TOPS'),
    (126, 130, 'TOPS')]})]

TRAINING LOOP

In [98]:
def creat_blank_nlp(train_data):
    """Create a blank English pipeline whose only component is an NER model
    that knows every entity label occurring in ``train_data``.

    Args:
        train_data: Iterable of ``(text, annotations)`` pairs. ``annotations``
            is a dict that may hold an ``'entities'`` list of
            ``(start_char, end_char, label)`` tuples; items without
            entities are skipped.

    Returns:
        The newly created pipeline.
    """
    nlp = spacy.blank("en")
    # add_pipe returns the component it creates, so no separate create_pipe /
    # get_pipe round trip is needed.
    ner = nlp.add_pipe("ner", last=True)

    # Collect each distinct label once, in first-seen order.
    labels = []
    for _, annotations in train_data:
        for ent in annotations.get("entities") or ():
            # ent is (start_char, end_char, label)
            if ent[2] not in labels:
                labels.append(ent[2])

    for label in labels:
        try:
            ner.add_label(label)
        except Exception as err:
            # Best effort: report the label that could not be registered, and why.
            print(label, repr(err))
    return nlp



In [161]:
import random
import datetime as dt
from spacy.training import Example
# Seed Python's and NumPy's random generators so that weight initialisation
# and the per-iteration shuffling are repeatable between runs.
spacy.util.fix_random_seed(0)

# NOTE: from here on the global `nlp` is the blank NER pipeline, not the model
# loaded at the top of the notebook. Cells that need the pretrained pipeline
# (parse_train_data, the matcher loop) must be run before this one.
nlp = creat_blank_nlp(TRAIN_DATA)
# initialize() is the spaCy v3 name for the deprecated begin_training(); it
# returns the optimizer as well.
optimizer = nlp.initialize()
for i in range(20):  # 20 passes over the training data
    random.shuffle(TRAIN_DATA)
    losses = {}
    for text, annotation in TRAIN_DATA:
        try:
            example = Example.from_dict(nlp.make_doc(text), annotation)
            nlp.update([example], sgd=optimizer, losses=losses)
        except Exception as err:
            # Skip examples that cannot be used, but say why; a bare `except:`
            # would also swallow KeyboardInterrupt and make the loop unstoppable.
            print("Error happens on : ", text, annotation, repr(err))
    print(f"Losses at iteration {i} - {dt.datetime.now()}", losses)
        

Losses at iteration 0 - 2022-05-23 21:00:15.829118 {'ner': 314.43091249386043}
Losses at iteration 1 - 2022-05-23 21:00:16.816551 {'ner': 41.87719376894409}
Losses at iteration 2 - 2022-05-23 21:00:17.797378 {'ner': 62.68215404425492}
Losses at iteration 3 - 2022-05-23 21:00:18.776701 {'ner': 46.93266654468059}
Losses at iteration 4 - 2022-05-23 21:00:19.697314 {'ner': 12.1619733861654}
Losses at iteration 5 - 2022-05-23 21:00:20.670280 {'ner': 26.378976128524783}
Losses at iteration 6 - 2022-05-23 21:00:21.634569 {'ner': 7.085080688516168}
Losses at iteration 7 - 2022-05-23 21:00:22.585445 {'ner': 9.90187884219191}
Losses at iteration 8 - 2022-05-23 21:00:23.565554 {'ner': 1.2482176453512215}
Losses at iteration 9 - 2022-05-23 21:00:24.522437 {'ner': 2.064205024468288}
Losses at iteration 10 - 2022-05-23 21:00:25.469443 {'ner': 3.6830311851043436}
Losses at iteration 11 - 2022-05-23 21:00:26.381656 {'ner': 5.751324400664479}
Losses at iteration 12 - 2022-05-23 21:00:27.303555 {'ner': 

In [189]:

from spacy import displacy

# Hand-written sentences that mention several kinds of tops, rendered with the
# entities predicted by the current `nlp`.
# (original note: description is not accurate)
s = (
    "I wear a fancy T-SHirt and I got another button-down wonderful crop tee . "
    "Long shirt and coat are necessary for keeping warm in winter. "
    "Sweater and blouse are important for people living in the north. "
    "UA students have their own hoodies. "
    "The Tank Top is new stylish top-clothes. "
    "What about trying our new camisole which is fantastic? "
)
doc2 = nlp(s)
displacy.render(doc2, style="ent")



In [173]:
# Longer sample text rendered with the entities predicted by the current `nlp`.
# Adjacent literals are used instead of backslash continuations inside a single
# literal, so that the source indentation no longer leaks into the text as runs
# of spaces between the sentences.
s = (
    "There are certain pieces that will always bring a boho style aesthetic to mind, and this boxy top is one of them. "
    "It's crafted from an open floral crochet with a scalloped hem and short sleeves. "
    "We're showcasing the circle crochet trim along the round neckline with a turquoise necklace to really knock it out of the park. "
)
doc3 = nlp(s)
displacy.render(doc3, style='ent')

In [188]:
# The pipeline was created blank and only "ner" was added to it, so that is
# the only component listed here.
nlp.pipeline

[('ner', <spacy.pipeline.ner.EntityRecognizer at 0x28a832e40>)]

#### 5. Next step: I will try to add more pipeline components, like tokenizer, tagger, parser, ner...
label 
        ---> model ----> rules
data   