In [2]:
import spacy
from spacy import displacy
# Shared pretrained English pipeline used by the matcher cells below.
# NOTE: the training cell further down rebinds `nlp` to a blank NER pipeline,
# so cells that rely on this model must run before that one.
nlp = spacy.load("en_core_web_sm")

In [3]:
import common as c

# One cleaned sample text that every pipeline below is asked to annotate.
content_test = c.clean_tags_text("PASTEL FUNDAMENTAL T", None, '{Nu,Psychedelic}')
print(content_test)

# "ML_based_model" is the directory written by nlp.to_disk(...) at the end of
# this notebook, so it must already exist from an earlier run.
# "rule_model_TOPS" is not created anywhere in this notebook.
nlp_stat = spacy.load("ML_based_model")
nlp_rule = spacy.load("rule_model_TOPS")

# Print the entities found by the trained model, the rule model and the stock
# English model, in that order.
for pipeline in (nlp_stat, nlp_rule, nlp):
    print(pipeline(content_test).ents)

displacy.render(nlp(content_test))

pastel fundamental t; nu psychedelic
(t,)
(t,)
()


Create TOPS type rule based matcher --> tops_matcher

In [None]:
#type_matcher = c.create_patterns_matcher()

from spacy.matcher import Matcher

# Rule-based matcher that holds a single rule, "TOPS_TYPE".
# validate=True asks spaCy to validate the patterns when they are added.
tops_matcher = Matcher(nlp.vocab, validate=True)

# Token patterns come from the project helper module `common`.
tops_patterns = c.create_tops_patterns()
tops_matcher.add("TOPS_TYPE", tops_patterns)


In [None]:
#doc = nlp("the tempo hoodie tops is the UPF 50+ activewear you've been looking for! It has thumbholes, a kangaroo pocket, and a hood for when the sun is too hot or you forgot your hat. Our Fitness hoodie tops is made out of our Active Athlon fabric with the added bonus of our Cooltect™ technology. You can be active in this fitted Fitness Hoodie TOPS without getting uncomfortably hot. So go ahead and enjoy sun-safe biking, walking, running and so much more!Highlights:UPF 50+Raglan long sleeves with thumbholesWelt kangaroo pocketHoodedActive Athlon™ fabric: Lightweight and breathable with moisture wicking for quick dry performanceCooltect™ technology accelerates moisture wicking to keep you cooler and more comfortable")
doc = nlp(content_test)
# Run the matcher once and print the lemma of every matched span.
# (The matcher used to be called a second time with its result thrown away.)
for match_id, start, end in tops_matcher(doc):
    print(doc[start:end].lemma_)

In [None]:
from spacy import displacy
# Visualise `doc` with displaCy; no style is passed, so displaCy's default is used.
displacy.render(doc)

#### 1. First, we train a model to detect all products belonging to 'TOPS'
(TODO: handle overlapping matches in the matcher, e.g. "t-shirt" matches both 'shirt' and 't-shirt')

(Solved) Overlap, duplicates, named entities

In [None]:
def parse_train_data(text):
    """Build one NER training tuple whose entities come from `tops_matcher`.

    Args:
        text: Either a raw string or a spaCy Doc. A string is processed with
            the global `nlp`; a Doc (for example an item yielded by
            `nlp.pipe`) is used as-is instead of being run through the
            pipeline a second time. Assumes such a Doc was created with the
            same vocab that `tops_matcher` was built on.

    Returns:
        tuple: `(doc.text, {'entities': [(start_char, end_char, 'TOPS'), ...]})`.
    """
    from spacy.tokens import Doc  # local import keeps this cell self-contained

    doc = text if isinstance(text, Doc) else nlp(text)
    #ignore for now
    #detections = [(doc[start:end].start_char, doc[start:end].end_char, 'TOPS') for idx, start, end in type_matcher(doc) ]

    matched_spans = [doc[start:end] for _, start, end in tops_matcher(doc)]
    # filter_spans removes duplicate and overlapping spans, so every character
    # offset ends up in at most one entity.
    detections = [
        (span.start_char, span.end_char, 'TOPS')
        for span in spacy.util.filter_spans(matched_spans)
    ]

    return (doc.text, {'entities': detections})

#parse_train_data("top") #testing, which should show the entities location
#parse_train_data("These camisole and T-shirt and are so good. I did have similar pattern jacket which is so fancy. They all belong to tops. Gemi top, The top blue top is cloak")

#### 2. Next step: filter the sample dataset down to the rows with 'product_type_number == 2' to build the training data for the 'TOPS' classifier


In [1]:
import pandas as pd

In [2]:
# Load the TOPS training rows; the path is relative to the notebook's working directory.
tops_df = pd.read_csv('train_data/tops_train.csv')

In [3]:
# Preview the first rows of the freshly loaded frame.
tops_df.head()

Unnamed: 0,title,product_type,tags,body_html
0,"Mia Top, Ice Pinecones, Bamboo",top,"{""3/4 Sleeve"",333,50%,Bamboo,fw2020,fw2020repo...",DescriptionFlattering whether worn loose or bo...
1,Ladybug Long Sleeve,tops,"{_tab1_free-people-sizing,_tab2_atb-free-peopl...",Sweet corset-inspired long sleeve top featured...
2,"Pearl Top, Secret Garden, Bamboo",top,"{20,Bamboo,Navy,sale,""Short Sleeve"",Sunny,Top,...",Cross over top for all shapes. The dolman slee...
3,CHROMA SLEEVELESS HOODIE,premium sleeveless hoodie,"{Nu,Psychedelic,""Sacred Geometry""}","Vibrant all over front, back &amp; hood design..."
4,Rib Hacci Vagabond Tank,tops,"{_tab1_zsupply-sizing,_tab2_atb-zsupply,_tab3_...",We updated our popular Vagabond tank! The Rib ...


For the rule-based matcher, we can use tops_total.csv directly to see how the matcher works.

In [None]:

#text_type = pd.read_csv('sample_v1.csv', usecols=["title", "tags", "product_type", "body_html", "product_type_number"])

#tops_df = text_type.loc[text_type['product_type_number'] == '2'].reset_index()
#pd.concat([pd.DataFrame([i], columns=['label']) for i in range(70)])

# The bookkeeping columns are created by plain assignment rather than
# DataFrame.insert(..., allow_duplicates=True). With insert, re-running this
# cell silently appended a second copy of every column; assignment simply
# resets the existing column, so the cell can be run again safely.

#fill all empty cells
tops_df = tops_df.fillna("Not mention")

# label is set to 1 for every row of this frame
tops_df['label'] = 1

# All matcher flags start at 0; the matcher loop below sets them to 1 on a hit.
_flag_columns = (
    'prediction(mixed_all)',         # prediction_1 + prediction_2 (all text)
    'prediction_1(title+pt+tags)',   # title + product_type + tags
    'prediction_2(body_html)',       # product description (body_html)
    'based_on_title',
    'based_on_tags',
    'based_on_product_type',
)
for _flag_column in _flag_columns:
    tops_df[_flag_column] = 0

In [None]:
# Preview the frame again, now including the label and flag columns.
tops_df.head()

In [None]:
#loop over all rows: clean the 'title', 'tags', 'product_type' and 'body_html' text of each row,
#run tops_matcher on every variant and set the corresponding flag column to 1 on a hit

def _has_tops_match(raw_text):
    """Return True when tops_matcher finds at least one match in raw_text."""
    return len(tops_matcher(nlp(raw_text))) > 0


errorcount = 0  # rows with no match in the combined text (misses, not exceptions)
for i in range(len(tops_df)):
    content_2_raw = ''
    content_1_raw = ''
    content_raw = ''
    titles_raw = ''
    tags_raw = ''
    ptype_raw = ''
    try:
        content_1_raw = c.clean_tags_text(tops_df.loc[i, 'title'], tops_df.loc[i, 'product_type'], tops_df.loc[i, 'tags'])

        titles_raw = c.clean_tags_text(tops_df.loc[i, 'title'], None, None)

        tags_raw = c.clean_tags_text(None, None, tops_df.loc[i, 'tags'])

        ptype_raw = c.clean_tags_text(None, tops_df.loc[i, 'product_type'], None)

        content_2_raw = c.clean_product_description(tops_df.loc[i, 'body_html'])

        content_raw = content_1_raw + ', ' + content_2_raw

    except Exception as exc:
        # Best effort: report the row and carry on with whatever was cleaned
        # before the failure. Catching Exception instead of using a bare
        # except keeps the loop interruptible (KeyboardInterrupt propagates).
        print("line type error " + str(i) + ' '  + content_1_raw + '\n' + titles_raw + '\n' + tags_raw + '\n' + ptype_raw + '\n')
        print("caused by: " + repr(exc))

    # (flag column, text the flag is based on)
    flag_sources = (
        ('based_on_title', titles_raw),
        ('based_on_tags', tags_raw),
        ('based_on_product_type', ptype_raw),
        ('prediction_1(title+pt+tags)', content_1_raw),
        ('prediction_2(body_html)', content_2_raw),
    )
    for flag_column, raw_text in flag_sources:
        if _has_tops_match(raw_text):
            tops_df.loc[i, flag_column] = 1

    if _has_tops_match(content_raw):
        tops_df.loc[i, 'prediction(mixed_all)'] = 1
    else:
        # Nothing matched anywhere in the row: count it and print both texts for inspection.
        errorcount += 1
        print(errorcount, content_1_raw)
        print(errorcount, content_2_raw)
        


#### 3. After running the matcher on every column, we update the 'prediction' columns.


In [None]:
#tops_df.rename(columns={"index": "index_in_original_sample"}, inplace=True)
# Persist the flagged frame; the "Build TRAIN_DATA" section reads this file back.
# The index is written too (pandas default), so it comes back as an extra unnamed column.
tops_df.to_csv('train_data/train_matcher_result.csv')

In [None]:
# Rows where no text variant matched at all. Only the last expression of a
# cell is rendered automatically, so this frame is shown explicitly with display().
display(tops_df.loc[tops_df['prediction(mixed_all)'] == 0])

# Rows missed when matching on title + product_type + tags only.
failed_TPT_df = tops_df.loc[tops_df['prediction_1(title+pt+tags)'] == 0, ["title", "product_type", "tags"]].reset_index()
failed_TPT_df.head()

In [None]:
# Re-clean every row that the title + product_type + tags matcher missed,
# print the cleaned text and render it for manual inspection.
failed_fields = zip(failed_TPT_df['title'], failed_TPT_df['product_type'], failed_TPT_df['tags'])
for row_title, row_ptype, row_tags in failed_fields:
    s = c.clean_tags_text(row_title, row_ptype, row_tags)
    doc = nlp(s)
    print(s)
    displacy.render(doc)

In [1]:

#print out the accuracy of each type matcher
# Each figure is the share of rows whose flag column equals 1.
rows_count = tops_df[tops_df.columns[0]].count()

all_mixed_prediction_correctness = (tops_df['prediction(mixed_all)'] == 1).sum() / rows_count
based_on_title_correctness = (tops_df['based_on_title'] == 1).sum() / rows_count
based_on_tags_correctness = (tops_df['based_on_tags'] == 1).sum() / rows_count
based_on_product_type_correctness = (tops_df['based_on_product_type'] == 1).sum() / rows_count
based_on_TagsTitleTags_correctness = (tops_df['prediction_1(title+pt+tags)'] == 1).sum() / rows_count
based_on_body_html_correctness = (tops_df['prediction_2(body_html)'] == 1).sum() / rows_count

# (caption, value) pairs in the order they are reported
hit_rate_report = (
    ("all_mixed_prediction_correctness : ", all_mixed_prediction_correctness),
    ("based_on_TagsTitleProductType_correctness", based_on_TagsTitleTags_correctness),
    ("based_on_title_correctness : ", based_on_title_correctness),
    ("based_on_tags_correctness : ", based_on_tags_correctness),
    ("based_on_product_type_correctness : ", based_on_product_type_correctness),
    ("based_on_body_html_correctness : ", based_on_body_html_correctness),
)
for caption, hit_rate in hit_rate_report:
    print(caption, hit_rate)


NameError: name 'tops_df' is not defined

#### 4. Build TRAIN_DATA for 'tops'

In [None]:
#first training set: title + product_type + tags concatenated into one text per row
train_tops_df = pd.read_csv("train_data/train_matcher_result.csv")
#keep only the rows where prediction_1(title+pt+tags) == 1; rows with 0 (no match) are discarded
title_tags_type_df = train_tops_df.loc[train_tops_df['prediction_1(title+pt+tags)'] == 1, ['title',  'product_type', 'tags']].reset_index()
title_tags_type_df.insert(len(title_tags_type_df.columns), 'raw_combined_text', '')
for i in range(len(title_tags_type_df)):
    try:
        raw_combined_text = c.clean_tags_text(title_tags_type_df.loc[i, 'title'], title_tags_type_df.loc[i, 'product_type'], title_tags_type_df.loc[i, 'tags'])
        title_tags_type_df.loc[i, 'raw_combined_text'] = raw_combined_text
    except Exception as exc:
        # Best effort: the row keeps its empty 'raw_combined_text'. The error
        # is now reported, and Exception (not a bare except) lets a kernel
        # interrupt stop the loop.
        print("something wrong in line# : ", i, repr(exc))
    



In [None]:
# Show the combined, cleaned text column built in the previous cell.
title_tags_type_df.loc[:,'raw_combined_text']

In [None]:
#check our train-dataframe title+tags+product_type
# nlp.pipe streams the combined texts through the pipeline; every resulting
# Doc is handed to parse_train_data, which returns one (text, annotations) tuple.
combined_docs = nlp.pipe(title_tags_type_df.loc[:,'raw_combined_text'])
TRAIN_DATA = list(map(parse_train_data, combined_docs))



In [None]:
# Number of training examples that were built.
len(TRAIN_DATA)

In [None]:
# Spot-check three of the (text, {'entities': [...]}) tuples.
TRAIN_DATA [5:8]

TRAINING LOOP

In [None]:
def creat_blank_nlp(train_data):
    """Create a blank English pipeline with one untrained "ner" component.

    Every entity label found in `train_data` is registered on the NER
    component so that it can be trained afterwards.

    Args:
        train_data: Iterable of `(text, annotations)` pairs, where
            `annotations` is a dict whose "entities" value is a list of
            `(start_char, end_char, label)` tuples. A missing or empty
            "entities" entry is treated as "no entities".

    Returns:
        The new spaCy Language object.
    """
    blank_nlp = spacy.blank("en")
    #ner = nlp.create_pipe("ner")
    blank_nlp.add_pipe("ner", last=True)
    ner = blank_nlp.get_pipe("ner")
    for _, annotations in train_data:
        # `or ()` covers both a missing key and an explicit None.
        for ent in annotations.get("entities") or ():
            try:
                ner.add_label(ent[2])  # ent[0] and ent[1] are start_char and end_char
            except Exception as exc:
                # Report the whole entity: indexing ent[2] again here could
                # itself fail when the entity tuple is malformed.
                print(f"could not register label from {ent!r}: {exc!r}")
    return blank_nlp



In [None]:
import random
import datetime as dt
from spacy.training import Example

N_ITERATIONS = 50  # passes over TRAIN_DATA

# NOTE: this rebinds the notebook-wide `nlp` from the pretrained English
# pipeline to a blank pipeline that only contains the NER component.
nlp = creat_blank_nlp(TRAIN_DATA)

# Language.initialize is the spaCy v3 name of the deprecated begin_training();
# it returns the optimizer used for the updates below.
optimizer = nlp.initialize()
for i in range(N_ITERATIONS):
    random.shuffle(TRAIN_DATA)  # shuffles the shared list in place
    losses = {}
    for text, annotation in TRAIN_DATA:
        try:
            example = Example.from_dict(nlp.make_doc(text), annotation)
            nlp.update([example], sgd=optimizer, losses=losses)
        except Exception as exc:
            # Skip the bad example but say why. Catching Exception instead of
            # using a bare except means interrupting the kernel stops training.
            print("Error happens on : ", text, annotation, repr(exc))
    print(f"Losses at iteration {i} - {dt.datetime.now()}", losses)
        

In [None]:
# Grab the NER component of the pipeline trained above.
ner = nlp.get_pipe('ner')

In [None]:
# Labels registered on the NER component.
ner.labels

In [4]:

from spacy import displacy

#description is not accurate
# Hand-written sample with several garment words, used to eyeball the trained NER.
# The backslash continuations sit inside the string literal, so each following
# line's leading spaces become part of the text.
s = "I wear a fancy T-SHirt and I got another button-down wonderful crop tee . \
    Long shirt and coat are necessary for keeping warm in winter. \
    Sweater and blouse are important for people living in the north. \
    UA students have their own hoodies. The Tank Top is new stylish top-clothes. \
    What about trying our new camisole which is fantastic? "    
doc2 = nlp(s)
displacy.render(doc2, style='ent')



In [None]:
# Second sample: a product-description style text, rendered with entity highlighting.
s = "There are certain pieces that will always bring a boho style aesthetic to mind, and this boxy top is one of them. \
    It's crafted from an open floral crochet with a scalloped hem and short sleeves. \
    We're showcasing the circle crochet trim along the round neckline with a turquoise necklace to really knock it out of the park. "
doc3 = nlp(s)
displacy.render(doc3, style='ent')

In [None]:
# Third sample: a long description that includes run-together words
# (e.g. "pocketHoodedActive"), rendered with entity highlighting.
s4 = "The Tempo Hoodie is the UPF 50+ activewear you've been looking for! \
    It has thumbholes, a kangaroo pocket, and a hood for when the sun is too hot or you forgot your hat. \
    Our Fitness Hoodie is made out of our Active Athlon fabric with the added bonus of our Cooltect™ technology. \
    You can be active in this fitted Fitness Hoodie without getting uncomfortably hot. \
    So go ahead and enjoy sun-safe biking, walking, running and so much more!Highlights:UPF 50+Raglan long sleeves with thumbholesWelt \
    kangaroo pocketHoodedActive Athlon™ fabric: Lightweight and breathable with moisture wicking for quick dry performanceCooltect™ technology \
    accelerates moisture wicking to keep you cooler and more comfortable"
doc4 = nlp(s4)
displacy.render(doc4, style='ent')

In [None]:
# Clean a sample title / product type with an empty tag set using the project
# helper, then render the entities the trained model finds in it.
# (The former `s5 = ""` initialisation was overwritten immediately and is dropped.)
s5 = c.clean_tags_text("Fox Men's hoodie", "mens clothing", "{}")
doc5 = nlp(s5)
displacy.render(doc5, style="ent")

In [None]:
# List the (name, component) pairs of the trained pipeline.
nlp.pipeline #We start with 'empty' pipeline, 

In [None]:
# Save the trained pipeline; the comparison cell near the top of the notebook
# loads this directory with spacy.load("ML_based_model").
nlp.to_disk("ML_based_model")

#### 5. Next step: I will try adding more pipeline components, such as a tokenizer, tagger, parser, ner...
label 
        ---> model ----> rules
        
data   

Entity Ruler