In [1]:
from spacy.lang.en import English
from common import create_tops_patterns
import spacy

Init Plugin
Init Graph Optimizer
Init Kernel


In [2]:
# Preview the token patterns that the TOPS entity ruler and matcher in later cells are built from.
create_tops_patterns()

[[{'LOWER': 't'}, {'IS_PUNCT': True, 'OP': '?'}, {'LOWER': 'shirt'}],
 [{'LEMMA': {'IN': ['jacket',
     'camisole',
     'shirt',
     'coat',
     'sweater',
     'blouse',
     'kimono',
     'cardigan',
     'hoodie',
     'vest',
     'poncho',
     'blazer',
     'sweatshirt',
     '2aistcoat',
     'bralette',
     'bra',
     'jersey',
     't',
     'tee',
     'tank',
     'crop',
     'croptee',
     'croptop',
     'tanktop',
     'top',
     'coverup']},
   'POS': {'NOT_IN': ['ADJ']}}],
 [{'LEMMA': 'tank'}, {'IS_PUNCT': True, 'OP': '?'}, {'LEMMA': 'top'}]]

In [40]:
# Legacy spaCy 2.x EntityRuler setup, kept for reference only (not usable anymore).
# Written as `#` comments instead of a bare triple-quoted string so the cell does
# not evaluate a string expression and echo it as the cell's output.
# The component-name form `nlp.add_pipe("entity_ruler")` in the next cell replaces it.
#
# nlp = English()
# ruler = EntityRuler(nlp)
# ruler.add_patterns([{'label':'TOPS', 'pattern': p} for p in create_tops_patterns()])
# nlp.add_pipe(ruler)


In [20]:
# Load a pretrained English pipeline rather than a blank English() one.
# The TOPS patterns match on LEMMA and POS, and a later cell removes the 'ner'
# component; a blank pipeline has none of tagger / lemmatizer / ner, so the
# patterns would not match as intended and remove_pipe('ner') would raise.
# NOTE(review): "en_core_web_sm" is taken from the line that was previously
# commented out here — confirm it is the model the recorded outputs came from.
nlp = spacy.load("en_core_web_sm")
# Initialize the EntityRuler as the last pipeline component; its patterns are added in a later cell.
entity_ruler_TOPS = nlp.add_pipe("entity_ruler")


In [21]:
# List the (name, component) pairs currently in the pipeline, including the new entity_ruler.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x281e29ac0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x281e29ee0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2820bc270>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x281385bc0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x281392580>),
 ('ner', <spacy.pipeline.ner.EntityRecognizer at 0x2820bc2e0>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x28335f680>)]

In [22]:
# Drop the statistical NER component so it no longer contributes entities.
# Guarded so the cell can be re-run safely: remove_pipe raises when the
# component is not in the pipeline (e.g. it was already removed).
if 'ner' in nlp.pipe_names:
    nlp.remove_pipe('ner')
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x281e29ac0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x281e29ee0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2820bc270>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x281385bc0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x281392580>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x28335f680>)]

In [23]:
# Register every token pattern from create_tops_patterns() with the ruler under the 'TOPS' label.
entity_ruler_TOPS.add_patterns([{'label':'TOPS', 'pattern': p} for p in create_tops_patterns()])

In [24]:
# Show the pipeline again after the TOPS patterns were added to the ruler.
nlp.pipeline

[('tok2vec', <spacy.pipeline.tok2vec.Tok2Vec at 0x281e29ac0>),
 ('tagger', <spacy.pipeline.tagger.Tagger at 0x281e29ee0>),
 ('parser', <spacy.pipeline.dep_parser.DependencyParser at 0x2820bc270>),
 ('attribute_ruler',
  <spacy.pipeline.attributeruler.AttributeRuler at 0x281385bc0>),
 ('lemmatizer', <spacy.lang.en.lemmatizer.EnglishLemmatizer at 0x281392580>),
 ('entity_ruler', <spacy.pipeline.entityruler.EntityRuler at 0x28335f680>)]

In [48]:
# Run a sample product description through the pipeline and print (text, label) for each entity.
# NOTE(review): the recorded output of this cell shows PERSON/ORG labels even though
# 'ner' is removed in an earlier cell, and the execution count (48) is out of sequence
# with the surrounding cells — re-run from a fresh kernel to confirm the real output.
doc = nlp("The Tempo Hoodie TOPS is the UPF 50+ activewear you've been looking for! It has thumbholes, a kangaroo pocket, and a hood for when the sun is too hot or you forgot your hat. Our Fitness Hoodie TOPS is made out of our Active Athlon fabric with the added bonus of our Cooltect™ technology. You can be active in this fitted Fitness Hoodie TOPS without getting uncomfortably hot. So go ahead and enjoy sun-safe biking, walking, running and so much more!Highlights:UPF 50+Raglan long sleeves with thumbholesWelt kangaroo pocketHoodedActive Athlon™ fabric: Lightweight and breathable with moisture wicking for quick dry performanceCooltect™ technology accelerates moisture wicking to keep you cooler and more comfortable")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Fitness Hoodie', 'PERSON'), ('Active Athlon', 'ORG'), ('Cooltect™', 'ORG'), ('Fitness Hoodie', 'PERSON'), ('Athlon™', 'ORG')]


In [26]:
from spacy import displacy
# Visualize the entities of the sample `doc` created in the previous cell.
displacy.render(doc, style='ent')

Save the entity_ruler for 'TOPS' to disk

In [27]:
# Persist the ruler on its own to "entity_ruler_TOPS.jsonl" ...
entity_ruler_TOPS.to_disk("entity_ruler_TOPS.jsonl")
# ... and the whole pipeline to the "rule_model_TOPS" directory, which a later cell loads back with spacy.load.
nlp.to_disk("rule_model_TOPS")

Preparing Data for Scoring
We need some data, in a specific format, to compare both methods.

In [28]:
from spacy.training import docs_to_json
from spacy.training.iob_utils import biluo_tags_from_offsets, spans_from_biluo_tags
import srsly


In [29]:
# Matcher used to turn raw text into TRAIN_DATA examples
from spacy.matcher import Matcher

tops_matcher = Matcher(nlp.vocab)
# This rule-based matcher only detects "TOPS"; all patterns are registered under the single key "TOPS_TYPE"
tops_matcher.add("TOPS_TYPE", create_tops_patterns())

def parse_train_data(text):
    """Build one ``(text, {'entities': [...]})`` training example labelled with TOPS spans.

    Args:
        text: Either a raw string, which is run through ``nlp``, or an object that
            was already processed by ``nlp`` (for example a ``Doc`` yielded by
            ``nlp.pipe``), which is used as-is instead of being parsed a second time.

    Returns:
        A tuple ``(doc.text, {'entities': detections})`` where ``detections`` is a
        list of ``(start_char, end_char, 'TOPS')`` tuples, one per kept match.
    """
    # Only raw strings need parsing; anything else is assumed to be an already-processed Doc.
    doc = nlp(text) if isinstance(text, str) else text

    matched_spans = [doc[start:end] for _, start, end in tops_matcher(doc)]
    # spacy.util.filter_spans removes duplicate or overlapping matches before they become entity offsets.
    detections = [
        (span.start_char, span.end_char, 'TOPS')
        for span in spacy.util.filter_spans(matched_spans)
    ]

    return (doc.text, {'entities': detections})

In [30]:
import pandas as pd
# NOTE(review): the frame is named "train" but is read from test_data/tops_test.csv — confirm this is the intended file.
tops_train_data = pd.read_csv("test_data/tops_test.csv")


In [31]:
import common as c

# Take an explicit copy: adding a column to a column-subset of tops_train_data
# would otherwise risk pandas' SettingWithCopyWarning.
title_tags_type_df = tops_train_data[['title', 'product_type', 'tags']].copy()

# Combine title / product_type / tags row by row. Iterating the columns directly
# avoids per-cell .loc lookups and does not depend on the index being 0..n-1.
combined_texts = []
for i, (title, product_type, tags) in enumerate(
    zip(
        title_tags_type_df['title'],
        title_tags_type_df['product_type'],
        title_tags_type_df['tags'],
    )
):
    try:
        combined_texts.append(c.clean_tags_text(title, product_type, tags))
    except Exception as exc:
        # Best effort: leave the row empty and report what failed. `Exception`
        # (not a bare except) lets KeyboardInterrupt / SystemExit propagate.
        combined_texts.append('')
        print("something wrong in line# : ", i, repr(exc))

# New column is appended as the last column, with '' for rows that could not be cleaned.
title_tags_type_df['raw_combined_text'] = combined_texts

In [51]:
# Stream every combined text through nlp.pipe and turn each result into a training example.
TRAIN_DATA = list(map(parse_train_data, nlp.pipe(title_tags_type_df['raw_combined_text'])))

In [61]:

def _annotated_doc(example):
    """Parse one TRAIN_DATA example and set its entities from the character offsets."""
    raw_text, annotations = example
    parsed = nlp(raw_text)
    # Offsets -> BILUO tags -> spans, then overwrite the doc's entities with those spans.
    biluo = biluo_tags_from_offsets(parsed, annotations['entities'])
    parsed.ents = spans_from_biluo_tags(parsed, biluo)
    return parsed


docs = [_annotated_doc(example) for example in TRAIN_DATA]

# Write all annotated docs as a single JSON document wrapped in a list.
srsly.write_json("spacy_format.json", [docs_to_json(docs)])

In [62]:
# Peek at the first five training examples.
TRAIN_DATA[:5]

[('the gift sports bra, sports bra, galaxy nu psychedelic sacred geometry space',
  {'entities': [(16, 19, 'TOPS'), (28, 31, 'TOPS')]}),
 ('tee shirt - burnt orange, tee shirt, fw20 gastown jack jones long mens ochre orange red short sleeve tee tees long sleeves white',
  {'entities': [(0, 3, 'TOPS'),
    (4, 9, 'TOPS'),
    (26, 29, 'TOPS'),
    (30, 35, 'TOPS'),
    (101, 104, 'TOPS'),
    (105, 109, 'TOPS')]}),
 ('disambiguation drop tee, drop tee, nu psychedelic sacred geometry',
  {'entities': [(20, 23, 'TOPS'), (30, 33, 'TOPS')]}),
 ('geometric print tunic top - new arrival, tunic, 221266 joseph ribkoff regular price resort s22 tank top tunic',
  {'entities': [(95, 103, 'TOPS')]}),
 ('billabong arch s/s tee, mens tees, 20-40 25-50 availability_in-stock billabong color_white gender_men macro-mens-clothing mens-tees size_large size_medium size_x-large size_xxl submacro-mens-shirts',
  {'entities': [(19, 22, 'TOPS'),
    (29, 33, 'TOPS'),
    (127, 131, 'TOPS'),
    (191, 197, 'TOPS

In [54]:
# Load both pipelines for a side-by-side comparison.
# NOTE(review): "ML_based_model" is not created in the visible part of this notebook — presumably trained separately; confirm the path.
nlp_stat = spacy.load("ML_based_model")
# "rule_model_TOPS" is the directory written by nlp.to_disk in an earlier cell.
nlp_rule = spacy.load("rule_model_TOPS")

In [55]:
from spacy import displacy

# Sample 1: leggings description that mentions a denim jacket and a tee.
txt1 = "SweetLegs is excited to introduce Funky Bunch Plus SweetLegs, a bold and electrifying addition to our 2021 Plus Size leggings collection! This black-based print features a bright neon geometric pattern that wouldn't be out of place in a 90's party! It's perfect for those who like to make an entrance.Funky Bunch Plus Size Leggings look rad styled with a relaxed 90's-inspired denim jacket, a SweetTops Classic Tee in white, and a pair of orange converse."
# Render the statistical model's entities first, then the rule-based model's.
for tagging_pipeline in (nlp_stat, nlp_rule):
    displacy.render(tagging_pipeline(txt1), style="ent")


In [56]:
# Sample 2: shipping / fabric description that mentions sweaters.
txt2 = "This item is already made and ships out within 5 business days.Out of your size? Don't worry, order HERE100% soft-spun premium polyesterVibrant all over front &amp; back print Fade, crack &amp; wrinkle resistantLight fleece liningOur sweaters are made with love one at a time,Cut, sewn &amp; hand finished.. just for you!"
# Render the statistical model's entities first, then the rule-based model's.
for tagging_pipeline in (nlp_stat, nlp_rule):
    displacy.render(tagging_pipeline(txt2), style="ent")

In [57]:
# Sample 3: cable knit sweater description.
txt3 = "Style #85143 – 08/0827Cozy up in this beautiful cable knit sweater! Made with recycled cotton, this sweater features a funnel neck, long sleeves and chunky knit. It is a timeless classic that will pair with everything in your closet for years to come.Style: PulloverFit: RelaxedLength: Slightly below hipsFunnel necklineLong sleeves with ribbed knit cuffsRibbed knit hemMachine wash in cold waterComposition: 75% Recycled Cotton, 25% Polyester"
# Render the statistical model's entities first, then the rule-based model's.
for tagging_pipeline in (nlp_stat, nlp_rule):
    displacy.render(tagging_pipeline(txt3), style="ent")

In [58]:
# Sample 4: hand-written sentences covering many garment words.
txt4 = "I wear a fancy T-SHirt TOPS and I got another button-down wonderful crop TOPS tee TOPS . Long shirt TOPS and coat TOPS are necessary for keeping warm in winter. Sweater TOPS and blouse TOPS are important for people living in the north. UA students have their own hoodies TOPS . The Tank Top TOPS is new stylish top-clothes. What about trying our new camisole which is fantastic?"
# Render the statistical model's entities first, then the rule-based model's.
for tagging_pipeline in (nlp_stat, nlp_rule):
    displacy.render(tagging_pipeline(txt4), style="ent")

In [60]:
# Sample 5: the hoodie description used earlier in the notebook.
txt5 = "The Tempo Hoodie TOPS is the UPF 50+ activewear you've been looking for! It has thumbholes, a kangaroo pocket, and a hood for when the sun is too hot or you forgot your hat. Our Fitness Hoodie TOPS is made out of our Active Athlon fabric with the added bonus of our Cooltect™ technology. You can be active in this fitted Fitness Hoodie TOPS without getting uncomfortably hot. So go ahead and enjoy sun-safe biking, walking, running and so much more!Highlights:UPF 50+Raglan long sleeves with thumbholesWelt kangaroo pocketHoodedActive Athlon™ fabric: Lightweight and breathable with moisture wicking for quick dry performanceCooltect™ technology accelerates moisture wicking to keep you cooler and more comfortable"
# Render the statistical model's entities first, then the rule-based model's.
for tagging_pipeline in (nlp_stat, nlp_rule):
    displacy.render(tagging_pipeline(txt5), style="ent")


