Pip Installations
-------------

In [0]:
%sh pip install --upgrade pip

In [0]:
%sh pip install BeautifulSoup4

In [0]:
%sh pip install -U spacy

In [0]:
%sh python -m spacy download en_core_web_lg

In [0]:
%sh pip install networkx

In [0]:
%sh pip install Unidecode

In [0]:
%sh pip install word2number

(1) Data Preparation
----------------

In [0]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
from spacy.matcher import Matcher 
from spacy.tokens import Span

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

nlp = spacy.load('en_core_web_lg')

In [0]:
from collections import defaultdict
import json

In [0]:
import requests
from spacy.tokens import Token, Span
from spacy.matcher import PhraseMatcher
from spacy.matcher import Matcher

In [0]:
import pandas as pd

df1 = pd.read_csv("/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/box.csv", header=0)
df2 = pd.read_csv("/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/prod_meta.csv")
df3 = pd.read_csv("/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/pattern_data.csv")

In [0]:
df1.columns

In [0]:
df2.columns

In [0]:
df3.columns

In [0]:
df3.head(5)

Unnamed: 0,tag_value
0,Polka Dots
1,Plain
2,Tie and Dye
3,Tropical
4,Floral Small


In [0]:
df3['tag_value'] = df3['tag_value'].fillna('').str.lower()

```Matcher does not match multi-word tags (even in short texts) when the rule requires a punctuation token between every pair of words```

In [0]:
# Build one token-level Matcher rule per pattern tag in df3['tag_value'].
# Each word of the tag becomes a case-insensitive LOWER constraint.
matcher = Matcher(nlp.vocab)
patterns = []
for tag_value in df3['tag_value']:
    tag_val_toks = tag_value.split()
    if not tag_val_toks:
        # Blank tags (NaN filled with '') would yield a zero-token rule, which Matcher.add rejects.
        continue
    pattern = []
    for index, tok in enumerate(tag_val_toks):
        if index:
            # Punctuation between words is OPTIONAL ('OP': '?'). Without the quantifier the
            # rule demands a punctuation token between every pair of words, so plain text
            # such as "tie and dye" can never match.
            pattern.append({'IS_PUNCT': True, 'OP': '?'})
        pattern.append({'LOWER': tok.lower()})
    patterns.append(pattern)
print(patterns)
matcher.add('patterns', patterns)

In [0]:
gender_list = ['men', 'man', 'woman', 'women', 'boy', 'girl', 'lady']

In [0]:
doc = nlp("There is a tie and dye shirt in locker")
matches = matcher(doc)
for match_id, start, end in matches:
    print('yes')
    string_id = nlp.vocab.strings[match_id]  # Get string representation
    span = doc[start:end]  # The matched span
    print(match_id, string_id, start, end, span.text)

In [0]:
# Note: a Matcher rule that requires a punctuation token between words will not match plain multi-word text such as "tie and dye".

In [0]:
print(spacy.__version__)

```PhraseMatcher seems to work```

In [0]:
matcher_lemma = PhraseMatcher(nlp.vocab, attr="LEMMA")
terms = list(df3['tag_value'])
# attr="LEMMA" needs lemmas on the pattern docs, so the full pipeline (nlp) is run here; nlp.make_doc would only tokenize
patterns = [nlp(text) for text in terms]
matcher_lemma.add("patterns", patterns)

doc = nlp("There is a ties and dyes shirt in locker")
matches = matcher_lemma(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

In [0]:
matcher_lower = PhraseMatcher(nlp.vocab, attr="LOWER")
terms = list(df3['tag_value'])
# attr="LOWER" only needs tokenization, so nlp.make_doc(text) would be enough (and faster) here; the full pipeline is run anyway
patterns = [nlp(text) for text in terms]
matcher_lower.add("patterns", patterns)

doc = nlp("There is a Tie and Dye shirt in locker")
matches = matcher_lower(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

In [0]:
# Collect every known brand name (lower-cased) from both product tables and register
# them as "brands" phrases. The pattern docs are built with the full pipeline (not
# nlp.make_doc) because the same docs are later added to the LEMMA-based matcher too.
df1['brand'] = df1['brand'].fillna('').str.lower()
df2['brand'] = df2['brand'].fillna('').str.lower()

brand_terms = set()
for brand_json in df1['brand']:
    if not brand_json.strip():
        # Missing brands were filled with '', which is not valid JSON.
        continue
    parsed_brands = json.loads(brand_json)
    if isinstance(parsed_brands, str):
        # A bare JSON string is a single brand, not a sequence of characters.
        parsed_brands = [parsed_brands]
    brand_terms.update(parsed_brands)
brand_terms.update(df2['brand'])
brand_terms.discard('')  # an empty term carries no tokens to match

terms = list(brand_terms)
brands = [nlp(text) for text in terms]
matcher_lower.add("brands", brands)

# Smoke test: the brand should be found regardless of casing.
doc = nlp("The Jaipur story is a great fashion brand")
for match_id, start, end in matcher_lower(doc):
    print(doc[start:end].text)

In [0]:
matcher_lemma.add("brands", brands)

doc = nlp("the jaipur stories is a great fashion brand")
matches = matcher_lemma(doc)
for match_id, start, end in matches:
    span = doc[start:end]
    print(span.text)

In [0]:
color_dataset = pd.read_csv("/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/colours_rgb_shades.csv")
color_dataset['Color Name'] = color_dataset['Color Name'].fillna('')
color_names = []
for color in color_dataset['Color Name']:
    color_names.append(''.join(' ' + c if c.isupper() else c for c in color).lower().strip())
color_set = set()
for ind_color in color_names:
    comma_sep_colors = ind_color.split(',')
    for color in comma_sep_colors:
        color = ' '.join(color.split())
        color_set.add(color)
color_list = list(color_set)

In [0]:
df_tag_1 = pd.read_csv("/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/tags_combined.csv", delimiter='\t', encoding='utf-8', header=0)
df_tag_2 = pd.read_csv("/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/pernia_tags.csv", delimiter='\t', encoding='utf-8', header=0)
df_tag_3 = pd.read_csv("/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/thredup_tags.csv", delimiter='\t', encoding='utf-8', header=0)

In [0]:
print(df_tag_1.columns, df_tag_2.columns, df_tag_3.columns)

In [0]:
df_tag_1['tag_type'] = df_tag_1['tag_type'].fillna('').str.lower()
df_tag_1['tag_value'] = df_tag_1['tag_value'].fillna('').str.lower()
df_tag_2['tag_type'] = df_tag_2['tag_type'].fillna('').str.lower()
df_tag_2['tag_value'] = df_tag_2['tag_value'].fillna('').str.lower()
df_tag_3['tag_type'] = df_tag_3['tag_type'].fillna('').str.lower()
df_tag_3['tag_value'] = df_tag_3['tag_value'].fillna('').str.lower()

In [0]:
# Map each normalised tag type to the set of values seen for it in the three tag files.
# Any type containing 'color' / 'pattern' / 'sleevelength' is folded into one canonical
# key; every other tag type is kept under its own name.
tag_type_value_dict = defaultdict(set)

for tag_frame in (df_tag_1, df_tag_2, df_tag_3):
    for tag_type, tag_value in zip(tag_frame['tag_type'], tag_frame['tag_value']):
        if 'color' in tag_type:
            tag_type_value_dict['color'].add(tag_value)
        elif 'pattern' in tag_type:
            tag_type_value_dict['pattern'].add(tag_value)
        elif 'sleevelength' in tag_type:
            tag_type_value_dict['sleeve length'].add(tag_value)
            if tag_value != 'sleeveless':
                # Also register the "<length> sleeve" spelling, e.g. "long" -> "long sleeve".
                tag_type_value_dict['sleeve length'].add(tag_value + ' ' + 'sleeve')
        else:
            tag_type_value_dict[tag_type].add(tag_value)

# Gender words come from the hand-written list rather than from the tag files.
tag_type_value_dict['gender'].update(gender_list)

In [0]:
# Drop the exploratory "patterns" rules from both phrase matchers before the full tag
# vocabulary is registered. The membership check keeps the cell re-runnable:
# PhraseMatcher.remove raises KeyError when the key is not present.
for phrase_matcher in (matcher_lemma, matcher_lower):
    if "patterns" in phrase_matcher:
        phrase_matcher.remove("patterns")

In [0]:
for key, value in tag_type_value_dict.items():
    if key == 'color':
        color_list_loc = []
        for color in value:
            color_split = color.split('/')
            for loc_color in color_split:
                color_list_loc.append(loc_color)
        color_list = color_list + color_list_loc
        color_list = list(set(color_list))
        colors = [nlp(color) for color in color_list]
        matcher_lower.add("colors", colors)
        matcher_lemma.add("colors", colors)
    elif key == 'pattern':
        pattern_list = list(df3['tag_value'])
        pattern_list = pattern_list + list(value)
        pattern_list = list(set(pattern_list))
        patterns = [nlp(pattern) for pattern in pattern_list]
        matcher_lower.add('patterns', patterns)
        matcher_lemma.add('patterns', patterns)
    else:
        val_list = list(value)
        val_nlp = [nlp(val) for val in val_list]
        matcher_lower.add(key, val_nlp)
        matcher_lemma.add(key, val_nlp)

In [0]:
len(color_list)

In [0]:
import pandas as pd
candidate_sentences = pd.read_csv("/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/qa.csv", delimiter='\t', encoding='utf-8', index_col=0)

In [0]:
symbol_extraction_dictionary = {
    "length_symbols":{
        "km",
        "hm",
        "dam",
        "m",
        "dm",
        "cm",
        "mm"
    },

    "length_values":{
        "kilometre",
        "hectometre",
        "decametre",
        "metre",
        "decimetre",
        "centimetre",
        "millimetre",
        "kilometer",
        "hectometer",
        "decameter",
        "meter",
        "decimeter",
        "centimeter",
        "millimeter"
    },

    "weight_symbols":{
        "t",
        "kg",
        "hg",
        "dag",
        "g",
        "dg",
        "cg", 
        "mg"
    },

    "weight_values":{
        "tonne",
        "kilogram",
        "hectogram",
        "decagram",
        "gram",
        "decigram",
        "centigram",
        "milligram"
    },

    "volume_symbols":{
        "kL",
        "hL",
        "daL",
        "L",
        "dL",
        "cL",
        "mL"
    },

    "volume_values":{
        "kilolitre",
        "hectolitre",
        "decalitre",
        "litre",
        "decilitre",
        "centilitre",
        "millilitre",
        "kiloliter",
        "hectoliter",
        "decaliter",
        "liter",
        "deciliter",
        "centiliter",
        "milliliter"
    }
}

value_symbol_conversion = {
    "kilometre": "km",
    "hectometre": "hm",
    "decametre": "dam",
    "metre": "m",
    "decimetre": "dm",
    "centimetre": "cm",
    "millimetre": "mm",
    "kilometer": "km",
    "hectometer": "hm",
    "decameter": "dam",
    "meter": "m",
    "decimeter": "dm",
    "centimeter": "cm",
    "millimeter": "mm",
    "kilometres": "km",
    "hectometres": "hm",
    "decametres": "dam",
    "metres": "m",
    "decimetres": "dm",
    "centimetres": "cm",
    "millimetres": "mm",
    "kilometers": "km",
    "hectometers": "hm",
    "decameters": "dam",
    "meters": "m",
    "decimeters": "dm",
    "centimeters": "cm",
    "millimeters": "mm",

    "tonne": "t",
    "kilogram":	"kg",
    "hectogram": "hg",
    "decagram":	"dag",
    "gram":	"g",
    "decigram":	"dg",
    "centigram": "cg",
    "milligram": "mg",
    "tonnes": "t",
    "kilograms": "kg",
    "hectograms": "hg",
    "decagrams": "dag",
    "grams": "g",
    "decigrams": "dg",
    "centigrams": "cg",
    "milligrams": "mg",

    "kilolitre": "kL",
    "hectolitre": "hL",
    "decalitre": "daL",
    "litre": "L",
    "decilitre": "dL",
    "centilitre": "cL",
    "millilitre": "mL",
    "kiloliter": "kL",
    "hectoliter": "hL",
    "decaliter": "daL",
    "liter": "L",
    "deciliter": "dL",
    "centiliter": "cL",
    "milliliter": "mL",
    "kilolitres": "kL",
    "hectolitres": "hL",
    "decalitres": "daL",
    "litres": "L",
    "decilitres": "dL",
    "centilitres": "cL",
    "millilitres": "mL",
    "kiloliters": "kL",
    "hectoliters": "hL",
    "decaliters": "daL",
    "liters": "L",
    "deciliters": "dL",
    "centiliters": "cL",
    "milliliters": "mL",
    
    "dollars": "$",
    "Dollars": "$",
    "dollar": "$",
    "Dollar": "$",
    "Euro": "€",
    "euro": "€",
    "Euros": "€",
    "euros": "€",
    "Pound": "£",
    "pound": "£",
    "Pounds": "£",
    "pounds": "£",
    "Rupee": "₹",
    "rupee": "₹",
    "Rupees": "₹",
    "rupees": "₹"
}

In [0]:
def replace_with_symbols(sent, conversion=None):
    """Replace spelled-out units and currencies in ``sent`` with their symbols.

    Only whole words are replaced: a key is rewritten when it is not directly
    preceded or followed by another letter. Digits may touch the word
    ("10grams" -> "10g"), but words that merely contain a key ("program",
    "parameter") are left alone. Matching is case-sensitive, and the longest
    key wins when several keys start at the same position
    ("kilometres" before "metre").

    Args:
        sent: The text to rewrite.
        conversion: Optional mapping of word -> symbol. Defaults to the
            module-level ``value_symbol_conversion`` table.

    Returns:
        The rewritten text; ``sent`` unchanged when it or the mapping is empty.
    """
    if conversion is None:
        conversion = value_symbol_conversion
    if not sent or not conversion:
        return sent

    # Longest alternatives first so the regex prefers "kilograms" over "gram".
    alternatives = "|".join(
        re.escape(word) for word in sorted(conversion, key=len, reverse=True)
    )
    # [^\W\d_] is "any Unicode letter"; the lookarounds forbid a letter on either side.
    whole_word = re.compile(r"(?<![^\W\d_])(?:" + alternatives + r")(?![^\W\d_])")
    return whole_word.sub(lambda found: conversion[found.group(0)], sent)

In [0]:
string = "Puma Shoes above ten thousand and two hundred dollars"
replacement_string = replace_with_symbols(string)
print(replacement_string)

(2) Convert number words to numbers for semantic understanding and query parsing
--------------------------------------------------------------------------------

```
Eg: fifty thousand and five hundred - 50500, 
      Four thousand and five thousand - [4000, 5000]
```

In [0]:
from word2number import w2n

In [0]:
def _record_number_phrase(words, word_2_num_dict):
    """Convert the number words in ``words`` with ``w2n`` and store the value under the joined phrase."""
    if not words:
        return
    phrase = " ".join(words)
    try:
        word_2_num_dict[phrase] = w2n.word_to_num(phrase)
    except Exception:
        print("exception", "word num:", phrase)


def convert_word_to_number(sent):
    """Replace spelled-out numbers in ``sent`` with digits.

    The sentence is parsed with ``nlp``. Every run of non-numeric NUM tokens
    is collected into one phrase; a coordinating conjunction is included only
    when another number word follows it. The phrase is converted with
    ``w2n.word_to_num``. When that fails for the whole phrase, the phrase is
    re-parsed and each run of NUM tokens is converted on its own.

    Args:
        sent: The text to rewrite.

    Returns:
        ``sent`` with every converted phrase replaced by its numeric value.
        Phrases are replaced longest first and only as whole words, so a key
        such as "one" does not touch "phone", and "four" does not pre-empt
        "four thousand".
    """
    doc_num = nlp(sent)
    doc_num_len = len(doc_num)

    def _is_number_word(tok):
        # A NUM token that is spelled out ("ten"), not already digits ("10").
        return tok.pos_.lower() == 'num' and not tok.text.isnumeric()

    word_2_num_dict = {}
    k = 0
    while k < doc_num_len:
        tok_n = doc_num[k]
        if _is_number_word(tok_n):
            word_num = tok_n.text
            l = k + 1
            while l < doc_num_len:
                tok_n_next = doc_num[l]
                if _is_number_word(tok_n_next):
                    word_num += " " + tok_n_next.text
                elif tok_n_next.pos_.lower() == 'cconj':
                    # Keep the conjunction only when it joins two number words.
                    if l + 1 < doc_num_len and _is_number_word(doc_num[l + 1]):
                        word_num += " " + tok_n_next.text
                    else:
                        break
                else:
                    break
                l += 1
            try:
                word_2_num_dict[word_num] = w2n.word_to_num(word_num)
            except Exception:
                # The whole phrase is not one number: convert each run of NUM tokens separately.
                pending_words = []
                for word_tok in nlp(word_num):
                    if word_tok.pos_.lower() == 'num':
                        pending_words.append(word_tok.text)
                    else:
                        _record_number_phrase(pending_words, word_2_num_dict)
                        pending_words = []
                _record_number_phrase(pending_words, word_2_num_dict)

            # Resume scanning after the tokens consumed by this phrase.
            k = l - 1
        k += 1

    for phrase in sorted(word_2_num_dict, key=len, reverse=True):
        digits = str(word_2_num_dict[phrase])
        sent = re.sub(
            r"(?<!\w)" + re.escape(phrase) + r"(?!\w)",
            lambda _found, digits=digits: digits,
            sent,
        )
    return sent

In [0]:
word_num_to_be_converted = "forty thousand four hundred and eighty six"
number_string = convert_word_to_number(word_num_to_be_converted)
print("Number String:", number_string)

In [0]:
word_num_to_be_converted = "four thousand eighty six and five thousand sixty nine"
number_string = convert_word_to_number(word_num_to_be_converted)
print("Number String:", number_string)

(3) Extract Preposition and Contextual Meaning Around Them
----------------------------------------------------------

``` 
 Eg - Red Shirts between 4000 and 5000 dollars
 
  Preposition - between
  
  Preposition Meaning - 4000 and 5000 $
```

In [0]:
def get_preposition_meaning(token, cur_string):
    """Return the text of ``token``'s dependency subtree around ``cur_string``.

    The result is: the text of all left descendants, then ``cur_string``, then
    the text of all right descendants, with whitespace collapsed to single
    spaces. Called with ``cur_string=""`` on a preposition it yields the
    phrase governed by that preposition without the preposition itself.

    Each child is expanded independently (its own left subtree, its text, its
    right subtree) before the pieces are joined, so the words of one child
    never end up in front of an earlier sibling.

    Args:
        token: Object exposing ``.lefts``, ``.rights`` (iterables of child
            tokens) and ``.text``; the callers pass spaCy tokens.
        cur_string: Text to place between the left and right subtrees.

    Returns:
        The whitespace-normalised phrase.
    """
    left_parts = [get_preposition_meaning(child, child.text) for child in token.lefts]
    right_parts = [get_preposition_meaning(child, child.text) for child in token.rights]
    pieces = left_parts + [cur_string] + right_parts
    return " ".join(" ".join(pieces).split())

In [0]:
def get_preposition_and_preposition_meaning(sent):
    """Find every preposition in ``sent`` together with the phrase it governs.

    The sentence is first normalised (unit/currency words to symbols, number
    words to digits) and then parsed. Every token whose dependency label is
    ``prep`` contributes its text and the text of its subtree.

    Returns:
        ``(prepositions, meanings)``: two parallel lists of strings.
    """
    normalised = convert_word_to_number(replace_with_symbols(sent))
    prepositions = []
    meanings = []
    for token in nlp(normalised):
        if token.dep_ != 'prep':
            continue
        prepositions.append(token.text)
        meanings.append(get_preposition_meaning(token, ""))
    return prepositions, meanings

In [0]:
string = "Puma Shoes above ten thousand and two hundred dollars"
preposition_list, preposition_meaning_list = get_preposition_and_preposition_meaning(string)
print(preposition_list, preposition_meaning_list)

In [0]:
string = "Preeti grinder under ten kilograms"
preposition_list, preposition_meaning_list = get_preposition_and_preposition_meaning(string)
print(preposition_list, preposition_meaning_list)

(4) Reverse map facet values to facet fields and field types (Assumed to be done)
---------------------------------------------------------------------------------

Create Doc Spans to Work With Extracted Facet Values
----------------------------------------------------

In [0]:
def clean_text(str_data):
    """Trim ``str_data`` and collapse every run of whitespace into a single space."""
    words = str_data.split()
    return " ".join(words)

In [0]:
def merge_spans(tups=None):
    """Merge overlapping ``(start, end)`` spans.

    The spans must already be sorted by start. A span is folded into the
    previous one when its start is strictly smaller than the previous end;
    spans that merely touch (``start == previous end``) stay separate.

    The input is never modified and may hold tuples or lists. Merged spans
    are returned as new lists.

    Args:
        tups: Sorted sequence of ``(start, end)`` pairs, or ``None``.

    Returns:
        A list of ``[start, end]`` lists. When ``tups`` is empty, ``None`` or
        has a single element it is returned unchanged.
    """
    if not tups or len(tups) < 2:
        return tups

    merged = []
    current = list(tups[0])  # copy so the caller's span is not mutated
    for span in tups[1:]:
        if span[0] < current[1]:
            # Overlap: extend the running span if this one reaches further.
            current[1] = max(current[1], span[1])
        else:
            merged.append(current)
            current = list(span)
    merged.append(current)
    return merged

```
Retokenizing and fetching reverse-mapped values
```

In [0]:
def _strip_brand_mentions(text, phrase_matcher, found_brands):
    """Record every 'brands' match of ``phrase_matcher`` in ``text`` and return the cleaned text without them."""
    doc = nlp(text)
    for match_id, start, end in phrase_matcher(doc):
        if nlp.vocab.strings[match_id] == 'brands':
            brand_text = doc[start:end].text
            found_brands.append(brand_text)
            text = text.replace(brand_text, '')
    return clean_text(text)


def matcher_retokenization(queries=None, csv_name=None, columns=None):
    """Tag queries with the phrase matchers and merge tagged spans into single tokens.

    Query source, in priority order:
      1. ``queries`` when it is non-empty;
      2. the ``Query`` column of ``csv_name`` (restricted to ``columns`` when given);
      3. the ``Query`` column of the default tab-separated QA file, lower-cased.
    Missing queries read from a CSV are replaced by empty strings so that the
    string-based normalisation steps do not fail on NaN.

    For every query:
      * unit/currency words are replaced by symbols, then number words by digits;
      * brand mentions found by ``matcher_lemma`` and then ``matcher_lower`` are
        recorded and removed from the text;
      * the remaining text is parsed; every non-brand match is stored under its
        match label (lemma matches first, lower-case matches only for spans not
        seen yet);
      * overlapping matched spans are merged and retokenized into single tokens.

    Args:
        queries: Optional list of query strings.
        csv_name: Optional path of a CSV file with a ``Query`` column.
        columns: Optional list of columns to load from ``csv_name``.

    Returns:
        A 5-tuple of parallel lists, one entry per query:
        symbol-converted strings, number-converted strings, brand lists,
        autotag dicts (label -> list of matched texts), retokenized docs.
    """
    if not queries:
        if csv_name:
            if columns:
                candidate_sentences = pd.read_csv(csv_name, encoding='utf-8', usecols=columns)
            else:
                candidate_sentences = pd.read_csv(csv_name, encoding='utf-8')
            # NaN queries would break the `in` / `.replace` string operations downstream.
            queries = list(candidate_sentences['Query'].fillna(''))
        else:
            candidate_sentences = pd.read_csv("/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/qa.csv", delimiter='\t', encoding='utf-8', index_col=0)
            queries = list(candidate_sentences['Query'].fillna('').str.lower())

    brand_list = []
    autotags_list = []
    retokenized_docs = []
    symbol_converted_strings = []
    word_to_num_converted_strings = []
    for k in tqdm(queries):
        brand_list_inner = []
        autotags_dict = defaultdict(list)

        k = replace_with_symbols(k)
        symbol_converted_strings.append(k)
        k = convert_word_to_number(k)
        word_to_num_converted_strings.append(k)

        # Brands are stripped first so their words are not tagged again as facets.
        k = _strip_brand_mentions(k, matcher_lemma, brand_list_inner)
        k = _strip_brand_mentions(k, matcher_lower, brand_list_inner)

        doc = nlp(k)
        tuple_tag_span_set = set()
        for match_id, start, end in matcher_lemma(doc):
            string_id = nlp.vocab.strings[match_id]
            if string_id != 'brands':
                tuple_tag_span_set.add((start, end))
                autotags_dict[string_id].append(doc[start:end].text)

        # Lower-case matches only add spans the lemma matcher has not produced already.
        for match_id, start, end in matcher_lower(doc):
            string_id = nlp.vocab.strings[match_id]
            if string_id != 'brands' and (start, end) not in tuple_tag_span_set:
                tuple_tag_span_set.add((start, end))
                autotags_dict[string_id].append(doc[start:end].text)

        span_indices_sorted = [list(tup) for tup in sorted(tuple_tag_span_set)]
        merged_doc_span_indices = merge_spans(span_indices_sorted)
        merged_doc_spans = [doc[span_index[0]:span_index[1]] for span_index in merged_doc_span_indices]

        # Retokenization needs non-overlapping spans, hence the merge above.
        with doc.retokenize() as retokenizer:
            for span in merged_doc_spans:
                retokenizer.merge(span)

        brand_list.append(brand_list_inner)
        autotags_list.append(autotags_dict)
        retokenized_docs.append(doc)

    return symbol_converted_strings, word_to_num_converted_strings, brand_list, autotags_list, retokenized_docs

(5) Unsupervised feature extraction to extract category and qualities separately and create dependency mapping between them
---------------------------------------------------------------------------------------------------------------------------

Created entirely from Spacy English Language Model (Large)

For more details regarding accuracy and performance of this model

https://spacy.io/models/en

In [0]:
def unsupervised_feature_extraction(doc):    
    """Split a parsed query into quality, category and preposition tags.

    The tokens of ``doc`` are scanned left to right. Each token falls into the
    first matching case:

      1. modifier  - dependency label ends with "mod"; every token of a run of
         consecutive modifiers is stored on its own in the quality list;
      2. compound / non-noun root - consecutive such tokens are joined with
         spaces and stored as one quality;
      3. noun or proper noun - consecutive nouns are joined and stored as one
         category, together with the token index range they cover;
      4. preposition (dependency label "prep") - its text and the text of its
         subtree (via ``get_preposition_meaning``) are stored.

    Afterwards each category is linked to qualities: for every child of every
    token in the category span, the first quality whose text contains the
    child's text (substring test) is attached once per category.

    Args:
        doc: Parsed document; the callers pass the retokenized docs returned
            by ``matcher_retokenization``.

    Returns:
        ``(quality_list, category_list, preposition_list,
        preposition_meaning_list, category_feature_relation)`` where the last
        item is a list of ``"category:quality"`` strings.
    """
    is_prev_tok_prep = False
    category_tok_indices = []
    category_list = []
    quality_list = []
    preposition_list = []
    preposition_meaning_list = []
    
    prefix = ""
    modifier = ""
    category = ""
    doc_len = len(doc)
    i = 0
    while i < doc_len:
        tok = doc[i]        
        ## chunk 1: check if token is a modifier or not
        # A numeric modifier (nummod) only counts as a quality when the previous token was
        # not a preposition AND its text does not occur in the most recent preposition meaning.
        if tok.dep_.endswith("mod") == True and (tok.dep_.lower() != 'nummod' or (not is_prev_tok_prep and (not preposition_meaning_list or tok.text not in preposition_meaning_list[-1]))):
            modifier = tok.text
            j = i + 1
            # Consume the following modifiers; each one is stored as a separate quality.
            # Note: the nummod restriction above is not re-applied inside this run.
            while j < doc_len:
                tok_next = doc[j]
                if tok_next.dep_.endswith("mod"):
                    cleaned_text = clean_text(modifier)
                    quality_list.append(cleaned_text)
                    modifier = tok_next.text
                    j += 1
                else:
                    break
            # Step back one token: the shared `i += 1` at the loop end moves past the run.
            i = j - 1
            cleaned_text = clean_text(modifier)
            quality_list.append(cleaned_text)
            prefix = ""
            modifier = ""
            category = ""
            is_prev_tok_prep = False
            
        ## chunk 2: check if token is a compound word or not
        # Also taken here: a ROOT token that is neither a noun nor a proper noun.
        elif tok.dep_ != "punct" and (tok.dep_.lower() == 'compound' or (tok.pos_.lower() != 'propn' and tok.pos_.lower() != 'noun' and tok.dep_.lower() == 'root')):
            prefix = tok.text
            j = i + 1
            # Join the whole run of compound / non-noun-root tokens into one quality.
            while j < doc_len:
                tok_next = doc[j]
                if tok_next.dep_.lower() == 'compound' or (tok_next.pos_.lower() != 'propn' and tok_next.pos_.lower() != 'noun' and tok_next.dep_.lower() == 'root'):
                    prefix += " " + tok_next.text
                    j += 1
                else:
                    break
            i = j - 1
            quality_list.append(prefix)
            prefix = ""
            modifier = ""
            category = ""
            is_prev_tok_prep = False
            
        ## chunk 3: check if token is a noun or not    
        elif (tok.pos_.lower() == 'propn' or tok.pos_.lower() == 'noun'):
            # `prefix` is reset to "" at the end of every branch, so it adds nothing here.
            category = prefix + " " + tok.text
            j = i + 1
            # Join the run of consecutive nouns / proper nouns into one category.
            while j < doc_len:
                tok_next = doc[j]
                if (tok_next.pos_.lower() == 'propn' or tok_next.pos_.lower() == 'noun'):
                    category += " " + tok_next.text
                    j += 1
                else:
                    break
            
            # Half-open token range [i, j) of this category, used for the relation pass below.
            category_tok_indices.append((i, j))
            i = j - 1
            category = category.strip()
            category = " ".join(category.split())
            category_list.append(category)
            prefix = ""
            modifier = ""
            category = ""
            is_prev_tok_prep = False
            
                  
        ## chunk 4: check if token is a prep or not and to extract meaning around preposition
        elif tok.dep_ == 'prep':
            prep_string = ""
            preposition_list.append(tok.text)
            preposition_meaning_list.append(get_preposition_meaning(tok, prep_string))
            is_prev_tok_prep = True
        
        else:
            is_prev_tok_prep = False
        
        i += 1

    # Link each category to the qualities that mention one of its dependency children.
    category_feature_relation = []
    for cat_index, cat_tuple in enumerate(category_tok_indices):
        # Qualities already linked to this category (avoids duplicate relations).
        quality_set = set()
        for tok_index in range(cat_tuple[0], cat_tuple[1]):
            tok = doc[tok_index]
            for child in tok.children:
                for quality in quality_list:
                    # Substring test: the child's text only has to occur inside the quality text.
                    if child.text in quality and quality not in quality_set:
                        category_feature_relation.append(category_list[cat_index] + ":" + quality)
                        quality_set.add(quality)
                        # Only the first matching quality is linked per child.
                        break
            
    
    return quality_list, category_list, preposition_list, preposition_meaning_list, category_feature_relation

Let's Try Out a Few Examples
-------------------------

In [0]:
symbol_conv, word_num_conv, brand, autotags, retok_doc = matcher_retokenization(["dark green shirt"])
quality_tags, category_tags, preposition_tags, preposition_meaning_tags, category_feature_relation = unsupervised_feature_extraction(retok_doc[0])
print(quality_tags, category_tags, autotags)

symbol_conv, word_num_conv, brand, autotags, retok_doc = matcher_retokenization(["dark blue shirt"])
quality_tags, category_tags, preposition_tags, preposition_meaning_tags, category_feature_relation = unsupervised_feature_extraction(retok_doc[0])
print(quality_tags, category_tags, autotags)

In [0]:
symbol_conv, word_num_conv, brand, autotags, retok_doc = matcher_retokenization(["dark green long sleeve shirt with black pants"])
quality_tags, category_tags, preposition_tags, preposition_meaning_tags, category_feature_relation = unsupervised_feature_extraction(retok_doc[0])
print(quality_tags, category_tags, autotags, category_feature_relation, preposition_tags, preposition_meaning_tags)

Preposition Searches
--------------------

In [0]:
# queries = ["LG LED TV under twenty thousand and five hundred $", "Nike grinder under ten kilograms", "Puma Shoes above ten thousand and two hundred dollars", "Playstation between four thousand and five thousand rupees", "Android Phone 6GB RAM below 7 inches"]

Curated NLP Search
------------------

In [0]:
input_columns = ['Query', 'Preposition', 'Preposition Meaning', 'Brand', 'Autotags', 'Quality', 'Category', 'Category_Feature_Relation', 'Old Tagger', 'Is New Tagger Not Ideal?', 'Search Query Type']
output_columns = ['Query', 'Symbols', 'Symbols GT', 'Symbols Validation', 'Word Num', 'Word Num GT', 'Word Num Validation', 'Preposition', 'Preposition GT', 'Preposition Validation', 'Preposition Meaning', 'Preposition Meaning GT', 'Preposition Meaning Validation', 'Brand', 'Brand GT', 'Brand Validation', 'Autotags', 'Autotags GT', 'Autotags Validation', 'Quality', 'Quality GT', 'Quality Validation', 'Category', 'Category GT', 'Cateegory Validation', 'Category_Feature_Relation', 'Category_Feature_Relation GT', 'Category_Feature_Relation Validation', 'Old Tagger', 'Is New Tagger Not Ideal?', 'Search Query Type']
csv_name = "/dbfs/FileStore/shared_uploads/t_karthik.ragunath@tatadigital.com/nlp_search_curated_new.csv"
candidate_sentences = pd.read_csv(csv_name, encoding='utf-8', usecols=input_columns)

Applying Tagger To Test Set
---------------------------

In [0]:
custom_queries = True
quality_tags_list = []
category_tags_list = []
preposition_list = [] 
preposition_meaning_list = []
category_feature_relation_list = []
if custom_queries:
    symbol_conv_list, word_num_conv_list, brand_list, autotags_list, retokenized_docs = matcher_retokenization(columns=input_columns, csv_name=csv_name)
else:
    symbol_conv_list, word_num_conv_list, brand_list, autotags_list, retokenized_docs = matcher_retokenization()
for doc in retokenized_docs:
    quality_tags, category_tags, preposition_tags, preposition_meaning_tags, category_feature_relation = unsupervised_feature_extraction(doc)
    quality_tags_list.append(quality_tags)
    category_tags_list.append(category_tags)
    preposition_list.append(preposition_tags)
    preposition_meaning_list.append(preposition_meaning_tags)
    category_feature_relation_list.append(category_feature_relation)

In [0]:
set_pre_values = True
if not output_columns:
    qa_tag_extraction = pd.DataFrame(columns=['Query','Brand', 'Autotags', 'Quality', 'Category', 'Preposition', 'Preposition Meaning', 'Category_Feature_Relation'])
else:
    qa_tag_extraction = pd.DataFrame(columns=output_columns)

qa_tag_extraction['Query'] = list(candidate_sentences["Query"])
qa_tag_extraction['Symbols'] = symbol_conv_list
qa_tag_extraction['Word Num'] = word_num_conv_list
qa_tag_extraction['Brand'] = brand_list
qa_tag_extraction['Autotags'] = autotags_list
qa_tag_extraction['Quality'] = quality_tags_list
qa_tag_extraction['Category'] = category_tags_list
qa_tag_extraction['Preposition'] = preposition_list
qa_tag_extraction['Preposition Meaning'] = preposition_meaning_list
qa_tag_extraction['Category_Feature_Relation'] = category_feature_relation_list

if output_columns and set_pre_values:
    for output_column in output_columns:
        if ' GT' in output_column:
            qa_tag_extraction[output_column] = qa_tag_extraction[output_column.split(' GT')[0]]
    qa_tag_extraction['Old Tagger'] = candidate_sentences['Old Tagger']
    qa_tag_extraction['Is New Tagger Not Ideal?'] = candidate_sentences['Is New Tagger Not Ideal?']
    qa_tag_extraction['Search Query Type'] = candidate_sentences['Search Query Type']

qa_tag_extraction.to_csv('/dbfs/mnt/nemo/gt_validation.csv', header=True, columns=output_columns, encoding='utf-8')

Query Expansion
---------------
```
Using WordNet for Query Expansion
```

In [0]:
from nltk.corpus import wordnet as wn

```
Finding Synonyms for Bag
```

In [0]:
actual_word = 'bag'

In [0]:
wn.synsets(actual_word)

```
We are only interested in Noun Components
```

In [0]:
wn.synsets(actual_word, pos='n')

```
Fetching Synonyms, Synset and Hypernyms
```

In [0]:
synonyms = set()
synset = set()
synonyms_synset = set()
hypernyms_set = set()
for syn in wn.synsets(actual_word, pos='n'):
    for l in syn.lemmas():
        synonyms.add(l.name())
        synonyms_synset.update(wn.synsets(l.name()))
        synset.add(syn)
    for hyp in syn.hypernyms():
        for l in hyp.lemmas():
            hypernyms_set.add(l.name())

```
Let's view the generated lists
```

In [0]:
print(synonyms)
print('------------')
print(synset)
print('------------')
print(hypernyms_set)

```
Combining with Spacy to get the most similar word
```

In [0]:
# Score every WordNet synonym against `actual_word` with spaCy vector similarity.
# WordNet writes multi-word lemmas with underscores (e.g. 'cup_of_tea', 'travelling_bag');
# they are turned back into words and compared as a whole span, not via a single token.
word_list = list(synonyms)
similarity_scores = []
for word in word_list:
    word_str = actual_word + " " + word.replace("_", " ")
    tokens = nlp(word_str)
    # tokens[0] is the query word (assumed to be a single token); the rest is the candidate.
    token1, token2 = tokens[0], tokens[1:]
    score = token1.similarity(token2)
    similarity_scores.append((token1, token2, score))
    print(token1.text, token2.text, "Similarity:", score)

In [0]:
similarity_scores = sorted(similarity_scores, key = lambda tup_element: tup_element[2], reverse=True) 

In [0]:
threshold_similarity = 0.6

In [0]:
filtered_tups = list(filter(lambda x: x[2] >= threshold_similarity, similarity_scores))

In [0]:
filtered_tups

Hyperlinks to related POCs which are to be integrated into this work
---------------------------------------------------------------------

[Auto Correct](https://adb-7874961220978185.5.azuredatabricks.net/?o=7874961220978185#notebook/3868090962077216/command/3868090962077222)


[Custom Training of spaCy Models To Improve Default Models - NER, POS, Dependency Parser](https://adb-7874961220978185.5.azuredatabricks.net/?o=7874961220978185#notebook/3822701336911906/command/3822701336911907)


[Learning To Rank](https://adb-7874961220978185.5.azuredatabricks.net/?o=7874961220978185#notebook/56327818486021/command/3872413151480651)

ROUGH WORK

In [0]:
doc = nlp('men OFF WHITE blue full sleeve tshirt')
matches = matcher_lemma(doc)
for match_id, start, end in matches:
    string_id = nlp.vocab.strings[match_id]
    span = doc[start:end]
    print(span.text, match_id, string_id)

In [0]:
string = "Playstation between four thousand and five thousand ₹"

doc = nlp(string)
for tok in doc:
    print(tok.text, tok.dep_, tok.pos_, tok.lemma_)
    for child in tok.children:
        print('child:', child, 'type:', type(child))
    for child in tok.rights:
        print('rights:', child, 'type:', type(child))
    for child in tok.lefts:
        print('lefts:', child, 'type:', type(child))
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)