In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install BeautifulSoup4

In [None]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [None]:
candidate_sentences = pd.read_csv("../input/wiki-sentences/wiki_sentences_v2.csv")
candidate_sentences.shape

In [None]:
candidate_sentences['sentence'].sample(5)

In [None]:
doc = nlp("the drawdown process is governed by astm standard d823")

for tok in doc:
  print(tok.text, "...", tok.dep_)

In [None]:
def get_entities(sent):
  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      # check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## chunk 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [None]:
get_entities("the film had 200 patents")

In [None]:
entity_pairs = []

for i in tqdm(candidate_sentences["sentence"]):
  entity_pairs.append(get_entities(i))

In [None]:
entity_pairs[10:20]

In [None]:
def get_relation(sent):

  doc = nlp(sent)

#   for tok in doc:
#     print(tok.text, " ", tok.dep_, " ", tok.pos_)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  #define the pattern 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", None, pattern) 

  matches = matcher(doc)
#   print("Matches:", matches)
  k = len(matches) - 1
#   print("K:", k)
  span = doc[matches[k][1]:matches[k][2]] 

  return(span.text)

In [None]:
get_relation("John completed the task")

In [None]:
get_relation("John and Jan")

In [None]:
relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]

In [None]:
a = 1

In [None]:
pd.Series(relations).value_counts()[:50]

In [None]:
# extract subject
source = [i[0] for i in entity_pairs]

# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

In [None]:
kg_df.head()

In [None]:
G=nx.from_pandas_edgelist(kg_df, "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

In [None]:
plt.figure(figsize=(12,12))

pos = nx.spring_layout(G)
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
G=nx.from_pandas_edgelist(kg_df[kg_df['edge']=="composed by"], "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5) # k regulates the distance between nodes
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
G=nx.from_pandas_edgelist(kg_df[kg_df['edge']=="released in"], "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5)
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
!pip install pytorch-pretrained-bert pytorch-nlp

In [None]:
# Import Libraries

import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv("../input/cola-the-corpus-of-linguistic-acceptability/cola_public/raw/in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.sample(10)

In [None]:
# Create sentence and label lists
sentences = df.sentence.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

In [None]:
sentences[:10]

In [None]:
labels[:10]

In [None]:
df[:10]

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

In [None]:
MAX_LEN = 128

In [None]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [None]:
len(input_ids)

In [None]:
# input_ids[:2]

In [None]:
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
len(input_ids)

In [None]:
# input_ids[:2]

In [None]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [None]:
# attention_masks[:1]

In [None]:
df.label_notes.unique()

In [None]:
# Use train_test_split to split our data into train and validation sets for training

train_inputs, validation_inputs, train_labels, validation_labels = train_test_split(input_ids, labels, 
                                                            random_state=2018, test_size=0.1)
train_masks, validation_masks, _, _ = train_test_split(attention_masks, input_ids,
                                             random_state=2018, test_size=0.1)

In [None]:
print(len(train_inputs), len(validation_inputs))

In [None]:
7695 + 856

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [None]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
param_optimizer = list(model.named_parameters())
no_decay = ['bias', 'gamma', 'beta']
optimizer_grouped_parameters = [{'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],'weight_decay_rate': 0.01},
                                {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],'weight_decay_rate': 0.0}]

In [None]:
optimizer = BertAdam(optimizer_grouped_parameters,lr=2e-5,warmup=.1)

In [None]:
del df

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)

In [None]:
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    print('using device: cuda')
else:
    print('using device: cpu')

In [None]:
device = torch.device("cuda:0")
device

In [None]:
device = "cuda:0"
model = model.to(device)

In [None]:
t = [] 

# Store our loss and accuracy for plotting
train_loss_set = []

# Number of training epochs 
epochs = 2

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
#     Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put model in evaluation mode to evaluate loss on the validation set
  model.eval()

  # Tracking variables 
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
#     Add batch to GPU
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass, calculate logit predictions
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

In [None]:
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set)
plt.show()

In [None]:
del df

In [None]:
df = pd.read_csv("../input/cola-the-corpus-of-linguistic-acceptability/cola_public/raw/out_of_domain_dev.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

In [None]:
# Create sentence and label lists
sentences = df.sentence.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]


MAX_LEN = 128

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)
  
batch_size = 32  


prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Prediction on test set

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

In [None]:
print(predictions[0], true_labels[0])
print(len(predictions[0]), len(true_labels[0]))

In [None]:
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import matthews_corrcoef
matthews_set = []

for i in range(len(true_labels)):
  matthews = matthews_corrcoef(true_labels[i],
                 np.argmax(predictions[i], axis=1).flatten())
  matthews_set.append(matthews)

In [None]:
prediction_set = []
for i in range(len(true_labels)):
  matthews = np.argmax(predictions[i], axis=1).flatten()
  prediction_set.append(matthews)

In [None]:
matthews_set

In [None]:
prediction_set

In [None]:
true_labels

In [1]:
# ---------------------------

In [2]:
!pip install BeautifulSoup4

Collecting BeautifulSoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 405 kB/s eta 0:00:01
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.3 soupsieve-2.2.1


In [3]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [4]:
candidate_sentences = pd.read_csv('../input/qa-csv/qa.csv', delimiter='\t', encoding='utf-8', index_col=0)
candidate_sentences.shape

(613, 1)

In [5]:
candidate_sentences.columns

Index(['Query'], dtype='object')

In [6]:
candidate_sentences['Query'].sample(5)

536                    socks
585     platform espadrilles
188               pajama top
452    TOM FORD pencil skirt
525            multiway bras
Name: Query, dtype: object

In [7]:
candidate_sentences['Query'] = candidate_sentences['Query'].fillna('')

In [8]:
candidate_sentences['Query'][:5]

0                    bag
1          Fendi handbag
2    Fendi beige handbag
3              backpacks
4        black backpacks
Name: Query, dtype: object

In [11]:
def get_entities(sent):
    ## chunk 1
    ent1 = ""
    ent2 = ""
    sub_list = []
    obj_list = []

    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    prefix = ""
    modifier = ""
    subject = ""

    for tok in nlp(sent):
        print('text:', tok.text, 'dep:', tok.dep_, 'pos:', tok.pos_, 'tag:', tok.tag_)
        ## chunk 2
        # if token is a punctuation mark then move on to the next token
        if tok.dep_ != "punct":
            # check: token is a compound word or not
            if tok.dep_ == "compound":
                prefix = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " " + tok.text

        # check: token is a modifier or not
        if tok.dep_.endswith("mod") == True:
            # if the previous word was also a 'compound' then add the current word to it
            if prv_tok_dep == "compound":
                modifier = prefix + " " + tok.text
                prefix = " "
            elif prv_tok_dep.endswith("mod"):
                modifier = prv_tok_text + " " + tok.text
            else:
                modifier = tok.text

#           ## chunk 3
#         if tok.dep_.find("subj") == True:
#             ent1 = modifier +" "+ prefix + " "+ tok.text
#             sub_list.append(ent1.strip())
#             prefix = ""
#             modifier = ""
#             prv_tok_dep = ""
#             prv_tok_text = ""
        
        if (tok.dep_.find("subj") == True and tok.tag_.lower() not in ('prp')):
            subject = modifier + " " + prefix + " " + tok.text
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = "" 
        
        elif tok.pos_.lower() == 'propn':
            

          ## chunk 4
        elif tok.dep_.find("obj") == True or (tok.dep_ == 'ROOT' and tok.tag_.lower() not in ('vbp')):
            ent2 = (subject + " " + modifier + " " + prefix + " " + tok.text).strip()
            ent2 = " ".join(ent2.split())
#             print('subject:', subject, 'modifier:', modifier, 'prefix:', prefix, 'text:', tok.text)
            obj_list.append(ent2.strip())
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = ""
            subject = ""

        ## chunk 5  
        # update variables
        else:
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text

#     return [ent1.strip(), ent2.strip()]
    return sub_list, obj_list

In [12]:
def feature_extraction(sent):
    ## chunk 1
    ent1 = ""
    ent2 = ""
    sub_list = []
    obj_list = []

    prv_tok_dep = ""    # dependency tag of previous token in the sentence
    prv_tok_text = ""   # previous token in the sentence

    prefix = ""
    modifier = ""
    subject = ""
    doc = nlp(sent)
    doc_len = len(doc)
    i = 0
    while i < doc_len:
        tok = doc[i]
        print('text:', tok.text, 'dep:', tok.dep_, 'pos:', tok.pos_, 'tag:', tok.tag_)
        ## chunk 2
        # if token is a punctuation mark then move on to the next token

        # check: token is a modifier or not
        if tok.dep_.endswith("mod") == True:
            # if the previous word was also a 'compound' then add the current word to it
            if prv_tok_dep == "compound":
                modifier += " " + prefix + " " + tok.text
                prefix = " "
            elif prv_tok_dep.endswith("mod"):
                modifier += " " + prv_tok_text + " " + tok.text
            else:
                modifier += " " + tok.text

        ## chunk 3        
        elif tok.pos_.lower() == 'propn' or tok.pos_.lower() == 'noun':
            if tok.dep_.endswith("mod"):
                subject = prefix + " " + tok.text
            else:
                subject = modifier + " " + prefix + " " + tok.text
            j = i + 1
            while j < doc_len:
                tok_next = doc[j]
                if tok_next.pos_.lower() == 'propn' or tok_next.pos_.lower() == 'noun' :
                    subject += " " + tok_next.text
                    j += 1
                else:
                    break
            i = j - 1
            subject = subject.strip()
            subject = " ".join(subject.split())
            sub_list.append(subject)
            prefix = ""
            modifier = ""
            prv_tok_dep = ""
            prv_tok_text = ""
            subject = ""
            
        elif tok.dep_ != "punct":
            # check: token is a compound word or not
            if tok.dep_ == "compound":
                prefix = tok.text
                # if the previous word was also a 'compound' then add the current word to it
                if prv_tok_dep == "compound":
                    prefix = prv_tok_text + " " + tok.text
                if prv_tok_dep.endswith("mod"):
                    prefix = modifier + " " + tok.text

#         ## chunk 4
#         elif tok.dep_.find("obj") == True or (tok.dep_ == 'ROOT' and tok.tag_.lower() not in ('vbp')):
#             obj = (modifier + " " + prefix + " " + tok.text).strip()
#             obj = " ".join(obj.split())
#             obj_list.append(obj.strip())
#             prefix = ""
#             modifier = ""
#             prv_tok_dep = ""
#             prv_tok_text = ""
#             subject = ""

        ## chunk 5  
        # update variables
        else:
            prv_tok_dep = tok.dep_
            prv_tok_text = tok.text
            modifier = ""
        i += 1

    return sub_list

In [9]:
def unsupervised_feature_extraction(sent):
    ## chunk 1
    category_list = []
    quality_list = []

    prefix = ""
    modifier = ""
    category = ""
    doc = nlp(sent)
    doc_len = len(doc)
    i = 0
    while i < doc_len:
        tok = doc[i]
#         print('text:', tok.text, 'dep:', tok.dep_, 'pos:', tok.pos_, 'tag:', tok.tag_)
        
        ## chunk 3: check if token is a modifier or not
        if tok.dep_.endswith("mod") == True:
            modifier = prefix + " " + tok.text
            j = i + 1
            while j < doc_len:
                tok_next = doc[j]
                if tok_next.dep_.endswith("mod"):
                    modifier += " " + tok_next.text
                    j += 1
                else:
                    break
            i = j - 1
            modifier = modifier.strip()
            modifier = " ".join(modifier.split())
            quality_list.append(modifier)
            prefix = ""
            modifier = ""
            category = ""

        ## chunk 2: check if token is a noun or not    
        elif tok.pos_.lower() == 'propn' or tok.pos_.lower() == 'noun':
            category = prefix + " " + tok.text
            j = i + 1
            while j < doc_len:
                tok_next = doc[j]
                if (tok_next.pos_.lower() == 'propn' or tok_next.pos_.lower() == 'noun'):
                    category += " " + tok_next.text
                    j += 1
                else:
                    break
            i = j - 1
            category = category.strip()
            category = " ".join(category.split())
            category_list.append(category)
            prefix = ""
            modifier = ""
            category = ""

        
        ## chunk 4: check if token is a coumpuund word or not
        elif tok.dep_ != "punct":
            if tok.dep_.lower() == 'compound':
                prefix += " " + tok.text
                j = i + 1
                while j < doc_len:
                    tok_next = doc[j]
                    if tok_next.dep_.lower() == 'compound':
                        prefix += " " + tok_next.text
                        j += 1
                    else:
                        break
        i += 1

    return quality_list, category_list

In [13]:
# entity_pairs = []

# for i in tqdm(candidate_sentences["Query"][:100]):
#   entity_pairs.append(get_entities(i))

In [36]:
feature_extraction('TOMMY HILFIGER blue jacket')

text: TOMMY dep: compound pos: PROPN tag: NNP
text: blue dep: amod pos: ADJ tag: JJ
text: jacket dep: ROOT pos: NOUN tag: NN


['TOMMY HILFIGER', 'blue jacket']

In [79]:
unsupervised_feature_extraction('TOMMY HILFIGER blue jacket')

(['blue'], ['TOMMY HILFIGER', 'jacket'])

In [15]:
feature_extraction('Long Sleeve blue jacket')

text: Long dep: advmod pos: ADV tag: RB
text: Sleeve dep: nmod pos: PROPN tag: NNP
text: blue dep: amod pos: ADJ tag: JJ
text: jacket dep: ROOT pos: NOUN tag: NN


['Long Sleeve blue jacket']

In [80]:
unsupervised_feature_extraction('I have a dark green shirt and blue jean')

(['dark green', 'blue'], ['shirt', 'jean'])

In [17]:
feature_extraction('I have a dark green shirt and blue jean')

text: I dep: nsubj pos: PRON tag: PRP
text: have dep: ROOT pos: AUX tag: VBP
text: a dep: det pos: DET tag: DT
text: dark dep: amod pos: ADJ tag: JJ
text: green dep: amod pos: ADJ tag: JJ
text: shirt dep: dobj pos: NOUN tag: NN
text: and dep: cc pos: CCONJ tag: CC
text: blue dep: amod pos: PROPN tag: NNP
text: jean dep: conj pos: PROPN tag: NNP


['dark green shirt', 'blue jean']

In [20]:
get_entities('maison margiela beige argyle jumper')

text: maison dep: amod pos: PROPN tag: NNP
text: margiela dep: nsubj pos: PROPN tag: NNP
text: beige dep: amod pos: ADJ tag: JJ
text: argyle dep: compound pos: PROPN tag: NNP
text: jumper dep: ROOT pos: NOUN tag: NN


([], ['maison margiela beige argyle jumper'])

In [81]:
unsupervised_feature_extraction('maison margiela beige argyle jumper')

(['maison', 'beige'], ['margiela', 'argyle jumper'])

In [42]:
feature_extraction('maison margiela beige argyle jumper')

text: maison dep: amod pos: PROPN tag: NNP
text: beige dep: amod pos: ADJ tag: JJ
text: argyle dep: compound pos: PROPN tag: NNP


[['maison margiela', 'beige argyle jumper'], []]

In [43]:
feature_extraction('alexander mcqueen midi dress')

text: alexander dep: nsubj pos: PROPN tag: NNP


[['alexander mcqueen midi dress'], []]

In [87]:
# unsupervised_feature_extraction('alexander mcqueen midi dress')
unsupervised_feature_extraction('yellow short sleeve mini dress')

(['yellow short', 'mini'], ['sleeve', 'dress'])

In [60]:
for tok in nlp('yellow short sleeve mini dress'):
    print(tok.dep_, tok.pos_)

amod PROPN
amod ADJ
ROOT NOUN
amod ADJ
dobj NOUN


In [75]:
feature_pairs = []

for i in tqdm(candidate_sentences["Query"]):
    feature_pairs.append(feature_extraction(i))

100%|██████████| 613/613 [00:05<00:00, 118.76it/s]


In [10]:
quality_tags_list = []
category_tags_list = []

for i in tqdm(candidate_sentences["Query"]):
    quality_tags, category_tags = unsupervised_feature_extraction(i)
    quality_tags_list.append(quality_tags)
    category_tags_list.append(category_tags)

100%|██████████| 613/613 [00:03<00:00, 164.19it/s]


In [46]:
feature_extraction("I like yellow jackets")

text: I dep: nsubj pos: PRON tag: PRP
text: like dep: ROOT pos: VERB tag: VBP
text: yellow dep: amod pos: ADJ tag: JJ
text: jackets dep: dobj pos: NOUN tag: NNS


['yellow jackets']

In [83]:
unsupervised_feature_extraction("I like yellow jackets")

(['yellow'], ['jackets'])

In [48]:
feature_extraction("I like combo of blue jean jackets and white tshirts")

text: I dep: nsubj pos: PRON tag: PRP
text: like dep: ROOT pos: VERB tag: VBP
text: combo dep: dobj pos: NOUN tag: NN
text: of dep: prep pos: ADP tag: IN
text: blue dep: amod pos: ADJ tag: JJ
text: jean dep: compound pos: ADJ tag: JJ
text: jackets dep: pobj pos: NOUN tag: NNS
text: and dep: cc pos: CCONJ tag: CC
text: white dep: amod pos: ADJ tag: JJ
text: tshirts dep: conj pos: NOUN tag: NNS


['combo', 'blue jean jackets', 'white tshirts']

In [84]:
unsupervised_feature_extraction("I like combo of blue jean jackets and white tshirts")

(['blue', 'white'], ['combo', 'jean jackets', 'tshirts'])

In [85]:
unsupervised_feature_extraction("striped knee length dress")

(['striped'], ['knee length dress'])

In [58]:
for tok in nlp('striped knee length dress'):
    print(tok.dep_, tok.pos_)

amod VERB
compound NOUN
compound NOUN
ROOT NOUN


In [65]:
for tok in nlp('Fendi handbag'):
    print(tok.dep_, tok.pos_)

compound PROPN
ROOT NOUN


In [74]:
feature_pairs

[['bag'],
 ['Fendi handbag'],
 ['Fendi', 'beige handbag'],
 ['backpacks'],
 ['black backpacks'],
 ['blue bucket bag'],
 ['bucketbag'],
 ['crocodile print bag'],
 ['animal print bag'],
 ['tote bag'],
 ['totes'],
 ['FURLA pochette'],
 ['red CELINE crossbody'],
 ['Coach work bag'],
 ['pink coccinelle bum bag'],
 ['belt bag'],
 ['duffel bag'],
 ['faux leather wallet'],
 ['pink wallet'],
 ['Floral print midi dress'],
 ['red sheath dress'],
 ['black dress'],
 ['short sleeve', 'mini dress'],
 ['burgundy crewneck dress'],
 ['crew neck dress'],
 ['Off Shoulder Dress'],
 ['line dress'],
 ['animal print dress'],
 ['leopard print dress'],
 ['lace dress'],
 ['shirt dress'],
 ['knee length dress'],
 ['full sleeve dress'],
 ['shift sack dress'],
 ['maison margiela', 'long dress'],
 ['polka dot mini dress'],
 ['needle thread', 'black dress'],
 ['flared dress'],
 ['DRESS'],
 ['red shoulder midi dress'],
 ['alberta ferreti dress'],
 ['black maxi dress'],
 ['Bodycon'],
 ['Alice Mccall midi dress'],
 ['st

In [117]:
entity_pairs[:100]

[([], ['bag']),
 ([], ['Fendi handbag']),
 ([], ['Fendi beige handbag']),
 ([], ['backpacks']),
 ([], ['black backpacks']),
 ([], ['blue bucket bag']),
 ([], ['bucketbag']),
 ([], ['crocodile print bag']),
 ([], ['animal print bag']),
 ([], ['tote bag']),
 ([], ['totes']),
 ([], ['FURLA pochette']),
 ([], ['red CELINE crossbody']),
 ([], ['Coach work bag']),
 ([], ['pink coccinelle bum bag']),
 ([], ['belt bag']),
 ([], ['duffel bag']),
 ([], ['faux leather wallet']),
 ([], ['pink wallet']),
 ([], ['Floral print midi dress']),
 ([], ['red sheath dress']),
 ([], ['black dress']),
 ([], ['short sleeve', 'mini dress']),
 ([], ['burgundy crewneck dress']),
 ([], ['crew neck dress']),
 ([], ['Off', 'Shoulder Dress']),
 ([], ['line dress']),
 ([], ['animal print dress']),
 ([], ['leopard print dress']),
 ([], ['lace dress']),
 ([], ['shirt dress']),
 ([], ['striped', 'knee length dress']),
 ([], ['full sleeve dress']),
 ([], ['shift sack dress']),
 ([], ['long dress']),
 ([], ['polka dot min

In [54]:
candidate_sentences['Query'][:50]

0                                bag
1                      Fendi handbag
2                Fendi beige handbag
3                          backpacks
4                    black backpacks
5                    blue bucket bag
6                          bucketbag
7                crocodile print bag
8                   animal print bag
9                           tote bag
10                             totes
11                    FURLA pochette
12             red CELINE crossbody 
13                    Coach work bag
14           pink coccinelle bum bag
15                          belt bag
16               Burberry duffel bag
17               faux leather wallet
18                  pale pink wallet
19           Floral print midi dress
20                  red sheath dress
21                little black dress
22    yellow short sleeve mini dress
23           burgundy crewneck dress
24                   crew neck dress
25                Off Shoulder Dress
26                      A line dress
2

In [None]:
determinants = []

In [None]:
for i in tqdm(candidate_sentences["Query"]):
    local_dep = []
    for tok in nlp(i):
        local_dep.append(tok.dep_)
    determinants.append(local_dep)

In [None]:
determinants[:20]

In [45]:
candidate_sentences['Query'][:20]

0                         bag
1               Fendi handbag
2         Fendi beige handbag
3                   backpacks
4             black backpacks
5             blue bucket bag
6                   bucketbag
7         crocodile print bag
8            animal print bag
9                    tote bag
10                      totes
11             FURLA pochette
12      red CELINE crossbody 
13             Coach work bag
14    pink coccinelle bum bag
15                   belt bag
16        Burberry duffel bag
17        faux leather wallet
18           pale pink wallet
19    Floral print midi dress
Name: Query, dtype: object

In [None]:
entity_pairs[:50]

In [101]:
get_entities('TOMMY HILFIGER blue jacket')

['', 'HILFIGER blue TOMMY HILFIGER jacket']

In [76]:
candidate_sentences['features'] = feature_pairs

In [77]:
!mkdir /kaggle/working/training_models

In [78]:
candidate_sentences.to_csv('/kaggle/working/training_models/feature_extraction.csv')

In [57]:
quality_category_pairs[:30]

[[[], ['bag']],
 [[], ['Fendi handbag']],
 [['beige'], ['Fendi', 'handbag']],
 [[], ['backpacks']],
 [['black'], ['backpacks']],
 [['blue'], ['bucket bag']],
 [[], ['bucketbag']],
 [[], ['crocodile print bag']],
 [[], ['animal print bag']],
 [[], ['tote bag']],
 [[], ['totes']],
 [['FURLA'], ['pochette']],
 [['red'], ['CELINE crossbody']],
 [[], ['Coach work bag']],
 [['pink'], ['coccinelle bum bag']],
 [[], ['belt bag']],
 [['Burberry duffel'], ['bag']],
 [[], ['faux leather wallet']],
 [['pale pink'], ['wallet']],
 [['Floral'], ['print midi dress']],
 [['red'], ['sheath dress']],
 [['little black'], ['dress']],
 [['yellow short', 'mini'], ['sleeve', 'dress']],
 [[], ['burgundy crewneck dress']],
 [[], ['crew neck dress']],
 [[], ['Off Shoulder Dress']],
 [[], ['line dress']],
 [[], ['animal print dress']],
 [[], ['leopard print dress']],
 [[], ['lace dress']]]

In [58]:
candidate_sentences['Query'][:30]

0                                bag
1                      Fendi handbag
2                Fendi beige handbag
3                          backpacks
4                    black backpacks
5                    blue bucket bag
6                          bucketbag
7                crocodile print bag
8                   animal print bag
9                           tote bag
10                             totes
11                    FURLA pochette
12             red CELINE crossbody 
13                    Coach work bag
14           pink coccinelle bum bag
15                          belt bag
16               Burberry duffel bag
17               faux leather wallet
18                  pale pink wallet
19           Floral print midi dress
20                  red sheath dress
21                little black dress
22    yellow short sleeve mini dress
23           burgundy crewneck dress
24                   crew neck dress
25                Off Shoulder Dress
26                      A line dress
2

In [61]:
candidate_sentences.columns

Index(['Query'], dtype='object')

In [71]:
quality_tags_list[:5]

['beige', 'black', 'blue', 'FURLA', 'red']

In [11]:
candidate_sentences['Quality Tags'] = quality_tags_list
candidate_sentences['Category Tags'] = category_tags_list

In [12]:
quality_category_pairs[:10]

NameError: name 'quality_category_pairs' is not defined

In [13]:
candidate_sentences.columns

Index(['Query', 'Quality Tags', 'Category Tags'], dtype='object')

In [14]:
!mkdir ./dev_csvs

In [15]:
candidate_sentences.to_csv('./dev_csvs/quality_category_extraction.csv', sep='\t', encoding='utf-8')

In [16]:
color_dataset = pd.read_csv('../input/color-rgb-dataset/colours_rgb_shades.csv')

In [17]:
color_dataset.columns

Index(['Color Name', 'Credits', 'R;G;B Dec', 'RGB Hex', 'CSS Hex',
       'BG/FG color sample'],
      dtype='object')

In [20]:
color_dataset['Color Name'] = color_dataset['Color Name'].fillna('')

In [25]:
color_names = []
for color in color_dataset['Color Name']:
    color_names.append(''.join(' ' + c if c.isupper() else c for c in color).lower().strip())

In [26]:
color_names[:10]

['grey',
 'grey,  silver',
 'grey',
 'light gray',
 'light slate grey',
 'slate gray',
 'slate gray1',
 'slate gray2',
 'slate gray3',
 'slate gray4']

In [23]:
color_dataset['Color Name'][:10]

0              Grey
1      Grey, Silver
2              grey
3         LightGray
4    LightSlateGrey
5         SlateGray
6        SlateGray1
7        SlateGray2
8        SlateGray3
9        SlateGray4
Name: Color Name, dtype: object

In [28]:
len(color_names)

660

In [29]:
color_dataset['Color Names Cleaned'] = color_names

In [30]:
color_dataset.to_csv('color_names_and_codes.csv', sep='\t', encoding='utf-8')

In [31]:
a = 1

In [36]:
check_df = pd.read_csv('../input/check-set/check1.csv', header=0)

ParserError: Error tokenizing data. C error: Expected 1 fields in line 161, saw 2
