In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install BeautifulSoup4

In [None]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [None]:
candidate_sentences = pd.read_csv("../input/wiki-sentences/wiki_sentences_v2.csv")
candidate_sentences.shape

In [None]:
candidate_sentences['sentence'].sample(5)

In [None]:
doc = nlp("the drawdown process is governed by astm standard d823")

for tok in doc:
  print(tok.text, "...", tok.dep_)

In [None]:
def get_entities(sent):
  """Extract a rough (subject, object) entity pair from one sentence.

  The sentence is parsed with the module-level spaCy pipeline ``nlp`` and its
  tokens are scanned left to right. The most recent compound word and the
  most recent modifier seen so far are prepended to a subject/object token
  when it is found.

  Args:
    sent: Sentence text to parse.

  Returns:
    A two-element list ``[subject, object]`` of stripped strings. An element
    is ``""`` when no token with a matching dependency label was found; when
    several tokens match, the last one wins.
  """
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of the previous non-punctuation token
  prv_tok_text = ""   # text of the previous non-punctuation token

  prefix = ""
  modifier = ""

  for tok in nlp(sent):
    # Punctuation is skipped entirely, so it never becomes the
    # "previous token" either.
    if tok.dep_ == "punct":
      continue

    # Compound word: remember it, joined with a directly preceding compound.
    if tok.dep_ == "compound":
      prefix = tok.text
      if prv_tok_dep == "compound":
        prefix = prv_tok_text + " " + tok.text

    # Modifier (any dependency label ending in "mod").
    if tok.dep_.endswith("mod"):
      modifier = tok.text
      if prv_tok_dep == "compound":
        modifier = prv_tok_text + " " + tok.text

    # Substring test. The previous `tok.dep_.find("subj") == True` compared
    # the match *index* with True (== 1), so it only held for labels where
    # the substring started at position 1.
    if "subj" in tok.dep_:
      ent1 = modifier + " " + prefix + " " + tok.text
      # The subject consumes the collected words; the object starts fresh.
      prefix = ""
      modifier = ""

    if "obj" in tok.dep_:
      ent2 = modifier + " " + prefix + " " + tok.text

    prv_tok_dep = tok.dep_
    prv_tok_text = tok.text

  return [ent1.strip(), ent2.strip()]

In [None]:
get_entities("the film had 200 patents")

In [None]:
# One [subject, object] pair per candidate sentence, in sentence order.
entity_pairs = [
    get_entities(sentence)
    for sentence in tqdm(candidate_sentences["sentence"])
]

In [None]:
entity_pairs[10:20]

In [None]:
def get_relation(sent):
  """Return the relation (predicate) phrase of a sentence.

  The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
  matched against a single pattern: the ROOT token, optionally followed by a
  preposition, an agent token and an adjective.

  Args:
    sent: Sentence text to parse.

  Returns:
    The text of the last match reported by the matcher, or ``""`` when the
    pattern does not match at all (for example for empty text).
  """
  doc = nlp(sent)

  matcher = Matcher(nlp.vocab)

  pattern = [{'DEP':'ROOT'},
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},
            {'POS':'ADJ','OP':"?"}]

  # Patterns are passed as a list in the second argument. The legacy
  # `add(key, None, pattern)` form was removed in spaCy 3.
  # NOTE(review): the list form needs spaCy >= 2.2.2 — confirm the installed version.
  matcher.add("matching_1", [pattern])

  matches = matcher(doc)
  if not matches:
    # Previously `matches[-1]` raised IndexError on an empty match list.
    return ""

  # Each match is (match_id, start, end); use the last one reported.
  _, start, end = matches[-1]
  return doc[start:end].text

In [None]:
get_relation("John completed the task")

In [None]:
get_relation("John and Jan")

In [None]:
relations = [get_relation(i) for i in tqdm(candidate_sentences['sentence'])]

In [None]:
a = 1

In [None]:
pd.Series(relations).value_counts()[:50]

In [None]:
# extract subject
source = [i[0] for i in entity_pairs]

# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})

In [None]:
kg_df.head()

In [None]:
G=nx.from_pandas_edgelist(kg_df, "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

In [None]:
plt.figure(figsize=(12,12))

pos = nx.spring_layout(G)
nx.draw(G, with_labels=True, node_color='skyblue', edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
G=nx.from_pandas_edgelist(kg_df[kg_df['edge']=="composed by"], "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5) # k regulates the distance between nodes
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
G=nx.from_pandas_edgelist(kg_df[kg_df['edge']=="released in"], "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())

plt.figure(figsize=(12,12))
pos = nx.spring_layout(G, k = 0.5)
nx.draw(G, with_labels=True, node_color='skyblue', node_size=1500, edge_cmap=plt.cm.Blues, pos = pos)
plt.show()

In [None]:
!pip install pytorch-pretrained-bert pytorch-nlp

In [None]:
# Import Libraries

import tensorflow as tf
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from pytorch_pretrained_bert import BertTokenizer, BertConfig
from pytorch_pretrained_bert import BertAdam, BertForSequenceClassification
from tqdm import tqdm, trange
import pandas as pd
import io
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
df = pd.read_csv("../input/cola-the-corpus-of-linguistic-acceptability/cola_public/raw/in_domain_train.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

In [None]:
df.head()

In [None]:
df.shape

In [None]:
df.sample(10)

In [None]:
# Create sentence and label lists
sentences = df.sentence.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

In [None]:
sentences[:10]

In [None]:
labels[:10]

In [None]:
df[:10]

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]
print ("Tokenize the first sentence:")
print (tokenized_texts[0])

In [None]:
MAX_LEN = 128

In [None]:
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

In [None]:
len(input_ids)

In [None]:
input_ids[:2]

In [None]:
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

In [None]:
len(input_ids)

In [None]:
input_ids[:2]

In [None]:
# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask)

In [None]:
attention_masks[1]

In [None]:
df.label_notes.unique()

In [None]:
# Split inputs, labels and attention masks into train and validation sets.
# All three arrays go through ONE call so they are guaranteed to be cut with
# the same shuffled indices; the previous two separate calls only stayed
# aligned because the same random_state was repeated by hand.
(train_inputs, validation_inputs,
 train_labels, validation_labels,
 train_masks, validation_masks) = train_test_split(
    input_ids, labels, attention_masks,
    random_state=2018, test_size=0.1)

In [None]:
print(len(train_inputs), len(validation_inputs))

In [None]:
7695 + 856

In [None]:
# Convert all of our data into torch tensors, the required datatype for our model

train_inputs = torch.tensor(train_inputs)
validation_inputs = torch.tensor(validation_inputs)
train_labels = torch.tensor(train_labels)
validation_labels = torch.tensor(validation_labels)
train_masks = torch.tensor(train_masks)
validation_masks = torch.tensor(validation_masks)

In [None]:
# Select a batch size for training. For fine-tuning BERT on a specific task, the authors recommend a batch size of 16 or 32
batch_size = 32

# Create an iterator of our data with torch DataLoader. This helps save on memory during training because, unlike a for loop, 
# with an iterator the entire dataset does not need to be loaded into memory

train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

validation_data = TensorDataset(validation_inputs, validation_masks, validation_labels)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(validation_data, sampler=validation_sampler, batch_size=batch_size)

In [None]:
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)

In [None]:
param_optimizer = list(model.named_parameters())

In [None]:
print('==== Embedding Layer ====\n')
for p in param_optimizer[0:5]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [None]:
print('\n==== First Transformer ====\n')

for p in param_optimizer[5:21]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

print('\n==== Output Layer ====\n')

for p in param_optimizer[-4:]:
    print("{:<55} {:>12}".format(p[0], str(tuple(p[1].size()))))

In [None]:
for n, p in param_optimizer:
    print(n)
    break

In [None]:
# no_decay = ['bias', 'gamma', 'beta']
# Parameters whose name contains one of these fragments are excluded from weight decay.
no_decay = ['bias', 'LayerNorm.weight']

# The per-group key must be `weight_decay`: that is the option name BertAdam
# reads in current pytorch-pretrained-bert releases. With the old
# `weight_decay_rate` key the setting is ignored and the no-decay group still
# receives the optimizer's default decay.
# NOTE(review): very early releases used `weight_decay_rate` — confirm the installed version.
optimizer_grouped_parameters = [
    {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
     'weight_decay': 0.01},
    {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
     'weight_decay': 0.0},
]

In [None]:
len(optimizer_grouped_parameters)

In [None]:
# Build the optimizer over the two parameter groups with a 2e-5 learning rate.
# NOTE(review): `warmup=.1` is passed without `t_total`; confirm that the
# installed BertAdam actually applies a warmup schedule in that case — TODO verify.
optimizer = BertAdam(optimizer_grouped_parameters,lr=2e-5,warmup=.1)

In [None]:
del df

In [None]:
# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    """Fraction of rows in `preds` whose highest-scoring column equals the label.

    Args:
        preds: 2-D array of per-class scores, one row per example.
        labels: NumPy array of class ids; it is flattened before comparison.

    Returns:
        Number of matching predictions divided by the number of labels.
    """
    predicted_classes = np.argmax(preds, axis=1).flatten()
    expected_classes = labels.flatten()
    n_correct = np.sum(predicted_classes == expected_classes)
    return n_correct / len(expected_classes)

In [None]:
USE_GPU = True

if USE_GPU and torch.cuda.is_available():
    print('using device: cuda')
else:
    print('using device: cpu')

In [None]:
# Use the first CUDA device when one is available and fall back to the CPU
# otherwise, instead of hard-coding "cuda:0" regardless of the availability
# check made in the previous cell.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

In [None]:
# Move the model to the GPU only when CUDA is actually available; the
# hard-coded "cuda:0" made `model.to` fail on CPU-only sessions.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

In [None]:
# Fine-tuning cell: for each epoch, run one optimisation pass over
# `train_dataloader`, then measure accuracy on `validation_dataloader`.
# Side effects: updates `model` through `optimizer`, fills `train_loss_set`
# with one loss value per training batch, and prints one loss line and one
# accuracy line per epoch.

# NOTE(review): `t` is not read anywhere in this cell; the `t` used in the
# generator expressions below is a separate, expression-local name.
t = [] 

# Per-batch training loss, collected for plotting
train_loss_set = []

# Number of training epochs 
epochs = 2

# trange is a tqdm wrapper around the normal python range
for _ in trange(epochs, desc="Epoch"):
  
  
  # Training
  
  # Set our model to training mode (as opposed to evaluation mode)
  model.train()
  
  # Tracking variables: running loss sum, examples seen, batches seen
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0
  
  # Train the data for one epoch
  for step, batch in enumerate(train_dataloader):
    # Move every tensor of the batch to `device`
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader (same order as in the TensorDataset)
    b_input_ids, b_input_mask, b_labels = batch
    # Clear out the gradients (by default they accumulate)
    optimizer.zero_grad()
    # Forward pass; with `labels` supplied the result is used directly as the loss
    loss = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask, labels=b_labels)
    train_loss_set.append(loss.item())    
    # Backward pass
    loss.backward()
    # Update parameters and take a step using the computed gradient
    optimizer.step()
    
    
    # Update tracking variables
    # NOTE(review): `nb_tr_examples` is accumulated but never read in this cell.
    tr_loss += loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  # Mean of the per-batch losses for this epoch
  print("Train loss: {}".format(tr_loss/nb_tr_steps))
    
    
  # Validation

  # Put model in evaluation mode to evaluate on the validation set
  model.eval()

  # Tracking variables 
  # NOTE(review): `eval_loss` and `nb_eval_examples` are initialised but never updated.
  eval_loss, eval_accuracy = 0, 0
  nb_eval_steps, nb_eval_examples = 0, 0

  # Evaluate data for one epoch
  for batch in validation_dataloader:
    # Move every tensor of the batch to `device`
    batch = tuple(t.to(device) for t in batch)
    # Unpack the inputs from our dataloader
    b_input_ids, b_input_mask, b_labels = batch
    # Telling the model not to compute or store gradients, saving memory and speeding up validation
    with torch.no_grad():
      # Forward pass without labels; the result is treated as per-class logits
      logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)
    
    # Move logits and labels to CPU as NumPy arrays for the accuracy helper
    logits = logits.detach().cpu().numpy()
    label_ids = b_labels.to('cpu').numpy()

    tmp_eval_accuracy = flat_accuracy(logits, label_ids)
    
    eval_accuracy += tmp_eval_accuracy
    nb_eval_steps += 1

  # Unweighted mean of the per-batch accuracies (a smaller final batch counts
  # as much as a full one)
  print("Validation Accuracy: {}".format(eval_accuracy/nb_eval_steps))

In [None]:
plt.figure(figsize=(15,8))
plt.title("Training loss")
plt.xlabel("Batch")
plt.ylabel("Loss")
plt.plot(train_loss_set)
plt.show()

In [None]:
# `df` was already deleted by the earlier `del df` cell, so a second bare
# `del df` raises NameError on Restart & Run All. Drop the name only if it
# is still bound.
globals().pop("df", None)

In [None]:
df = pd.read_csv("../input/cola-the-corpus-of-linguistic-acceptability/cola_public/raw/out_of_domain_dev.tsv", delimiter='\t', header=None, names=['sentence_source', 'label', 'label_notes', 'sentence'])

In [None]:
# Create sentence and label lists
sentences = df.sentence.values

# We need to add special tokens at the beginning and end of each sentence for BERT to work properly
sentences = ["[CLS] " + sentence + " [SEP]" for sentence in sentences]
labels = df.label.values

tokenized_texts = [tokenizer.tokenize(sent) for sent in sentences]


MAX_LEN = 128

# Use the BERT tokenizer to convert the tokens to their index numbers in the BERT vocabulary
input_ids = [tokenizer.convert_tokens_to_ids(x) for x in tokenized_texts]

# Pad our input tokens
input_ids = pad_sequences(input_ids, maxlen=MAX_LEN, dtype="long", truncating="post", padding="post")

# Create attention masks
attention_masks = []

# Create a mask of 1s for each token followed by 0s for padding
for seq in input_ids:
  seq_mask = [float(i>0) for i in seq]
  attention_masks.append(seq_mask) 

prediction_inputs = torch.tensor(input_ids)
prediction_masks = torch.tensor(attention_masks)
prediction_labels = torch.tensor(labels)
  
batch_size = 32  


prediction_data = TensorDataset(prediction_inputs, prediction_masks, prediction_labels)
prediction_sampler = SequentialSampler(prediction_data)
prediction_dataloader = DataLoader(prediction_data, sampler=prediction_sampler, batch_size=batch_size)

In [None]:
# Prediction on test set

# Put model in evaluation mode
model.eval()

# Tracking variables 
predictions , true_labels = [], []

# Predict 
for batch in prediction_dataloader:
  # Add batch to GPU
  batch = tuple(t.to(device) for t in batch)
  # Unpack the inputs from our dataloader
  b_input_ids, b_input_mask, b_labels = batch
  # Telling the model not to compute or store gradients, saving memory and speeding up prediction
  with torch.no_grad():
    # Forward pass, calculate logit predictions
    logits = model(b_input_ids, token_type_ids=None, attention_mask=b_input_mask)

  # Move logits and labels to CPU
  logits = logits.detach().cpu().numpy()
  label_ids = b_labels.to('cpu').numpy()
  
  # Store predictions and true labels
  predictions.append(logits)
  true_labels.append(label_ids)

In [None]:
print(predictions[0], true_labels[0])
print(len(predictions[0]), len(true_labels[0]))

In [None]:
# Import and evaluate each test batch using Matthew's correlation coefficient
from sklearn.metrics import matthews_corrcoef
matthews_set = []

for i in range(len(true_labels)):
  matthews = matthews_corrcoef(true_labels[i],
                 np.argmax(predictions[i], axis=1).flatten())
  matthews_set.append(matthews)

In [None]:
# Predicted class id per example, one array per prediction batch.
# The loop temporary used to be called `matthews`, which overwrote the MCC
# value left by the previous cell with an array of class ids.
prediction_set = [
    np.argmax(batch_logits, axis=1).flatten()
    for batch_logits in predictions
]

In [None]:
matthews_set

In [None]:
prediction_set

In [None]:
true_labels

In [None]:
# ---------------------------

In [1]:
!pip install BeautifulSoup4

Collecting BeautifulSoup4
  Downloading beautifulsoup4-4.9.3-py3-none-any.whl (115 kB)
[K     |████████████████████████████████| 115 kB 905 kB/s eta 0:00:01
[?25hCollecting soupsieve>1.2
  Downloading soupsieve-2.2.1-py3-none-any.whl (33 kB)
Installing collected packages: soupsieve, BeautifulSoup4
Successfully installed BeautifulSoup4-4.9.3 soupsieve-2.2.1


In [2]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx

import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import defaultdict

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

In [None]:
candidate_sentences = pd.read_csv('../input/qa-csv/qa.csv', delimiter='\t', encoding='utf-8', index_col=0)
candidate_sentences.shape

In [None]:
candidate_sentences.columns

In [None]:
candidate_sentences['Query'].sample(5)

In [None]:
candidate_sentences['Query'] = candidate_sentences['Query'].fillna('')

In [None]:
candidate_sentences['Query'][:5]

In [None]:
# def get_entities(sent):
#     ## chunk 1
#     ent1 = ""
#     ent2 = ""
#     sub_list = []
#     obj_list = []

#     prv_tok_dep = ""    # dependency tag of previous token in the sentence
#     prv_tok_text = ""   # previous token in the sentence

#     prefix = ""
#     modifier = ""
#     subject = ""

#     for tok in nlp(sent):
#         print('text:', tok.text, 'dep:', tok.dep_, 'pos:', tok.pos_, 'tag:', tok.tag_)
#         ## chunk 2
#         # if token is a punctuation mark then move on to the next token
#         if tok.dep_ != "punct":
#             # check: token is a compound word or not
#             if tok.dep_ == "compound":
#                 prefix = tok.text
#                 # if the previous word was also a 'compound' then add the current word to it
#                 if prv_tok_dep == "compound":
#                     prefix = prv_tok_text + " " + tok.text

#         # check: token is a modifier or not
#         if tok.dep_.endswith("mod") == True:
#             # if the previous word was also a 'compound' then add the current word to it
#             if prv_tok_dep == "compound":
#                 modifier = prefix + " " + tok.text
#                 prefix = " "
#             elif prv_tok_dep.endswith("mod"):
#                 modifier = prv_tok_text + " " + tok.text
#             else:
#                 modifier = tok.text

# #           ## chunk 3
# #         if tok.dep_.find("subj") == True:
# #             ent1 = modifier +" "+ prefix + " "+ tok.text
# #             sub_list.append(ent1.strip())
# #             prefix = ""
# #             modifier = ""
# #             prv_tok_dep = ""
# #             prv_tok_text = ""
        
#         if (tok.dep_.find("subj") == True and tok.tag_.lower() not in ('prp')):
#             subject = modifier + " " + prefix + " " + tok.text
#             prefix = ""
#             modifier = ""
#             prv_tok_dep = ""
#             prv_tok_text = "" 
        
#         elif tok.pos_.lower() == 'propn':
            

#           ## chunk 4
#         elif tok.dep_.find("obj") == True or (tok.dep_ == 'ROOT' and tok.tag_.lower() not in ('vbp')):
#             ent2 = (subject + " " + modifier + " " + prefix + " " + tok.text).strip()
#             ent2 = " ".join(ent2.split())
# #             print('subject:', subject, 'modifier:', modifier, 'prefix:', prefix, 'text:', tok.text)
#             obj_list.append(ent2.strip())
#             prefix = ""
#             modifier = ""
#             prv_tok_dep = ""
#             prv_tok_text = ""
#             subject = ""

#         ## chunk 5  
#         # update variables
#         else:
#             prv_tok_dep = tok.dep_
#             prv_tok_text = tok.text

# #     return [ent1.strip(), ent2.strip()]
#     return sub_list, obj_list

In [None]:
def feature_extraction(sent, verbose=False):
    """Collect noun phrases, together with their leading modifiers, from text.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and scanned
    left to right:

    * tokens whose dependency label ends in "mod" are accumulated as modifiers;
    * a NOUN/PROPN token starts a phrase that also swallows every directly
      following NOUN/PROPN token; the accumulated modifiers and the most
      recent non-noun "compound" token are prepended to it;
    * a punctuation token discards the modifiers accumulated so far.

    Args:
        sent: Text to parse.
        verbose: When true, print text/dep/pos/tag for every visited token.
            This trace used to be printed unconditionally, which flooded the
            output when the function was run over a whole column.

    Returns:
        List of whitespace-normalised phrases in order of appearance.
    """
    # NOTE(review): the original also tracked the previous token's dependency
    # and text in order to merge adjacent compounds/modifiers, but that state
    # was only ever updated on punctuation tokens, so none of those merges
    # could fire. The dead branches and unused locals are removed here; the
    # returned phrases are unchanged.
    sub_list = []
    prefix = ""
    modifier = ""

    doc = nlp(sent)
    doc_len = len(doc)
    i = 0
    while i < doc_len:
        tok = doc[i]
        if verbose:
            print('text:', tok.text, 'dep:', tok.dep_, 'pos:', tok.pos_, 'tag:', tok.tag_)

        if tok.dep_.endswith("mod"):
            # Modifier: keep accumulating until a noun phrase consumes it.
            modifier += " " + tok.text

        elif tok.pos_.lower() in ('propn', 'noun'):
            subject = modifier + " " + prefix + " " + tok.text
            # Extend the phrase over the run of nouns that follows directly.
            j = i + 1
            while j < doc_len and doc[j].pos_.lower() in ('propn', 'noun'):
                subject += " " + doc[j].text
                j += 1
            i = j - 1  # resume after the consumed run (i is incremented below)
            sub_list.append(" ".join(subject.split()))
            prefix = ""
            modifier = ""

        elif tok.dep_ != "punct":
            # Only non-noun compounds reach this branch; nouns are handled above.
            if tok.dep_ == "compound":
                prefix = tok.text

        else:
            # Punctuation: modifiers do not carry across it.
            modifier = ""
        i += 1

    return sub_list

In [None]:
def unsupervised_feature_extraction(sent):
    """Split text into "quality" (modifier) phrases and "category" (noun) phrases.

    The text is parsed with the module-level spaCy pipeline ``nlp`` and scanned
    left to right:

    * a run of tokens whose dependency label ends in "mod" becomes one entry
      of the quality list;
    * a run of NOUN/PROPN tokens becomes one entry of the category list;
    * a run of other tokens labelled "compound" is collected as a prefix and
      prepended to the next quality or category phrase.

    Args:
        sent: Text to parse.

    Returns:
        Tuple ``(quality_list, category_list)`` of whitespace-normalised
        phrases, each in order of appearance.
    """
    category_list = []
    quality_list = []
    prefix = ""

    doc = nlp(sent)
    doc_len = len(doc)
    i = 0
    while i < doc_len:
        tok = doc[i]

        # Modifier run -> quality phrase.
        if tok.dep_.endswith("mod"):
            modifier = prefix + " " + tok.text
            j = i + 1
            while j < doc_len and doc[j].dep_.endswith("mod"):
                modifier += " " + doc[j].text
                j += 1
            i = j - 1  # resume after the consumed run (i is incremented below)
            quality_list.append(" ".join(modifier.split()))
            prefix = ""

        # Noun run -> category phrase.
        elif tok.pos_.lower() in ('propn', 'noun'):
            category = prefix + " " + tok.text
            j = i + 1
            while j < doc_len and doc[j].pos_.lower() in ('propn', 'noun'):
                category += " " + doc[j].text
                j += 1
            i = j - 1
            category_list.append(" ".join(category.split()))
            prefix = ""

        # Compound run (tokens that are neither modifiers nor nouns) -> prefix.
        elif tok.dep_ != "punct":
            if tok.dep_.lower() == 'compound':
                prefix += " " + tok.text
                j = i + 1
                while j < doc_len and doc[j].dep_.lower() == 'compound':
                    prefix += " " + doc[j].text
                    j += 1
                # Skip the tokens just folded into the prefix. Without this
                # the outer loop visited them again and their text ended up
                # in the resulting phrase twice.
                i = j - 1
        i += 1

    return quality_list, category_list

In [None]:
# entity_pairs = []

# for i in tqdm(candidate_sentences["Query"][:100]):
#   entity_pairs.append(get_entities(i))

In [None]:
feature_extraction('TOMMY HILFIGER blue jacket')

In [None]:
unsupervised_feature_extraction('TOMMY HILFIGER blue jacket')

In [None]:
feature_extraction('Long Sleeve blue jacket')

In [None]:
unsupervised_feature_extraction('I have a dark green shirt and blue jean')

In [None]:
feature_extraction('I have a dark green shirt and blue jean')

In [None]:
# get_entities('maison margiela beige argyle jumper')

In [None]:
unsupervised_feature_extraction('maison margiela beige argyle jumper')

In [None]:
feature_extraction('maison margiela beige argyle jumper')

In [None]:
feature_extraction('alexander mcqueen midi dress')

In [None]:
# unsupervised_feature_extraction('alexander mcqueen midi dress')
unsupervised_feature_extraction('yellow short sleeve mini dress')

In [None]:
for tok in nlp('yellow short sleeve mini dress'):
    print(tok.dep_, tok.pos_)

In [None]:
# feature_pairs = []

# for i in tqdm(candidate_sentences["Query"]):
#     feature_pairs.append(feature_extraction(i))

In [None]:
quality_tags_list = []
category_tags_list = []

for i in tqdm(candidate_sentences["Query"]):
    quality_tags, category_tags = unsupervised_feature_extraction(i)
    quality_tags_list.append(quality_tags)
    category_tags_list.append(category_tags)

In [None]:
feature_extraction("I like yellow jackets")

In [None]:
unsupervised_feature_extraction("I like yellow jackets")

In [None]:
feature_extraction("I like combo of blue jean jackets and white tshirts")

In [None]:
unsupervised_feature_extraction("I like combo of blue jean jackets and white tshirts")

In [None]:
unsupervised_feature_extraction("striped knee length dress")

In [None]:
for tok in nlp('striped knee length dress'):
    print(tok.dep_, tok.pos_)

In [None]:
for tok in nlp('Fendi handbag'):
    print(tok.dep_, tok.pos_)

In [None]:
feature_pairs

In [None]:
entity_pairs[:100]

In [None]:
candidate_sentences['Query'][:50]

In [None]:
determinants = []

In [None]:
for i in tqdm(candidate_sentences["Query"]):
    local_dep = []
    for tok in nlp(i):
        local_dep.append(tok.dep_)
    determinants.append(local_dep)

In [None]:
determinants[:20]

In [None]:
candidate_sentences['Query'][:20]

In [None]:
entity_pairs[:50]

In [None]:
get_entities('TOMMY HILFIGER blue jacket')

In [None]:
# `feature_pairs` is only assigned in a commented-out cell, so on a fresh
# kernel this cell raised NameError. Build it here, the same way that cell
# did: one feature list per query.
feature_pairs = [feature_extraction(query) for query in tqdm(candidate_sentences["Query"])]
candidate_sentences['features'] = feature_pairs

In [None]:
# -p: do not fail when the directory already exists (e.g. when the cell is re-run).
!mkdir -p /kaggle/working/training_models

In [None]:
candidate_sentences.to_csv('/kaggle/working/training_models/feature_extraction.csv')

In [None]:
quality_category_pairs[:30]

In [None]:
candidate_sentences['Query'][:30]

In [None]:
candidate_sentences.columns

In [None]:
quality_tags_list[:5]

In [None]:
candidate_sentences['Quality Tags'] = quality_tags_list
candidate_sentences['Category Tags'] = category_tags_list

In [None]:
quality_category_pairs[:10]

In [None]:
candidate_sentences.columns

In [None]:
# -p: do not fail when the directory already exists (e.g. when the cell is re-run).
!mkdir -p ./dev_csvs

In [None]:
candidate_sentences.to_csv('./dev_csvs/quality_category_extraction.csv', sep='\t', encoding='utf-8')

In [None]:
color_dataset = pd.read_csv('../input/color-rgb-dataset/colours_rgb_shades.csv')

In [None]:
color_dataset.columns

In [None]:
color_dataset['Color Name'] = color_dataset['Color Name'].fillna('')

In [None]:
# Clean every colour name: put a space before each upper-case letter, then
# lower-case the result and strip surrounding whitespace.
color_names = [
    ''.join(' ' + ch if ch.isupper() else ch for ch in raw_name).lower().strip()
    for raw_name in color_dataset['Color Name']
]

In [None]:
color_names[:10]

In [None]:
color_dataset['Color Name'][:10]

In [None]:
len(color_names)

In [None]:
color_dataset['Color Names Cleaned'] = color_names

In [None]:
color_dataset.to_csv('color_names_and_codes.csv', sep='\t', encoding='utf-8')

In [None]:
a = 1

In [12]:
# Load the tab-separated checker data, dropping rows that do not parse.
# NOTE(review): `error_bad_lines` is deprecated since pandas 1.3 in favour of
# `on_bad_lines`; switch once the environment's pandas version allows it.
# NOTE(review): malformed rows are dropped (pandas reports them as
# "Skipping line ..."); confirm that losing those rows is acceptable.
check_df = pd.read_csv('../input/datacleaner/data_cleanser.csv', header=0, delimiter='\t', error_bad_lines=False)

b'Skipping line 1114: expected 1 fields, saw 2\nSkipping line 2126: expected 1 fields, saw 2\nSkipping line 2296: expected 1 fields, saw 2\nSkipping line 4922: expected 1 fields, saw 2\nSkipping line 9500: expected 1 fields, saw 2\nSkipping line 12578: expected 1 fields, saw 2\nSkipping line 15295: expected 1 fields, saw 2\nSkipping line 18309: expected 1 fields, saw 2\nSkipping line 21193: expected 1 fields, saw 2\nSkipping line 24723: expected 1 fields, saw 2\nSkipping line 35928: expected 1 fields, saw 2\nSkipping line 38161: expected 1 fields, saw 2\nSkipping line 47049: expected 1 fields, saw 2\nSkipping line 51626: expected 1 fields, saw 2\n'


In [None]:
check_df.columns

In [None]:
len(check_df)

In [None]:
color_names_set = set(color_names)

In [None]:
if 'dark green' in color_names:
    print('yes')

In [None]:
# Largest number of words in any single colour name. An entry of
# `color_names` may hold several comma-separated names.
# The previous bare `except:` is dropped: splitting a string cannot raise,
# and had it triggered, the comparison after it would have used a stale or
# undefined `length`.
max_len = 0
for ind_color in color_names:
    for color in ind_color.split(','):
        color = ' '.join(color.split())  # collapse repeated whitespace
        max_len = max(max_len, len(color.split(' ')))

In [None]:
max_len

In [16]:
from collections import defaultdict

In [None]:
color_dictionary = defaultdict(list)
for ind_color in color_names:
    comma_sep_colors = ind_color.split(',')
    for color in comma_sep_colors:
        color = ' '.join(color.split())
        split_color = color.split(' ')
        length = len(split_color)
        for i in range(length):
            color_dictionary[i + 1].append(' '.join(split_color[0:(i + 1)]))

In [None]:
color_dictionary[3]

In [None]:
color_dictionary[5]

In [17]:
check_df['data_checker'] = check_df['data_checker'].fillna('')

In [18]:
data_checker_list = list(check_df['data_checker'])

In [19]:
# Largest number of words in any single comma-separated item of
# `data_checker_list`.
# The previous bare `except:` is dropped: splitting a string cannot raise,
# and had it triggered, the comparison after it would have used a stale or
# undefined `length`.
max_len = 0
for ind_data in data_checker_list:
    for datum in ind_data.split(','):
        datum = ' '.join(datum.split())  # collapse repeated whitespace
        max_len = max(max_len, len(datum.split(' ')))

In [20]:
max_len

11

In [21]:
# Index every word-prefix of every comma-separated item by its length in
# words: data_checker_dict[n] collects all n-word prefixes.
# The stray `max_len = 0` that used to sit here is removed; it was never read
# in this cell and overwrote the maximum computed two cells earlier.
data_checker_dict = defaultdict(list)
for ind_data in data_checker_list:
    comma_sep_data = ind_data.split(',')
    for datum in comma_sep_data:
        datum = ' '.join(datum.split())  # collapse repeated whitespace
        datum_split = datum.split(' ')
        length = len(datum_split)
        for i in range(length):
            data_checker_dict[i + 1].append(' '.join(datum_split[0:(i + 1)]))

In [22]:
data_checker_dict[2][:5]

['E-Land American', 'JoJo Maman', 'Souris Mini', 'Janie and', 'CCH Collection']

In [None]:
def unsupervised_feature_extraction(sent):
    """Extract modifier ("quality") and noun ("category") phrases from a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    scanned left to right:

    * a run of consecutive tokens whose dependency label ends in ``"mod"``
      becomes one quality phrase;
    * a run of consecutive ``NOUN``/``PROPN`` tokens becomes one category
      phrase;
    * a run of consecutive ``compound`` tokens that belongs to neither group
      is remembered as a prefix and prepended to the next phrase emitted.

    Args:
        sent: Text to analyse.

    Returns:
        ``(quality_list, category_list)``: two lists of whitespace-normalised
        phrases, each in order of appearance.
    """
    def is_modifier(token):
        return token.dep_.endswith("mod")

    def is_noun(token):
        return token.pos_.lower() in ("propn", "noun")

    def is_compound(token):
        return token.dep_.lower() == "compound"

    doc = nlp(sent)
    doc_len = len(doc)

    def consume_run(start, belongs):
        """Return (texts, end) for the run starting at ``start`` whose following tokens satisfy ``belongs``."""
        end = start + 1
        while end < doc_len and belongs(doc[end]):
            end += 1
        return [doc[k].text for k in range(start, end)], end

    quality_list = []
    category_list = []
    prefix = ""  # compound words waiting for the next phrase
    i = 0
    while i < doc_len:
        tok = doc[i]

        # Modifiers are checked first, so a noun used as a modifier is a quality.
        if is_modifier(tok):
            words, i = consume_run(i, is_modifier)
            quality_list.append(" ".join((prefix + " " + " ".join(words)).split()))
            prefix = ""

        elif is_noun(tok):
            words, i = consume_run(i, is_noun)
            category_list.append(" ".join((prefix + " " + " ".join(words)).split()))
            prefix = ""

        # Compound word that is neither a modifier nor a noun.
        elif is_compound(tok):
            # Jump past the whole run. The previous version looked ahead but
            # left the index untouched, so every following compound token was
            # visited again and ended up in the prefix twice.
            words, i = consume_run(i, is_compound)
            prefix += " " + " ".join(words)

        else:
            # Punctuation and every other token are ignored.
            i += 1

    return quality_list, category_list

In [None]:
# First-pass brand extraction: for every query, find the first word that starts
# a known data_checker entry, extend the match greedily while the growing
# phrase is still a known prefix, and record the matched phrase (or '').
# Expects data_checker_dict in its "word count -> list of prefixes" form.
# NOTE(review): the longest matching *prefix* is taken, which need not be a
# complete entry.
quality_tags_list = []
category_tags_list = []
brand_list = []

# Set views of the buckets: membership on the original lists is a linear scan.
prefixes_by_len = {n_words: set(prefixes) for n_words, prefixes in data_checker_dict.items()}
first_words = prefixes_by_len.get(1, set())

for k in tqdm(candidate_sentences["Query"]):
    sentence_split = k.split()
    split_len = len(sentence_split)
    brand_str = ""
    for i in range(split_len):
        if sentence_split[i] in first_words:
            start_index = i
            loop_index = i + 1
            # Compare the whole candidate phrase with the bucket of matching
            # word count. The earlier condition tested a single word against
            # multi-word prefixes, which can never match, so only one-word
            # brands were ever found.
            while (loop_index < split_len
                   and ' '.join(sentence_split[start_index:loop_index + 1])
                   in prefixes_by_len.get(loop_index - start_index + 1, ())):
                loop_index += 1
            brand_str = ' '.join(sentence_split[start_index:loop_index])
            # Remove the brand words so the remainder can be analysed on its own.
            sentence_split[start_index:loop_index] = []
            k = ' '.join(sentence_split)
            break
    brand_list.append(brand_str)

In [None]:
# Number of brand entries collected.
len(brand_list)

In [None]:
# Number of queries, for comparison with the length of brand_list.
len(candidate_sentences["Query"])

In [None]:
# Distinct brand strings found.
brand_set = set(brand_list)

In [None]:
# Display the distinct brand strings.
brand_set

In [None]:
# Is the exact string 'white' one of the data_checker entries?
'white' in data_checker_list

In [None]:
# --- Longest colour name, counted in space-separated words -------------------
max_len = 0
for ind_color in color_names:
    comma_sep_colors = ind_color.split(',')
    for color in comma_sep_colors:
        color = ' '.join(color.split())
        # The bare `except:` that used to wrap this line is gone: str.split
        # cannot raise here, and the handler would have left `length` stale
        # (or undefined on the first iteration).
        length = len(color.split(' '))
        if length > max_len:
            max_len = length

# --- Prefix map: phrase -> set of phrases that extend it by one word ---------
# The '' key holds every first word; each further key is a word-prefix of some
# colour name and maps to the prefixes that are exactly one word longer.
color_dictionary = defaultdict(set)
color_set = set(color_names)
for ind_color in color_names:
    comma_sep_colors = ind_color.split(',')
    for color in comma_sep_colors:
        color = ' '.join(color.split())
        split_color = color.split(' ')
        length = len(split_color)
        initial_key = ""
        for i in range(length):
            color_dictionary[initial_key].add(' '.join(split_color[0:(i + 1)]))
            initial_key = ' '.join(split_color[0:(i + 1)])

In [23]:
# --- Prepare the raw entries -------------------------------------------------
check_df['data_checker'] = check_df['data_checker'].fillna('')
data_checker_list = list(check_df['data_checker'])

# --- Longest entry, counted in space-separated words -------------------------
max_len = 0
for ind_data in data_checker_list:
    comma_sep_data = ind_data.split(',')
    for datum in comma_sep_data:
        datum = ' '.join(datum.split())
        # The bare `except:` that used to wrap this line is gone: str.split
        # cannot raise here, and the handler would have left `length` stale
        # (or undefined on the first iteration).
        length = len(datum.split(' '))
        if length > max_len:
            max_len = length

# --- Lookup structures -------------------------------------------------------
data_checker_dict = defaultdict(set)
# Complete entries, minus single-character strings and the generic words in
# words_to_remove.
data_checker_set = set(data_checker_list)
words_to_remove = ['Dress', 'Skirt', 'Black']
data_checker_set = set(filter(lambda x:(len(x)!=1 and x not in words_to_remove), data_checker_set))
# Prefix map: the '' key holds every first word; each further key is a
# word-prefix of some entry and maps to the prefixes exactly one word longer.
for ind_data in data_checker_list:
    comma_sep_data = ind_data.split(',')
    for datum in comma_sep_data:
        datum = ' '.join(datum.split())
        datum_split = datum.split(' ')
        length = len(datum_split)
        initial_key = ""
        for i in range(length):
            data_checker_dict[initial_key].add(' '.join(datum_split[0:(i + 1)]))
            initial_key = ' '.join(datum_split[0:(i + 1)])

In [24]:
# Number of distinct entries kept in data_checker_set.
len(data_checker_set)

54741

In [25]:
# Number of keys currently in data_checker_dict.
len(data_checker_dict.keys())

33574

In [None]:
# Display the keys of color_dictionary.
(color_dictionary.keys())

In [None]:
# For every query: strip the first brand found via the prefix map, then run
# quality/category extraction on what is left.
quality_tags_list = []
category_tags_list = []
brand_list = []
for k in tqdm(candidate_sentences["Query"]):
    sentence_split = k.split()
    split_len = len(sentence_split)
    brand_str = ""

    for start_index in range(split_len):
        # .get() instead of [] throughout: indexing a defaultdict inserts an
        # empty set for every missing key, so each probe used to grow
        # data_checker_dict.
        if sentence_split[start_index] not in data_checker_dict.get("", ()):
            continue
        brand_tuple_index = tuple()
        loop_index = start_index + 1
        # Walk the prefix map one word at a time, remembering the longest
        # phrase that is also a complete entry of data_checker_set.
        while loop_index <= split_len:
            candidate = " ".join(sentence_split[start_index:loop_index])
            parent = " ".join(sentence_split[start_index:(loop_index - 1)])
            if candidate not in data_checker_dict.get(parent, ()):
                break
            if candidate in data_checker_set:
                brand_tuple_index = (start_index, loop_index)
            loop_index += 1
        if brand_tuple_index:
            brand_str = ' '.join(sentence_split[brand_tuple_index[0]:brand_tuple_index[1]])
            print('brand:', brand_str, 'sentence:', k)
            # Remove the brand words before the linguistic analysis.
            sentence_split[brand_tuple_index[0]:brand_tuple_index[1]] = []
            break
    # Exactly one entry per query; '' when no brand was recognised.
    brand_list.append(brand_str)
    k = ' '.join(sentence_split)
    quality_tags, category_tags = unsupervised_feature_extraction(k)
    quality_tags_list.append(quality_tags)
    category_tags_list.append(category_tags)

In [None]:
# Check whether the one-character string 'A' is in data_checker_set.
'A' in data_checker_set

In [None]:
# Scratch list for the slicing/join experiment.
a = ['a', 'b']

In [None]:
" ".join(a[0:0])

In [None]:
# Scratch defaultdict whose missing keys start as empty sets.
checker = defaultdict(set)

In [None]:
# Adding to a missing key creates the set on the fly.
checker['a'].add('v')

In [None]:
# Scratch: an empty tuple.
b = tuple()

In [None]:
# Scratch: rebind b to a two-element tuple.
b = (1,2)

In [None]:
# Show the type of b.
type(b)

In [None]:
# Scratch: rebind a to a one-element list.
a = ['a']

In [None]:
# Slice holding the first element of a.
a[0:1]

In [None]:
# First element of b.
b[0]

In [None]:
# Display every collected brand entry.
brand_list

In [None]:
# Is the exact string 'Alexander Mcqueen' one of the data_checker entries?
'Alexander Mcqueen' in data_checker_list

In [None]:
# Scratch check of the length filter: every length-1 string is dropped.
myset = {item for item in ('a', 'b', 'ab') if len(item) != 1}
print(myset)

In [None]:
# Attach the per-query extraction results and save them as a tab-separated file.
candidate_sentences['Quality Tags'] = quality_tags_list
candidate_sentences['Category Tags'] = category_tags_list
candidate_sentences['Brand List'] = brand_list
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which reports an error as
# soon as the directory already exists (e.g. on a re-run) and only works where
# a shell is available. `os` is imported in the notebook's first cell.
os.makedirs('./dev_csvs', exist_ok=True)
candidate_sentences.to_csv('./dev_csvs/quality_category_brand_extraction.csv', sep='\t', encoding='utf-8')

In [None]:
# Scratch dict used for the JSON round-trip below.
d = {'1':['2']}

In [None]:
# Serialise the scratch dict to a JSON string (shown as the cell output).
import json
json.dumps(d)

In [None]:
# Start from an empty frame with a single 'tags' column.
import pandas as pd
df = pd.DataFrame(columns=['tags'])

In [None]:
# Write the frame to disk (pandas defaults: index and header are written too).
df.to_csv('./current_tagger.csv')

In [None]:
# Read the file back without treating its first line as a header, naming the
# column 'tags'.
# NOTE(review): the file was written with an index column and a header row, so
# both come back as data here -- confirm this is intended.
df = pd.read_csv('./current_tagger.csv', header=None, names=['tags'])

In [None]:
# Display the frame as read back from disk.
df

In [None]:
# Add one row holding the JSON-encoded dict. DataFrame.append was removed in
# pandas 2.0; pd.concat with a one-row frame is the supported equivalent.
df = pd.concat([df, pd.DataFrame([{'tags': json.dumps(d)}])], ignore_index=True)

In [None]:
# Overwrite the file with the frame that now contains the extra row.
df.to_csv('./current_tagger.csv')

In [None]:
# Read the file back with pandas' default header handling.
df = pd.read_csv('./current_tagger.csv')

In [None]:
# Display the frame as read back from disk.
df

In [None]:
# ------------------ Entity Recognition

In [None]:
# Sample query for the experiments in the following cells; uncomment one of the
# alternatives below to try a different query.
text = "LG Premium 50L cooker under 50000"
# text = "LG Premium Cooker with capacity under 30L"
# text = 'yellow short sleeve mini dress'
# text = 'yellow short sleeve mini dress between 1000 and 2000'
# text = "LG Premium cooker under 50 dollars"
# text = "LG Premium cooker under 50 dollars with discount"

In [None]:
# (Re)load the small English spaCy pipeline and parse the sample query.
nlp = spacy.load("en_core_web_sm")
doc = nlp(text)

In [None]:
# Per-token dependency label and part of speech ...
for tok in doc:
    print(tok.text, tok.dep_, tok.pos_)
# ... followed by the named entities spaCy detected, with their character
# offsets and labels.
print(doc.ents)
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
# Quality and category phrases extracted from the sample query.
unsupervised_feature_extraction(text)

In [26]:
def get_preposition_meaning(token, cur_string):
    """Return the words governed by ``token``, in sentence order.

    The text of every descendant of ``token`` in the dependency tree is
    collected: the subtree of each left child first, then ``cur_string`` (when
    it is not blank), then the subtree of each right child. The callers in this
    notebook pass a token whose ``dep_`` is ``'prep'`` and an empty
    ``cur_string``, which yields the phrase that follows the preposition.

    Args:
        token: Token exposing ``lefts`` and ``rights`` iterables of child
            tokens, each child having a ``text`` attribute.
        cur_string: Seed text placed where ``token`` itself sits.

    Returns:
        The space-joined phrase, or ``cur_string`` unchanged when ``token`` has
        no children.
    """
    def phrase_words(node):
        """Texts of ``node`` and all its descendants, left subtree first."""
        words = []
        for left in node.lefts:
            words.extend(phrase_words(left))
        words.append(node.text)
        for right in node.rights:
            words.extend(phrase_words(right))
        return words

    # Building each side in order fixes the previous behaviour, where every
    # left child was prepended to the whole string so that two or more left
    # children came out reversed ("US 50 dollars").
    left_words = [word for child in token.lefts for word in phrase_words(child)]
    right_words = [word for child in token.rights for word in phrase_words(child)]
    if not left_words and not right_words:
        return cur_string

    seed = cur_string.strip()
    middle = [seed] if seed else []
    return " ".join(left_words + middle + right_words).strip()

In [None]:
# Print each token's dependency context and keep the phrase governed by the
# last preposition found in `doc`.
# prep_meaning is initialised first so the following cell still works when
# `doc` has no 'prep' token (it used to be undefined, or stale from an earlier
# run, in that case).
prep_meaning = ""
for token in doc:
    print(token.text, token.dep_, token.head.text, token.head.pos_,
            [child for child in token.children])
    if token.dep_ == 'prep':
        cur_string = ""
        prep_meaning = get_preposition_meaning(token, cur_string)

In [None]:
# Show the current value of prep_meaning.
prep_meaning

In [None]:
# Print every child of every token together with the child's Python type.
for tok in doc:
    for child in tok.children:
        print('child:', child, 'type:', type(child))

In [28]:
def unsupervised_feature_extraction_dependency_extraction(sent):
    """Extract quality, category and preposition information from a sentence.

    The sentence is parsed with the module-level spaCy pipeline ``nlp`` and
    scanned left to right:

    * a run of consecutive tokens whose dependency label ends in ``"mod"``
      becomes one quality phrase;
    * a run of consecutive ``NOUN``/``PROPN`` tokens becomes one category
      phrase;
    * a run of consecutive ``compound`` tokens that belongs to neither group
      is remembered as a prefix and prepended to the next phrase emitted;
    * any other token whose dependency label is ``"prep"`` is recorded
      together with the result of ``get_preposition_meaning`` for it.

    Args:
        sent: Text to analyse.

    Returns:
        ``(quality_list, category_list, preposition_list,
        preposition_meaning_list)``. The first two hold whitespace-normalised
        phrases; the last two are parallel lists with one item per preposition.
    """
    def is_modifier(token):
        return token.dep_.endswith("mod")

    def is_noun(token):
        return token.pos_.lower() in ("propn", "noun")

    def is_compound(token):
        return token.dep_.lower() == "compound"

    doc = nlp(sent)
    doc_len = len(doc)

    def consume_run(start, belongs):
        """Return (texts, end) for the run starting at ``start`` whose following tokens satisfy ``belongs``."""
        end = start + 1
        while end < doc_len and belongs(doc[end]):
            end += 1
        return [doc[k].text for k in range(start, end)], end

    category_list = []
    quality_list = []
    preposition_list = []
    preposition_meaning_list = []
    prefix = ""  # compound words waiting for the next phrase
    i = 0
    while i < doc_len:
        tok = doc[i]

        # Modifiers are checked first, so a noun used as a modifier is a quality.
        if is_modifier(tok):
            words, i = consume_run(i, is_modifier)
            quality_list.append(" ".join((prefix + " " + " ".join(words)).split()))
            prefix = ""

        elif is_noun(tok):
            words, i = consume_run(i, is_noun)
            category_list.append(" ".join((prefix + " " + " ".join(words)).split()))
            prefix = ""

        # Compound word that is neither a modifier nor a noun.
        elif is_compound(tok):
            # Jump past the whole run. The previous version looked ahead but
            # left the index untouched, so every following compound token was
            # visited again and ended up in the prefix twice.
            words, i = consume_run(i, is_compound)
            prefix += " " + " ".join(words)

        else:
            if tok.dep_ == 'prep':
                preposition_list.append(tok.text)
                preposition_meaning_list.append(get_preposition_meaning(tok, ""))
            i += 1

    return quality_list, category_list, preposition_list, preposition_meaning_list

In [29]:
# Load the sample search queries; the first CSV column becomes the index and
# the first row supplies the column names.
prep_df = pd.read_csv('../input/preposition-sample-test/preposition_searches.csv', index_col=0, header=0)

In [30]:
# Drop the redundant 'index' column. errors='ignore' keeps the cell re-runnable:
# without it a second execution raises KeyError because the column is gone.
prep_df = prep_df.drop(columns='index', errors='ignore')

In [31]:
# Columns that remain after the drop.
prep_df.columns

Index(['search_text'], dtype='object')

In [32]:
# First five rows of the search queries.
prep_df[:5]

Unnamed: 0,search_text
0,'iphone under 30k'
1,samsung phones under 5000
2,earphones between 1000 to 2000
3,[1:37 pm] ayush pushkar 8 gb ram mobile between 10000 & 30000
4,[1:37 pm] ayush pushkar 8 gb ram mobile between 10000 & 30000 --> ye v not working


In [None]:
# preposition_sentences = [
#     "32 inch tv under 30000",
#     "Air Conditioners under 5000",
#     "6gb mobiles under 1000"
# ]

In [34]:
# For every search query: strip the first brand found via the prefix map, then
# extract quality, category and preposition information from what is left.
quality_tags_list = []
category_tags_list = []
brand_list = []
preposition_list = []
preposition_meaning_list = []
for k in tqdm(prep_df["search_text"]):
    sentence_split = k.split()
    split_len = len(sentence_split)
    brand_str = ""

    for start_index in range(split_len):
        # .get() instead of [] throughout: indexing a defaultdict inserts an
        # empty set for every missing key, so each probe used to grow
        # data_checker_dict.
        if sentence_split[start_index] not in data_checker_dict.get("", ()):
            continue
        brand_tuple_index = tuple()
        loop_index = start_index + 1
        # Walk the prefix map one word at a time, remembering the longest
        # phrase that is also a complete entry of data_checker_set.
        while loop_index <= split_len:
            candidate = " ".join(sentence_split[start_index:loop_index])
            parent = " ".join(sentence_split[start_index:(loop_index - 1)])
            if candidate not in data_checker_dict.get(parent, ()):
                break
            if candidate in data_checker_set:
                brand_tuple_index = (start_index, loop_index)
            loop_index += 1
        if brand_tuple_index:
            brand_str = ' '.join(sentence_split[brand_tuple_index[0]:brand_tuple_index[1]])
            print('brand:', brand_str, 'sentence:', k)
            # Remove the brand words before the linguistic analysis.
            sentence_split[brand_tuple_index[0]:brand_tuple_index[1]] = []
            break
    # Exactly one entry per query; '' when no brand was recognised.
    brand_list.append(brand_str)
    k = ' '.join(sentence_split)
    quality_tags, category_tags, preposition_tags, preposition_meaning_tags = unsupervised_feature_extraction_dependency_extraction(k)
    quality_tags_list.append(quality_tags)
    category_tags_list.append(category_tags)
    preposition_list.append(preposition_tags)
    preposition_meaning_list.append(preposition_meaning_tags)

100%|██████████| 26/26 [00:00<00:00, 92.10it/s]


In [36]:
# Attach the per-query extraction results and save them as a tab-separated file.
prep_df['Quality Tags'] = quality_tags_list
prep_df['Category Tags'] = category_tags_list
prep_df['Brand List'] = brand_list
prep_df['Preposition List'] = preposition_list
prep_df['Preposition Meaning List'] = preposition_meaning_list
# os.makedirs(..., exist_ok=True) replaces `!mkdir`, which reports an error as
# soon as the directory already exists (an earlier cell creates it too) and
# only works where a shell is available. `os` is imported in the notebook's
# first cell.
os.makedirs('./dev_csvs', exist_ok=True)
prep_df.to_csv('./dev_csvs/quality_category_brand_preposition_extraction.csv', sep='\t', encoding='utf-8')