### Imports

In [1]:
import json
import spacy
import numpy as np
import pandas as pd
from typing import Callable, Union, List, Dict, Any, Tuple
from tqdm.notebook import tqdm

In [2]:
from keyphrase_vectorizers import KeyphraseCountVectorizer
from keybert import KeyBERT

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline

In [4]:
from index_utils import IndexUtil
from experiment_utils import ExperimentUtil

### Setup

In [5]:
# KeyBERT instance created with its default arguments; used by
# prepare_documents_keywords to extract keyphrases.
kw_model = KeyBERT()

In [6]:
# spaCy "en_core_web_sm" pipeline; lemmatize_text reads token lemmas from it.
nlp = spacy.load("en_core_web_sm")



In [7]:
# Tokenizer and token-classification model are loaded from the same
# "dslim/bert-base-NER" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("dslim/bert-base-NER")
model = AutoModelForTokenClassification.from_pretrained("dslim/bert-base-NER")

# No aggregation strategy is passed here; extract_named_entities strips the
# '##' word-piece markers from the returned entity words itself.
pipeline_ner = pipeline("ner", model=model, tokenizer=tokenizer)

### Load datasets

In [8]:
# ExperimentUtil.load_dataset returns a pair, unpacked here as
# (documents, questions). Each document is later read through doc['text'].
DOCUMENTS_SQUAD, QUESTIONS_SQUAD = ExperimentUtil.load_dataset('squad_10k')
DOCUMENTS_SWIFT, QUESTIONS_SWIFT = ExperimentUtil.load_dataset('swift_ui')

### Extract keywords

In [9]:
def prepare_documents_keywords(documents):
    """Build one space-separated keyphrase string per document.

    Args:
        documents: Iterable of dict-like items that each provide a ``'text'``
            entry holding the document body.

    Returns:
        A list of strings in the same order as ``documents``. Each string is
        the document's extracted keyphrases (at most ``top_n=20``) joined by a
        single space. An empty ``documents`` yields ``[]``.
    """
    texts = [doc['text'] for doc in documents]
    if not texts:
        # Nothing to extract; do not hand an empty corpus to the vectorizer.
        return []

    keywords_base = kw_model.extract_keywords(
        docs=texts, vectorizer=KeyphraseCountVectorizer(), top_n=20
    )

    # For exactly one document KeyBERT returns a flat list of
    # (keyphrase, score) tuples instead of a list of such lists. Re-wrap it so
    # the per-document join below sees the same shape for any corpus size.
    if len(texts) == 1 and (not keywords_base or isinstance(keywords_base[0], tuple)):
        keywords_base = [keywords_base]

    return [
        ' '.join(keyword_pair[0] for keyword_pair in docs_keywords)
        for docs_keywords in keywords_base
    ]

In [57]:
# One space-separated keyphrase string per SQuAD document.
SQUAD_KEYWORDS = prepare_documents_keywords(DOCUMENTS_SQUAD)

10000it [00:06, 1519.60it/s]


In [10]:
# One space-separated keyphrase string per SwiftUI document.
SWIFT_KEYWORDS = prepare_documents_keywords(DOCUMENTS_SWIFT)

185it [00:00, 1426.31it/s]


### Extract lemma

In [11]:
def lemmatize_text(text):
    """Return ``text`` with every spaCy token replaced by its lemma.

    The lemmas are joined with single spaces (punctuation is a token of its
    own, so it ends up space-separated too) and leading/trailing whitespace is
    stripped from the result.
    """
    lemmas = [token.lemma_ for token in nlp(text)]
    return ' '.join(lemmas).strip()

In [12]:
# Quick check of the lemmatizer on a sample sentence.
lemmatize_text('Tom Hanks was a good actor as he loves and had loved playing.')

'Tom Hanks be a good actor as he love and have love playing .'

In [56]:
# Lemmatized text for every SQuAD document, in document order.
SQUAD_LEMMA = [lemmatize_text(doc['text']) for doc in DOCUMENTS_SQUAD]

In [13]:
# Lemmatized text for every SwiftUI document, in document order.
SWIFT_LEMMA = [lemmatize_text(doc['text']) for doc in DOCUMENTS_SWIFT]

### Extract Named Entities

In [14]:
def extract_named_entities(text):
    """Return the entity words found in ``text`` as one space-separated string.

    The ``'word'`` of every item returned by ``pipeline_ner`` is joined with a
    space; '##' word-piece continuation markers are then removed so that
    pieces are glued back onto the preceding word.
    """
    entity_words = [entity['word'] for entity in pipeline_ner(text)]
    joined = " ".join(entity_words)
    # First drop the marker together with the separator space in front of it,
    # then any marker left without a preceding space (e.g. at the very start).
    return joined.replace(' ##', '').replace('##', '')

In [15]:
# Quick check of the entity extractor on a sample sentence.
extract_named_entities('Tom Hanks was a good actor as he loves and had loved playing.')

'Tom Hanks'

In [55]:
# Named-entity string for every SQuAD document, in document order; tqdm
# shows progress while the NER pipeline runs.
SQUAD_NE = [extract_named_entities(doc['text']) for doc in tqdm(DOCUMENTS_SQUAD)]

  0%|          | 0/10000 [00:00<?, ?it/s]

In [16]:
# Named-entity string for every SwiftUI document, in document order; tqdm
# shows progress while the NER pipeline runs.
SWIFT_NE = [extract_named_entities(doc['text']) for doc in tqdm(DOCUMENTS_SWIFT)]

  0%|          | 0/185 [00:00<?, ?it/s]

### Override datasets

In [17]:
def load_full_dataset(dataset_name: str) -> Dict[str, Any]:
    """Load the complete documents JSON file of a dataset.

    Args:
        dataset_name: Name passed to ``ExperimentUtil.get_dataset_paths``.
            Only the first element of the returned pair (the documents path)
            is used; the second is ignored.

    Returns:
        The parsed JSON content of the documents file as a single object
        (not a tuple). The notebook accesses its ``'documents'`` entry.
    """
    documents_path, _ = ExperimentUtil.get_dataset_paths(dataset_name)
    # JSON is read as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(documents_path, encoding='utf-8') as json_file:
        return json.load(json_file)

In [18]:
# Full documents JSON for each dataset; the extracted fields are attached to
# the entries under their 'documents' key further down.
SQUAD_JSON = load_full_dataset('squad_10k')
SWIFT_JSON = load_full_dataset('swift_ui')

In [19]:
def append_fields(docs, keywords_list, lemma_list, ne_list):
    """Attach keywords, lemmatized text and named entities to each document.

    The i-th document receives the i-th element of each list under the keys
    ``'keywords'``, ``'text_lemma'`` and ``'ne'``. Documents are modified in
    place and nothing is returned.

    Args:
        docs: Iterable of mutable mappings (one per document).
        keywords_list: Keyword value for each document.
        lemma_list: Lemmatized text for each document.
        ne_list: Named-entity value for each document.

    Raises:
        ValueError: If the four inputs do not all have the same length. The
            check runs before any document is touched, so a mismatch never
            leaves the documents partially updated.
    """
    docs = list(docs)
    keywords_list = list(keywords_list)
    lemma_list = list(lemma_list)
    ne_list = list(ne_list)

    # zip() would silently stop at the shortest input and leave the remaining
    # documents without fields, so fail loudly on misaligned inputs instead.
    lengths = {
        'docs': len(docs),
        'keywords_list': len(keywords_list),
        'lemma_list': len(lemma_list),
        'ne_list': len(ne_list),
    }
    if len(set(lengths.values())) > 1:
        raise ValueError(f"append_fields needs inputs of equal length, got {lengths}")

    for doc, keywords, lemma, ne in zip(docs, keywords_list, lemma_list, ne_list):
        doc['keywords'] = keywords
        doc['text_lemma'] = lemma
        doc['ne'] = ne

In [20]:
# Adds 'keywords', 'text_lemma' and 'ne' to every SwiftUI document in place.
append_fields(SWIFT_JSON['documents'], SWIFT_KEYWORDS, SWIFT_LEMMA, SWIFT_NE)

In [58]:
# Adds 'keywords', 'text_lemma' and 'ne' to every SQuAD document in place.
append_fields(SQUAD_JSON['documents'], SQUAD_KEYWORDS, SQUAD_LEMMA, SQUAD_NE)

### Save datasets

In [61]:
# Saving is currently disabled; uncomment to write the augmented SQuAD
# documents to disk.
# with open('../data/processed/squad_train_d10k_q1k_additional_fields/documents.json', 'w') as f:
#     json.dump(SQUAD_JSON, f)

In [25]:
# Saving is currently disabled; uncomment to write the augmented SwiftUI
# documents to disk.
# with open('../data/processed/swift-ui-course_additional_fields/documents.json', 'w') as f:
#     json.dump(SWIFT_JSON, f)