## Import dataset

In [None]:
import pandas as pd

# Load the training set; the path is relative to the notebook's working directory.
data = pd.read_csv('datasets/training_data.csv')
# Preview five randomly drawn rows (no seed is passed, so the rows differ between runs).
data.sample(5)

## Construct corpus

In [None]:
# corpus = ' '.join(data['text'])
# print(corpus[:1000])

Number of unique tokens in corpus

In [None]:
# from nltk.tokenize import wordpunct_tokenize
# from collections import Counter

# tokens = wordpunct_tokenize(corpus)
# word_counts = Counter(tokens)

# print("Number of unique tokens:", len(word_counts))

## Tokenization

In [None]:
# from nltk import word_tokenize, sent_tokenize
# from nltk.tokenize import wordpunct_tokenize

# data['word_tokens'] = data['text'].apply(word_tokenize) # slower than wordpunct_tokenize
# data['word_tokens'] = data['text'].apply(wordpunct_tokenize)
# data['word_counts'] = data['word_tokens'].apply(lambda x: len(x))
# data['unique_word_counts'] = data['word_tokens'].apply(lambda x: len(set(x)))

# data['sentences'] = data['text'].apply(sent_tokenize)
# data['sentence_counts'] = data['sentences'].apply(len)

# data.sample(5)

# spaCy

In [None]:
import spacy
# Load the spaCy pipeline package named "en_core_web_md" into the shared `nlp` object
# that the later cells call.
nlp = spacy.load("en_core_web_md")

# Customize pipeline
# (Disabled experiments: each line below would drop one named component if uncommented.)
# nlp.remove_pipe('tok2vec')
# nlp.remove_pipe('tagger')
# nlp.remove_pipe('parser')
# nlp.remove_pipe('attribute_ruler')
# nlp.remove_pipe('lemmatizer')
# nlp.remove_pipe('ner')

# Switch on the component registered as 'senter'.
# NOTE(review): presumably it ships disabled in this model package — confirm.
nlp.enable_pipe('senter')

In [None]:
# NOTE(review): disabled experiment. As written, the `if not re.match(...)` test keeps only
# tokens whose text does NOT start with a word character, and the component returns a new
# Doc rebuilt from those token texts. Confirm the intended condition before re-enabling.
# import re
# from spacy.tokens import Doc
# from spacy.language import Language

# @Language.component("regex_cleanup")
# def regex_cleanup(doc):
#     cleaned_tokens = []
#     for token in doc:
#         if not re.match(r'((\w+))', token.text):
#             cleaned_tokens.append(token)
#     return Doc(doc.vocab, words=[token.text for token in cleaned_tokens])

# nlp.add_pipe("regex_cleanup", first=True)

In [None]:
# Work on a subsample so the spaCy pass stays fast.
# - min(...) avoids the ValueError that sample() raises when the frame has fewer than 1000 rows.
# - random_state pins the draw so a fresh "Restart & Run All" processes the same rows.
small_data = data.sample(n=min(1000, len(data)), random_state=42)
# nlp.pipe streams the texts through the pipeline in batches, which is faster than calling
# nlp(...) once per row; the resulting Doc objects are stored in row order.
small_data['tokens'] = list(nlp.pipe(small_data['text']))
small_data.sample(5)

In [None]:
# Number of tokens in each parsed document, followed by summary statistics of those counts.
small_data['tokens_count'] = small_data['tokens'].map(len)
small_data['tokens_count'].describe()

In [None]:
def filter_tokens(tokens):
    """Return the tokens that are neither punctuation, whitespace nor stop words.

    Args:
        tokens: Iterable of objects exposing ``is_punct``, ``is_space`` and
            ``is_stop`` attributes.

    Returns:
        list: The surviving tokens, in their original order.
    """
    kept = []
    for candidate in tokens:
        # Checks run in the same order as the filters: punctuation, space, stop word.
        if candidate.is_punct:
            continue
        if candidate.is_space:
            continue
        if candidate.is_stop:
            continue
        kept.append(candidate)
    return kept

In [None]:
def filter_text(text):
    """Build a space-separated string of lowercased lemmas.

    Args:
        text: Iterable of objects exposing a string ``lemma_`` attribute.

    Returns:
        str: Each token's lemma, lowercased and joined with single spaces.
    """
    return ' '.join(token.lemma_.lower() for token in text)

In [None]:
def text_embeddings(text):
    """Collect the ``vector`` attribute of every token.

    Args:
        text: Iterable of objects exposing a ``vector`` attribute.

    Returns:
        list: One vector per token, in input order.
    """
    vectors = []
    for token in text:
        vectors.append(token.vector)
    return vectors

In [None]:
def text_ner(text):
    """Describe each token by its part of speech and entity annotations.

    Args:
        text: Iterable of objects exposing ``pos_``, ``ent_iob_`` and
            ``ent_type_`` attributes.

    Returns:
        list: One ``(token, pos_, ent_iob_, ent_type_)`` tuple per token, in
        input order; the first element is the token object itself.
    """
    rows = []
    for token in text:
        row = (token, token.pos_, token.ent_iob_, token.ent_type_)
        rows.append(row)
    return rows

In [None]:
# Derive the per-row feature columns. The first line must run first: the other three
# columns are all computed from 'tokens_filtered'.
# Tokens left after dropping punctuation, whitespace and stop words.
small_data['tokens_filtered'] = small_data['tokens'].apply(filter_tokens)
# Lowercased lemmas of the filtered tokens, joined into one string.
small_data['text_filtered'] = small_data['tokens_filtered'].apply(filter_text)
# One vector per filtered token.
small_data['text_embeddings'] = small_data['tokens_filtered'].apply(text_embeddings)
# (token, pos_, ent_iob_, ent_type_) tuples for the filtered tokens.
small_data['text_ner'] = small_data['tokens_filtered'].apply(text_ner)
# Preview five random rows with the new columns.
small_data.sample(5)

In [None]:
def process_entities(text_filtered):
    """Map every lemma in ``text_filtered`` to its part of speech and entity label.

    The text is parsed again with the module-level ``nlp`` pipeline. Named
    entities are recorded first; remaining tokens are added with a label of
    ``None``.

    Args:
        text_filtered: String to parse.

    Returns:
        dict: ``{lemma: (pos, label)}``. For entities, ``pos`` is the part of
        speech of the entity's root token and ``label`` is the entity label;
        the first entity seen for a lemma wins. For other tokens, ``label`` is
        ``None``; when several such tokens share a lemma, the last one wins.
    """
    doc = nlp(text_filtered)

    entity_dict = {}
    for ent in doc.ents:
        # The dict is keyed by lemma, so the membership test has to use the lemma as well;
        # testing the surface text let a repeated entity overwrite the first entry.
        if ent.lemma_ not in entity_dict:
            entity_dict[ent.lemma_] = (ent.root.pos_, ent.label_)

    # Tokens outside any entity span ("O"), ignoring whitespace tokens. Comparing on the
    # lemma keeps a plain token from replacing an entity entry that shares its lemma.
    non_entity_tokens = [
        token for token in doc
        if token.lemma_ not in entity_dict
        and token.ent_iob_ == "O"
        and token.pos_ != 'SPACE'
    ]
    entity_dict.update({token.lemma_: (token.pos_, None) for token in non_entity_tokens})

    return entity_dict

# Build the lemma -> (pos, label) mapping for every row; each call re-parses that row's
# 'text_filtered' string with the spaCy pipeline.
small_data['entity_dict'] = small_data['text_filtered'].apply(process_entities)

In [None]:
small_data.sample(5)

In [None]:
%store small_data

# In another notebook

```py
%run 'path_to_notebook.ipynb'
%store -r small_data
print(small_data)
```