# SpaCy Playground
Building a custom spaCy pipeline, step by step, to handle queries against a Twitter database.

In [1]:
# download german package
# !python -m spacy download de_core_news_sm

In [2]:
# !python -m spacy init fill-config ../base_config.cfg config.cfg

In [1]:
import spacy

from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy import displacy

# Load the German model with the components listed below switched off.
nlp = spacy.load(
    "de_core_news_sm",
    disable=[
        "tok2vec",
        "parser",
        "senter",
        "lemmatizer",
        "tagger",
        "attribute_ruler",
    ],
)

  from .autonotebook import tqdm as notebook_tqdm


Sample query for testing purposes:

In [2]:
# Tweet-like sample query; note that "#CDU#FDP" has no space between the two hashtags.
TEXT = "#Angela Merkel im Bundestag #CDU#FDP #merkelmussweg"

---
## Investigate Tokens from SpaCy

In [3]:
# Run the loaded pipeline on the sample query.
doc = nlp(TEXT)

# Print one token per line to inspect how the text was split.
for token in doc:
    print(token.text)

# displacy.render(doc, style="dep", jupyter=True)

#
Angela
Merkel
im
Bundestag
#
CDU#FDP
#
merkelmussweg


Hashtags are handled poorly: each `#` is split off as its own token and `#CDU#FDP` is not separated into two hashtags. How can we detect hashtags and keep the tokenizer from splitting them?
- split compound hashtags
- mark hashtags in SpaCy

---
# 1. Preprocessing
Firstly, make sure that consecutive hashtags are separated by whitespace.

In [4]:
def seperate_hashtags(text: str) -> str:
    """
    Insert a single space before every "#" that directly follows a non-space character.

    A "#" at the very start of the text, or one already preceded by a space,
    is left untouched.

    Args:
        text: Raw query text, e.g. "#CDU#FDP".

    Returns:
        The text with glued hashtags separated, e.g. "#CDU #FDP".
    """
    # Build the result in a single pass instead of mutating the string while
    # iterating over it; the earlier in-place approach stopped checking the
    # tail of the text once insertions had made it longer.
    pieces = []
    previous = " "  # treat the start of the text as a gap so a leading "#" stays as-is
    for char in text:
        if char == "#" and previous != " ":
            pieces.append(" ")
        pieces.append(char)
        previous = char
    return "".join(pieces)

# Overwrite the sample query with its hashtag-separated version; later cells read this value.
TEXT = seperate_hashtags(TEXT)

print(TEXT)

#Angela Merkel im Bundestag #CDU #FDP #merkelmussweg


---
## 1.1 Tokenizer
Customize the Matcher to handle Tweet-specific syntax - i.e. hashtags.
- Tokenize by Hashtag


In [5]:
@Language.factory("hashtag_finder")
def create_hashtag_finder(nlp, name):
    """
    Factory registered under "hashtag_finder"; builds the component from the pipeline's vocab.
    """
    pipeline_vocab = nlp.vocab
    return HashtagFinder(pipeline_vocab)

class HashtagFinder:
    """
    Pipeline component that detects hashtags and marks them.

    The token directly following a "#" is flagged through the custom token
    attribute ``token._.hashtag`` (default ``False``).
    """
    def __init__(self, vocab):
        """
        Set up the "#" matcher and register the ``hashtag`` token extension.

        Args:
            vocab: Vocabulary the matcher operates on (the factory passes ``nlp.vocab``).
        """
        patterns = [ [{"ORTH": "#"}] ]

        # Register the token extension only once: set_extension raises a
        # ValueError when the name already exists, which happens as soon as the
        # component is created a second time (e.g. the cell is re-run).
        if not Token.has_extension("hashtag"):
            Token.set_extension("hashtag", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("hashtag_finder", patterns)

    def __call__(self, doc):
        """
        Flag the token after each "#" in ``doc`` and return the same ``doc``.

        A "#" is skipped when it is the last token or when it is followed by
        whitespace, because the next token is then not attached to it.
        """
        # This method is invoked when the component is called on a Doc
        matches = self.matcher(doc)
        spans = []  # Collect the matched spans here

        for match_id, start, end in matches:
            # The pattern matches the single "#" token, so `end` is the index
            # of the token right after it; a trailing "#" has no such token.
            if end >= len(doc):
                continue
            # "# word" / "#  word": whitespace after "#" means the following
            # token (possibly a whitespace token) is not a hashtag.
            if doc[start].whitespace_:
                continue
            spans.append(doc[start+1:end+1])

        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.hashtag = True  # Mark token as hashtag
        return doc
     
# Remove an instance added by an earlier run of this cell first; adding the
# same component name twice would otherwise raise an error.
if nlp.has_pipe("hashtag_finder"):
    nlp.remove_pipe("hashtag_finder")
nlp.add_pipe("hashtag_finder", before="ner")  # Add component to the pipeline

<__main__.HashtagFinder at 0x7fdb9e03ab30>

In [7]:
# Process the query again so the hashtag_finder component added to the pipeline is applied.
doc = nlp(TEXT)

# Show each token next to its custom hashtag flag.
for token in doc:
    print(token, token._.hashtag)

# False
Angela True
Merkel False
im False
Bundestag False
# False
CDU True
# False
FDP True
# False
merkelmussweg True


---
## 1.2 Named Entities
How are named entities detected? Especially those that are hashtags.

In [7]:
doc = nlp(TEXT)

# Print each entity with its character offsets and an explanation of its label.
for ent in doc.ents:
    print(ent.text, ent.start_char, ent.end_char, spacy.explain(ent.label_))

# List the tokens of the same doc.
for token in doc:
    print(token)
    
# Render the entities inline.
displacy.render(doc, style="ent")

Angela Merkel 1 14 Named person or family.
Bundestag 18 27 Companies, agencies, institutions, etc.
CDU 29 32 Companies, agencies, institutions, etc.
#
Angela
Merkel
im
Bundestag
#
CDU
#
FDP
#
merkelmussweg


It seems that named entities as well as hashtags are treated correctly. Now, let's have a look at which terms are relevant for POS Tagging.

---
## 1.3 Part of Speech Tagging

---
# 2. Word Embeddings
The following embeddings are applied to the selected terms:
- FastText
- GloVe
- Word2Vec