# Query Reformulation using SpaCy
Building a custom spaCy pipeline, step by step, to handle queries against a Twitter database.

In [1]:
import spacy

from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy import displacy

import pandas as pd

Load a predefined model

In [2]:
# download german package
# !python -m spacy download de_core_news_sm

MODEL = 'de_core_news_sm'

In [3]:
# load german language model
nlp = spacy.load(MODEL)

Input a query for testing purposes...

In [4]:
TEXT = "Die große Koalition unter Angela Merkel ist gescheitert! #CDU#SPD"

---
## Investigate Tokens from SpaCy

In [5]:
doc = nlp(TEXT)

displacy.render(doc, style="dep", jupyter=True)

Hashtags are treated poorly. How to detect them and prevent the tokenizer from splitting them?
- split compound hashtags
- mark hashtags in SpaCy

---
# 1. Preprocessing
First, make sure that consecutive hashtags are separated by whitespace.

In [6]:
def seperate_hashtags(text: str) -> str:
    """
    Insert a space in front of every "#" that is not already preceded by one.

    The result is built in a single pass over the input instead of editing
    the string while iterating over it. Editing in place shifts every later
    character to the right, so hashtags near the end of a text with several
    glued hashtags would be skipped.

    Args:
        text: Raw query or tweet text, e.g. "failed! #CDU#SPD".

    Returns:
        The text with glued hashtags separated, e.g. "failed! #CDU #SPD".
        A "#" at the very start of the text, or one that already follows a
        space character, is left untouched.
    """
    pieces = []
    previous = ""  # empty string marks "start of text"
    for char in text:
        # Only a literal space counts as an existing gap.
        if char == "#" and previous and previous != " ":
            pieces.append(" ")
        pieces.append(char)
        previous = char
    return "".join(pieces)

TEXT = seperate_hashtags(TEXT)

print(TEXT)

Die große Koalition unter Angela Merkel ist gescheitert! #CDU #SPD


---
## 1.1 Tokenizer
Customize the Matcher to handle Tweet-specific syntax - i.e. hashtags.
- Tokenize by Hashtag


In [7]:
@Language.factory("hashtag_finder")
def create_hashtag_finder(nlp, name):
    """
    Factory registered under the name "hashtag_finder".

    Builds a HashtagFinder from the vocabulary of the given pipeline.
    The `name` argument is accepted but not used.
    """
    finder = HashtagFinder(nlp.vocab)
    return finder

class HashtagFinder:
    """
    Pipeline component that detects hashtags and marks them.

    A Matcher looks for "#" tokens; the token directly following each "#"
    gets the custom attribute `token._.hashtag` set to True. All other
    tokens keep the default value False.
    """
    def __init__(self, vocab):
        """
        Set up the matcher and the custom token attribute.

        Args:
            vocab: Vocabulary of the pipeline the component is added to.
        """
        patterns = [ [{"ORTH": "#"}] ]

        # Register the token extension that marks hashtags. The guard keeps
        # the component constructible more than once (e.g. when the notebook
        # cell is re-run or a second pipeline is built): registering an
        # extension that already exists raises an error.
        if not Token.has_extension("hashtag"):
            Token.set_extension("hashtag", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("hashtag_finder", patterns)

    def __call__(self, doc):
        """
        Mark the token after every "#" in `doc` as a hashtag.

        Args:
            doc: The Doc being processed by the pipeline.

        Returns:
            The same Doc, with `token._.hashtag` set on hashtag tokens.
        """
        # This method is invoked when the component is called on a Doc
        matches = self.matcher(doc)
        spans = []  # Collect the matched spans here

        for match_id, start, end in matches:
            # TODO: what happens if whitespace after hashtag? 
            # The pattern matches a single "#" token, so the slice below is
            # the one token that follows it. A "#" that is the last token of
            # the Doc has no following token and is skipped.
            if (end < len(doc)):
                spans.append(doc[start+1:end+1])
            
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.hashtag = True  # Mark token as hashtag
        return doc
     
nlp.add_pipe("hashtag_finder", before="ner")  # Add component to the pipeline

<__main__.HashtagFinder at 0x7f31c1214850>

In [8]:
# Run the pipeline, including the hashtag component, on the preprocessed query
doc = nlp(TEXT)

# One row per token: the token itself and its custom hashtag flag
data = [[token, token._.hashtag] for token in doc]
pd.DataFrame(data, columns=["Text", "Hashtag"])

Unnamed: 0,Text,Hashtag
0,Die,False
1,große,False
2,Koalition,False
3,unter,False
4,Angela,False
5,Merkel,False
6,ist,False
7,gescheitert,False
8,!,False
9,#,False


---
## 1.2 Named Entities
How are named entities detected? Especially those that are hashtags.

In [9]:
doc = nlp(TEXT)

# One row per recognised entity: its text and a readable explanation of its label
data = [[ent.text, spacy.explain(ent.label_)] for ent in doc.ents]

# Show the entities highlighted in the text, then as a table
displacy.render(doc, style="ent")
pd.DataFrame(data, columns=["Text", "NER Label"])

Unnamed: 0,Text,NER Label
0,Angela Merkel,Named person or family.
1,CDU,"Companies, agencies, institutions, etc."


It seems that named entities as well as hashtags are treated correctly. Now, let's have a look at which terms are relevant for POS Tagging.

---
## 1.3 Part of Speech Tagging

In [10]:
# One row per token with its linguistic annotations and the custom hashtag flag
data = [
    [
        token.text,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        token._.hashtag,
    ]
    for token in doc
]

pd.DataFrame(
    data,
    columns=["Text", "UPOS Tag", "Tag", "Syntactics", "Shape", "Alpha Token", "Stop Token", "Hashtag"],
    index=None,
)

Unnamed: 0,Text,UPOS Tag,Tag,Syntactics,Shape,Alpha Token,Stop Token,Hashtag
0,Die,DET,ART,nk,Xxx,True,True,False
1,große,ADJ,ADJA,nk,xxxx,True,True,False
2,Koalition,NOUN,NN,sb,Xxxxx,True,False,False
3,unter,ADP,APPR,mnr,xxxx,True,True,False
4,Angela,PROPN,NE,pnc,Xxxxx,True,False,False
5,Merkel,PROPN,NE,nk,Xxxxx,True,False,False
6,ist,AUX,VAFIN,ROOT,xxx,True,True,False
7,gescheitert,VERB,VVPP,oc,xxxx,True,False,False
8,!,PUNCT,$.,punct,!,False,False,False
9,#,PROPN,XY,ROOT,#,False,False,False


---
## 1.4 Candidate Selection
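Extract the terms for which synonyms will be looked up. A candidate term should be:
- a verb, noun or proper noun
- alphabetic and not a stop word
- not a hashtag (named entities are not filtered out yet — *Angela* and *Merkel* still pass as `PROPN`)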

In [40]:
pos_tags = ["VERB", "NOUN", "PROPN"]

# Keep a token only if it is not flagged as a hashtag, is alphabetic,
# is not a stop word and carries one of the wanted POS tags.
candidate_terms = [
    token
    for token in doc
    if token._.hashtag is not True
    and token.is_alpha is not False
    and not token.is_stop
    and token.pos_ in pos_tags
]

print(candidate_terms)

[Koalition, Angela, Merkel, gescheitert]


---
# 2. Word Embeddings
The following embeddings are applied to the selected terms:
- FastText
- Glove
- Word2Vec

## 2.1 Load FastText model with **SpaCy**

In [11]:
# To load the German FastText model run the following command
# !python -m spacy init vectors de ../data/fasttext/cc.de.300.zip ../models/de-fasttext-10000 --prune 10000

In [12]:
nlp = spacy.load("../models/de-fasttext-10000/")

In [13]:
# Embed four single-word documents with the FastText vectors
doc1, doc2, doc3, doc4 = (nlp(word) for word in ("Haus", "Katze", "Hund", "Bude"))

# Print the vector similarity of a few word pairs
for label, left, right in (
    ("Haus | Katze", doc1, doc2),
    ("Katze | Hund", doc2, doc3),
    ("Haus | Bude", doc1, doc4),
):
    print(f"{label} -> {left.similarity(right)}")

Haus | Katze -> 0.367066624402335
Katze | Hund -> 0.7219703586509103
Haus | Bude -> 0.6441379922375441


Since spaCy does not offer a convenient way to list the most similar terms for a given term, I switch to Gensim and load the word vectors there.

## 2.2 Load FastText model with **Gensim**

In [None]:
from gensim.models import KeyedVectors

# takes about 6 min
kv = KeyedVectors.load_word2vec_format("../data/fasttext/cc.de.300.vec")

Compare the similarity of two words:

In [None]:
#kv.get_mean_vector("")
kv.similarity("Jason","Marie-Christin")

Find most similar terms for a given term:

In [None]:
kv.most_similar("Jason")

[('Chris', 0.7400398850440979),
 ('Josh', 0.7222685217857361),
 ('Jeff', 0.7152098417282104),
 ('Matthew', 0.7035969495773315),
 ('Mike', 0.6966241002082825),
 ('Jennifer', 0.6949098110198975),
 ('Kevin', 0.6928116083145142),
 ('Greg', 0.6927750110626221),
 ('Aaron', 0.6925131678581238),
 ('Jeremy', 0.6914620995521545)]

Calculate cosine similarity between word vectors:

In [None]:
kv.cosine_similarities(kv.get_vector("Hallo"), [kv.get_vector("Hi"), kv.get_vector("Hallöle")])

array([0.70029044, 0.72877127], dtype=float32)

In [None]:
kv = KeyedVectors.load_word2vec_format("../data/wiki.de/wiki.de.vec")

In [None]:
kv.most_similar("Hallo")

---
## 3. Find Synonyms

In [None]:
for term in candidate_terms:
    ...