# Query Reformulation using SpaCy
Step by step building a custom pipeline to handle queries for Twitter database.

In [1]:
import spacy

from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy import displacy

import pandas as pd

Load a predefined model

In [2]:
# download german package
# !python -m spacy download de_core_news_sm

MODEL = 'de_core_news_sm'

In [4]:
# load german language model
nlp = spacy.load(MODEL)

Input a query for testing purposes...

In [14]:
TEXT = "Die große Koalition ist gescheitert! #CDU# SPD"

---
## Investigate Tokens from SpaCy

In [15]:
doc = nlp(TEXT)

displacy.render(doc, style="dep", jupyter=True)

Hashtags are treated poorly. How to detect them and prevent the tokenizer from splitting them?
- split compound hashtags
- mark hashtags in SpaCy

---
# 1. Preprocessing
FIrstly, make sure the whitespaces are set correctly in between the hashtags.

In [16]:
def seperate_hashtags(text: str):
    """
    Insert a whitespace if hashtags are missing a gap in between.  
    """
    for i, j in enumerate(text):
        if (text[i] == "#" and i > 0):
            if text[i-1] != " ":
                    text = text[:i] + " " + text[i:]
                    i+=1
    return text

TEXT = seperate_hashtags(TEXT)

print(TEXT)

Die große Koalition ist gescheitert! #CDU # SPD


---
## 1.1 Tokenizer
Customize the Matcher to handle Tweet-specific syntax - i.e. hashtags.
- Tokenize by Hashtag


In [11]:
@Language.factory("hashtag_finder")
def create_hashtag_finder(nlp, name):
    return HashtagFinder(nlp.vocab)

class HashtagFinder:
    """
    The purpose of this class is to detect hashtags and mark them.
    """
    def __init__(self, vocab):
        patterns = [ [{"ORTH": "#"}] ]

        # Register a new token extension to mark hashtags
        Token.set_extension("hashtag", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("hashtag_finder", patterns)

    def __call__(self, doc):
        # This method is invoked when the component is called on a Doc
        matches = self.matcher(doc)
        spans = []  # Collect the matched spans here

        for match_id, start, end in matches:
            # TODO: what happens if whitespace after hashtag? 
            if (end < len(doc)):
                spans.append(doc[start+1:end+1])
            
        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.hashtag = True  # Mark token as hashtag
        return doc
     
nlp.add_pipe("hashtag_finder", before="ner")  # Add component to the pipeline

<__main__.HashtagFinder at 0x7ff880847580>

In [17]:
doc = nlp(TEXT)
data = []

for token in doc:
    data.append([token, token._.hashtag])
pd.DataFrame(data, columns=["Text", "Hashtag"])

Unnamed: 0,Text,Hashtag
0,Die,False
1,große,False
2,Koalition,False
3,ist,False
4,gescheitert,False
5,!,False
6,#,False
7,CDU,True
8,#,False
9,SPD,True


---
## 1.2 Named Entities
How are named entities detected? Especially those that are hashtags.

In [18]:
doc = nlp(TEXT)
data = []

for ent in doc.ents:
    data.append([ent.text, spacy.explain(ent.label_)])
    
displacy.render(doc, style="ent")
pd.DataFrame(data, columns=["Text", "NER Label"])

Unnamed: 0,Text,NER Label
0,CDU,"Companies, agencies, institutions, etc."


It seems that named entities as well as hashtags are treated correctly. Now, let's have a look at which terms are relevant for POS Tagging.

---
## 1.3 Part of Speech Tagging

In [19]:
data = []

for token in doc:
    data.append ([token.text, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop, token._.hashtag])

pd.DataFrame(data, columns=["Text", "UPOS Tag", "Tag", "Syntactics", "Shape", "Alpha Token", "Stop Token", "Hashtag"], index=None)

Unnamed: 0,Text,UPOS Tag,Tag,Syntactics,Shape,Alpha Token,Stop Token,Hashtag
0,Die,DET,ART,nk,Xxx,True,True,False
1,große,ADJ,ADJA,nk,xxxx,True,True,False
2,Koalition,NOUN,NN,sb,Xxxxx,True,False,False
3,ist,AUX,VAFIN,ROOT,xxx,True,True,False
4,gescheitert,VERB,VVPP,oc,xxxx,True,False,False
5,!,PUNCT,$.,punct,!,False,False,False
6,#,PROPN,XY,ROOT,#,False,False,False
7,CDU,PROPN,NE,ROOT,XXX,True,False,True
8,#,PROPN,NE,ROOT,#,False,False,False
9,SPD,PROPN,NE,ROOT,XXX,True,False,True


The words to find synonyms for should be:
- verbs or nouns
- no hashtags or entities

---
# 2. Word Embeddings
The following embeddings are applied to the selected terms
- FastText
- Glove
- Word2Vec

## 2.1 Load FastText model

In [1]:
# To load the German FastText model run the following command
!python -m spacy init vectors de ../data/fasttext/cc.de.300.zip ../models/de-fasttext-10000 --prune 10000

[38;5;4mℹ Creating blank nlp object for language 'de'[0m
[2022-11-24 17:45:18,349] [INFO] Reading vectors from ../data/fasttext/cc.de.300.zip
2000000it [02:37, 12683.79it/s]
[2022-11-24 17:47:56,034] [INFO] Loaded vectors from ../data/fasttext/cc.de.300.zip
[38;5;2m✔ Successfully converted 10000 vectors[0m
[38;5;2m✔ Saved nlp object with vectors to output directory. You can now use
the path to it in your config as the 'vectors' setting in [initialize].[0m
/home/jsonpy/Projects/Practical/twitter-query-expansion/models/01


In [20]:
nlp = spacy.load("../models/de-fasttext-10000/")

In [21]:
doc1 = nlp("Haus")
doc2 = nlp("Katze")
doc3 = nlp("Hund")
doc4 = nlp("Bude")

print(f"Haus | Katze -> {doc1.similarity(doc2)}")
print(f"Katze | Hund -> {doc2.similarity(doc3)}")
print(f"Haus | Bude -> {doc1.similarity(doc4)}")

Haus | Katze -> 0.367066624402335
Katze | Hund -> 0.7219703586509103
Haus | Bude -> 0.6441379922375441


In [23]:
from numba import jit
import numpy as np

@jit(nopython=True)
def cosine_similarity_numba(u:np.ndarray, v:np.ndarray):
    assert(u.shape[0] == v.shape[0])
    uv = 0
    uu = 0
    vv = 0
    for i in range(u.shape[0]):
        uv += u[i]*v[i]
        uu += u[i]*u[i]
        vv += v[i]*v[i]
    cos_theta = 1
    if uu != 0 and vv != 0:
        cos_theta = uv/np.sqrt(uu*vv)
    return cos_theta

In [25]:
word = "Hallo"
by_similarity = sorted(word, key=lambda w: cosine_similarity_numba(w.vector, word.vector), reverse=True)

AttributeError: 'str' object has no attribute 'vector'