# Twitter Query Reformulation 
A step-by-step walkthrough of building a custom spaCy pipeline that handles queries for a Twitter database.

In [1]:
import pandas as pd
import spacy

from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy import displacy

Load a predefined German model

In [2]:
# download german package
# !python -m spacy download de_core_news_sm
# !python -m spacy download de_core_news_lg

MODEL = 'de_core_news_lg'

In [3]:
# load german language model
nlp = spacy.load(MODEL)

Input a query for testing purposes...

In [4]:
QUERY = "gro√üe Koalition Merkel #GroKo #CDU"

---
## Investigate Tokens from SpaCy

In [5]:
doc = nlp(QUERY)

displacy.render(doc, style="dep", jupyter=True)

for token in doc:
    print(token)

gro√üe
Koalition
Merkel
#
GroKo
#
CDU


Hashtags are treated poorly. How to detect them and prevent the tokenizer from splitting them?
- split compound hashtags
- mark hashtags in SpaCy

---
# 1. Preprocessing
Firstly, make sure the whitespaces are set correctly in between the hashtags.

In [6]:
def seperate_hashtags(text: str) -> str:
    """
    Insert a whitespace in front of every hashtag that is glued to the preceding text.

    Every "#" that is not the first character and is not already preceded by a
    space gets a single space inserted before it, e.g. "#GroKo#CDU" becomes
    "#GroKo #CDU".

    Args:
        text: The raw query string.

    Returns:
        A new string with the missing spaces inserted.
    """
    pieces = []
    # Treat the start of the string like a gap so that a leading "#" stays untouched.
    previous = " "

    # Build the result in a single pass instead of modifying the string while
    # iterating over it; otherwise hashtags that get shifted beyond the original
    # length are never inspected.
    for char in text:
        if char == "#" and previous != " ":
            pieces.append(" ")
        pieces.append(char)
        previous = char

    return "".join(pieces)

QUERY = seperate_hashtags(QUERY)

print(QUERY)

gro√üe Koalition Merkel #GroKo #CDU


---
## 1.1 Matcher
Customize the Matcher to handle Tweet-specific syntax - i.e. hashtags.
- Tokenize by Hashtag (#)
- by Twitter entity (@)

In [7]:
@Language.factory("hashtag_finder")
def create_hashtag_finder(nlp, name):
    """
    Factory registered under the name "hashtag_finder".

    Only the vocabulary of `nlp` is handed to the component; `name` is part of
    the signature but is not read here.
    """
    finder = HashtagFinder(nlp.vocab)
    return finder

class HashtagFinder:
    """
    Pipeline component that detects hashtags and marks them.

    Every "#" token is matched and the token directly following it gets the
    custom attribute `token._.hashtag` set to True. The "#" token itself is
    left unmarked.
    """
    def __init__(self, vocab):
        """
        Set up the matcher and the custom token attribute.

        Args:
            vocab: The vocabulary the Matcher operates on.
        """
        patterns = [ [{"ORTH": "#"}] ]

        # Register the token extension only once: calling set_extension again
        # for an existing name raises a ValueError, e.g. when the component is
        # created a second time after reloading the pipeline.
        if not Token.has_extension("hashtag"):
            Token.set_extension("hashtag", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("hashtag_finder", patterns)

    def __call__(self, doc):
        """
        Mark the token after each "#" as a hashtag.

        Args:
            doc: The Doc being processed by the pipeline.

        Returns:
            The same Doc with `token._.hashtag` set on the hashtag words.
        """
        matches = self.matcher(doc)
        spans = []  # Collect the matched spans here

        for match_id, start, end in matches:
            # A "#" at the very end of the text has no word to mark.
            if end >= len(doc):
                continue

            # A "#" followed by whitespace ("# word") is a free-standing
            # symbol, so the next token is not a hashtag.
            if doc[start].whitespace_:
                continue

            # The pattern matches a single token, so this span covers exactly
            # the token that follows the "#".
            spans.append(doc[start+1:end+1])

        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.hashtag = True  # Mark token as hashtag
        return doc
     
nlp.add_pipe("hashtag_finder", before="ner")  # Add component to the pipeline

<__main__.HashtagFinder at 0x7fac6db4e8f0>

In [8]:
doc = nlp(QUERY)

# One row per token: the token itself and its hashtag flag
data = [[token, token._.hashtag] for token in doc]

pd.DataFrame(data, columns=["Text", "Hashtag"])

Unnamed: 0,Text,Hashtag
0,gro√üe,False
1,Koalition,False
2,Merkel,False
3,#,False
4,GroKo,True
5,#,False
6,CDU,True


---
## 1.2 Named Entities
How are named entities detected? Especially those that are hashtags.

In [9]:
doc = nlp(QUERY)

# Pair every detected entity with the explanation of its label
data = [[ent.text, spacy.explain(ent.label_)] for ent in doc.ents]

displacy.render(doc, style="ent")
pd.DataFrame(data, columns=["Text", "NER Label"])

Unnamed: 0,Text,NER Label
0,Merkel,Named person or family.


It seems that named entities as well as hashtags are treated correctly. However, sometimes the named entities aren't detected. Now, let's have a look at which terms are relevant for POS Tagging.

---
## 1.3 Part of Speech Tagging

In [10]:
# Collect the linguistic attributes of every token, one row per token
data = [
    [
        token.text,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        token._.hashtag,
    ]
    for token in doc
]

pd.DataFrame(
    data,
    columns=["Text", "UPOS Tag", "Tag", "Syntactics", "Shape", "Alpha Token", "Stop Token", "Hashtag"],
)

Unnamed: 0,Text,UPOS Tag,Tag,Syntactics,Shape,Alpha Token,Stop Token,Hashtag
0,gro√üe,ADJ,ADJA,nk,xxxx,True,True,False
1,Koalition,NOUN,NN,ROOT,Xxxxx,True,False,False
2,Merkel,PROPN,NE,nk,Xxxxx,True,False,False
3,#,ADP,NN,nk,#,False,False,False
4,GroKo,PROPN,NN,nk,XxxXx,True,False,True
5,#,NOUN,NN,ROOT,#,False,False,False
6,CDU,PROPN,NE,nk,XXX,True,False,True


---
## 1.4 Candidate Selection
Extract terms that are used to find synonyms. The words to find synonyms for should be:
- verbs, nouns, proper nouns or adjectives
- no hashtags or entities
- only alphabet characters
- no e-mail, URLs or currencies

In [11]:
def select_candidate_terms(doc: spacy.tokens.doc.Doc, pos_tags):
    """
    Pick the tokens of `doc` for which synonyms should be looked up.

    A token is kept when its coarse POS tag is contained in `pos_tags`, it is
    not marked as a hashtag, it is alphabetic and it does not look like an
    e-mail address, a URL or a currency symbol. The tokens are returned in
    document order.
    """
    def _is_candidate(token) -> bool:
        # Same checks in the same order as a chain of early exits.
        if token.pos_ not in pos_tags:
            return False
        if token._.hashtag is True or token.is_alpha is False:
            return False
        # Stop words are deliberately not filtered out here.
        return not (token.like_email or token.like_url or token.is_currency)

    # Open question: lemmatize the selected tokens (token.lemma_) at the end?
    return [token for token in doc if _is_candidate(token)]


pos_tags = ["VERB", "NOUN", "PROPN", "ADJ"]
candidate_terms = select_candidate_terms(doc, pos_tags)

print(candidate_terms)

[gro√üe, Koalition, Merkel]


---
# 2. Word Embeddings
The following embeddings are applied to the selected terms
- FastText
- Word2Vec

In [12]:
# number of most similar words (synonyms) 
NUM_SIM_TERMS = 3

## 2.1 FastText

### Load FastText model with **SpaCy**
If a large german pipeline is loaded, word vectors are already available. Otherwise, a model can be loaded manually.

In [11]:
# To load the German FastText model run the following command
# !python -m spacy init vectors de ../data/fasttext/cc.de.300.zip ../models/de-fasttext-10000 --prune 10000

In [15]:
nlp_ft = spacy.load("../models/de-fasttext-10000/")

In [16]:
doc1 = nlp_ft("Haus")
doc2 = nlp_ft("Katze")
doc3 = nlp_ft("Hund")
doc4 = nlp_ft("Bude")

print(f"Haus | Katze -> {doc1.similarity(doc2)}")
print(f"Katze | Hund -> {doc2.similarity(doc3)}")
print(f"Haus | Bude -> {doc1.similarity(doc4)}")

Haus | Katze -> 0.367066624402335
Katze | Hund -> 0.7219703586509103
Haus | Bude -> 0.6441379922375441


In [None]:
# make sure to free up space afterwards
del nlp_ft

Since spaCy offers no built-in way to retrieve the most similar terms for a given term, the word vectors are loaded with Gensim and FastText instead.

---
### Load FastText model with **Gensim**

In [1]:
from gensim.models.fasttext import load_facebook_vectors, load_facebook_model

# model too big to load
gensim_ft_model = load_facebook_vectors("../data/fasttext/cc.de.300.bin")

: 

: 

Find most similar terms for a given term:

In [49]:
gensim_ft_model.most_similar("merkel")

[('merkels', 0.7103838920593262),
 ('gauck', 0.6791979670524597),
 ('sch√§uble', 0.6752827167510986),
 ('Merkel', 0.6560235023498535),
 ('kanzlerin', 0.6355306506156921),
 ('bundeskanzlerin', 0.626380980014801),
 ('westerwelle', 0.6188872456550598),
 ('steinbr√ºck', 0.6188532710075378),
 ('cdu', 0.6108233332633972),
 ('steinmeier', 0.6054732203483582)]

In [None]:
del gensim_ft_model

Loading a FastText model via Gensim takes an eternity. Also, out-of-vocabulary words aren't handled.
Therefore, this approach is dropped in favour of Facebook's FastText Python module.

---
### Load FastText model with **FastText**

In [13]:
import fasttext

ft_model = fasttext.load_model('../data/fasttext/cc.de.300.bin')



In [14]:
# Map every candidate term to its nearest neighbours; only the element at
# index 1 of each returned pair is kept.
ft_synonyms = {
    term.text: [
        neighbor[1]
        for neighbor in ft_model.get_nearest_neighbors(term.text, k=NUM_SIM_TERMS)
    ]
    for term in candidate_terms
}

print(ft_synonyms)

{'gro√üe': ['gr√∂√üere', 'grosse', 'riesengro√üe'], 'Koalition': ['Regierungskoalition', 'Koalitionsrunde', 'Koalitionspartei'], 'Merkel': ['Kanzlerin', 'Merkels', 'Bundeskanzlerin']}


In [15]:
del ft_model

The FastText module gives convincing results. Even out-of-vocabulary words are handled well, as expected.


---
## 2.2 Word2Vec

### Load Word2Vec model via **Gensim**


In [16]:
from gensim.models import KeyedVectors

gensim_w2v_model = KeyedVectors.load_word2vec_format(fname="../data/devmount/german.model", no_header=False, binary=True)

In [23]:
w2v_synonyms = {}

for term in candidate_terms:
    # Terms without an entry in the model are reported and skipped.
    if not gensim_w2v_model.has_index_for(term.text):
        print(f"The word '{term.text}' does not appear in this model")
        continue

    # Request exactly NUM_SIM_TERMS neighbours. Slicing the default result
    # would silently cap the number of synonyms at the default of 10.
    synonyms = gensim_w2v_model.most_similar(term.text, topn=NUM_SIM_TERMS)
    w2v_synonyms[term.text] = [neighbor[0] for neighbor in synonyms]


print(w2v_synonyms)

The word 'gro√üe' does not appear in this model
{'Koalition': ['Grosse_Koalition', 'Grossen_Koalition', 'Regierungskoalition'], 'Merkel': ['Kanzlerin_Merkel', 'Merkel_CDU', 'Bundeskanzlerin']}


In [None]:
del gensim_w2v_model

The model seems to work properly. However, it is case-sensitive and may require the terms to be lemmatized; otherwise the model can't find the correct word vector.

---
# 3. Elastic Search

Finally, the reformulated query is used to retrieve Tweets from the Elastic Search index.

## 3.1 Data Preparation
Obtain a list of Hashtags and Entities that are included in the query.

In [28]:
hashtags = [t for t in doc if t._.hashtag ]

pd.DataFrame(hashtags, columns=["Hashtag"])

Unnamed: 0,Hashtag
0,GroKo
1,CDU


In [25]:
entities = [ent.text for ent in doc.ents]

pd.DataFrame(entities, columns=["Entity"])

Unnamed: 0,Entity
0,Merkel


---
## 3.2 Query Formulation 

In [4]:
from utils import es_connect
import json

# Read the credentials inside a context manager so the file handle is closed again
with open('../es-credentials.json') as credentials_file:
    es_cred = json.load(credentials_file)

es_client = es_connect(credentials=es_cred)

Connecting to Elastic Search...
Successfully connected to https://localhost:9200


In [2]:
es_query = {
      "match": {
        "txt": "Ma√ü Bier"
    }
}

In [3]:
res = es_client.search(index="tweets", size=2, query=es_query)
res["hits"]["hits"]

[{'_index': 'tweets',
  '_id': '1443707088392409095',
  '_score': 16.841162,
  '_source': {'id': 1443707088392409095,
   'conversation_id': 1443707088392409095,
   'author_id': 235703405,
   'retweet_count': 6,
   'reply_count': 5,
   'like_count': 82,
   'created_at': '2021-10-01T00:39:27+02:00',
   'txt': 'Ich finde, CSU‚Äôler wie #Ramsauer w√§ren auf dem Oktoberfest bei einem Schweinshaxen und einer Ma√ü Bier besser aufgehoben als bei einer Talkshow von Markus #Lanz!'}},
 {'_index': 'tweets',
  '_id': '1423614844863979527',
  '_score': 9.578211,
  '_ignored': ['txt.keyword'],
  '_source': {'id': 1423614844863979527,
   'conversation_id': 1423614844863979527,
   'author_id': 940691491835564033,
   'retweet_count': 1,
   'reply_count': 0,
   'like_count': 9,
   'created_at': '2021-08-06T14:00:03+02:00',
   'txt': 'Heute ist der Internationale Tag des #Bieres, d.h.:\n1Ô∏è‚É£ #Freunde treffen, um gemeinsam Bier zu genie√üen.\n2Ô∏è‚É£Die M√§nner und Frauen zu ehren, welche das Bier braue

---
## 3.3 PostgreSQL

In [50]:
from sqlalchemy import create_engine, inspect
import json

# Read the credentials inside a context manager so the file handle is closed again
with open('../postgres-credentials.json') as credentials_file:
    data = json.load(credentials_file)

In [None]:
# establish connection to database via postgresql client
from urllib.parse import quote_plus

# URL-encode user and password: characters such as "@", ":" or "/" would
# otherwise be interpreted as part of the URL structure and break the connection.
engine = create_engine(
    f'postgresql://{quote_plus(str(data["USER"]))}:{quote_plus(str(data["PWD"]))}'
    f'@localhost:5432/{data["DB"]}'
)

In [None]:
# Maximum number of tweets to fetch
NUM_OF_TWEETS = 10

# Full-text search for "Merkel", newest tweets first.
# Every fragment ends with a space so that adjacent clauses never run together.
psql_query = (
    "select t.txt, t.created_at from tweet t "
    "where t.txt @@ to_tsquery('Merkel') "
    "order by t.created_at DESC "
    f"limit {NUM_OF_TWEETS}"
)

In [None]:
res = pd.read_sql_query(psql_query, con=engine)