# Twitter Query Reformulation 
A step-by-step walkthrough of building a custom pipeline that handles queries against a Twitter database.

In [1]:
import pandas as pd
import spacy

from spacy import displacy

Download one of the predefined German models.

In [2]:
# !python -m spacy download de_core_news_sm
# !python -m spacy download de_core_news_lg

In [3]:
# select german package
MODEL = 'de_core_news_lg'

In [4]:
# load german language model
nlp = spacy.load(MODEL)

Define a user query to test the whole pipeline

In [5]:
QUERY = "@amthor große Koalition Merkel #GroKo #CDU#SPD"

## Investigate Tokens from SpaCy

In [6]:
doc = nlp(QUERY)

# displacy.render(doc, style="dep", jupyter=True)
print([token.text for token in doc])


['@amthor', 'große', 'Koalition', 'Merkel', '#', 'GroKo', '#', 'CDU#SPD']


---
# 1. Preprocessing
Hashtags are treated poorly. Detect them and prevent the tokenizer from splitting them.
- don't split a hashtag from its text
- split compound hashtags
- mark hashtags in SpaCy

The user mentions are kept as one token. 
- mark them as well

## 1.1 Tokenizer
Modify the tokenizer such that hashtags are not split at `#`

In [7]:
from spacy.tokenizer import _get_regex_pattern
import re

# Pattern spaCy uses by default to recognise tokens that must never be split.
default_token_pattern = _get_regex_pattern(nlp.Defaults.token_match)

# Extend it with our own alternatives: hashtags (`#\w+`) and in-word hyphens (`\w+-\w+`).
# A raw f-string is required here: `\w` is a regex character class, not a valid
# Python string escape, and non-raw usage triggers an invalid-escape warning.
re_token_match = rf"({default_token_pattern}|#\w+|\w+-\w+)"

# Overwrite the tokenizer's token_match callable with the bound `match` of the
# compiled pattern (`match` anchors at the start of the candidate string only).
nlp.tokenizer.token_match = re.compile(re_token_match).match

In [8]:
print([token.text for token in nlp(QUERY)])

['@amthor', 'große', 'Koalition', 'Merkel', '#GroKo', '#CDU#SPD']


Then make sure that hashtags written directly after one another are separated by whitespace.

In [9]:
def seperate_hashtags(text: str) -> str:
    """
    Insert a single blank in front of every hashtag that is glued to the preceding text.

    A ``#`` gets a blank inserted before it when it is not the first character
    and the character directly before it is not a blank (``" "``). Any other
    preceding character - letters, another ``#``, tabs, newlines - counts as
    "glued".

    Args:
        text: The raw query string.

    Returns:
        A new string with the missing blanks inserted, e.g.
        ``"#CDU#SPD#FDP"`` becomes ``"#CDU #SPD #FDP"``.
    """
    pieces = []
    for position, char in enumerate(text):
        # Always look at the unmodified input: inserting blanks into the string
        # that is being scanned shifts the indices and lets later hashtags slip through.
        if char == "#" and position > 0 and text[position - 1] != " ":
            pieces.append(" ")
        pieces.append(char)
    return "".join(pieces)

QUERY = seperate_hashtags(QUERY)

print([token.text for token in nlp(QUERY)])

['@amthor', 'große', 'Koalition', 'Merkel', '#GroKo', '#CDU', '#SPD']


---
## 1.2 Matcher
Customize the Matcher to handle Tweet-specific syntax - i.e. hashtags.
- Mark Hashtag (#)
- Mark Twitter User (@)

In [10]:
from src.hashtag_matcher import create_hashtag_matcher
from src.user_matcher import create_user_matcher

nlp.add_pipe("hashtag_matcher") 
nlp.add_pipe("user_matcher") 

<src.user_matcher.UserMatcher at 0x7f07709a7160>

In [11]:
# Process the query with the current pipeline.
doc = nlp(QUERY)

# One row per token: the token and the value of its custom `is_hashtag` extension.
data = [[token, token._.is_hashtag] for token in doc]
pd.DataFrame(data, columns=["Text", "is_hashtag"])

Unnamed: 0,Text,is_hashtag
0,@amthor,False
1,große,False
2,Koalition,False
3,Merkel,False
4,#GroKo,True
5,#CDU,True
6,#SPD,True


In [12]:
# One row per token: the token and the value of its custom `is_user` extension.
data = [[token, token._.is_user] for token in doc]
pd.DataFrame(data, columns=["Text", "is_user"])

Unnamed: 0,Text,is_user
0,@amthor,True
1,große,False
2,Koalition,False
3,Merkel,False
4,#GroKo,False
5,#CDU,False
6,#SPD,False


---
## 1.3 Named Entities
How are named entities detected? Especially those that are hashtags.

In [13]:
# Process the query with the current pipeline.
doc = nlp(QUERY)

# One row per detected entity: its text and the human-readable explanation of its label.
data = [[ent.text, spacy.explain(ent.label_)] for ent in doc.ents]

# displacy.render(doc, style="ent")
pd.DataFrame(data, columns=["Text", "NER Label"])

Unnamed: 0,Text,NER Label
0,Merkel,Named person or family.


It seems that named entities are not handled optimally. Sometimes named entities aren't detected, or the corresponding tokens don't make sense.

---
## 1.4 Part of Speech Tagging

In [14]:
# Column headers, in the same order as the attributes collected per token below.
columns = ["Text", "UPOS Tag", "Tag", "Syntactics", "Shape", "Alpha Token", "Stop Token", "Hashtag", "User"]

# One row per token with its linguistic attributes and the two custom flags.
data = [
    [
        token.text,
        token.pos_,
        token.tag_,
        token.dep_,
        token.shape_,
        token.is_alpha,
        token.is_stop,
        token._.is_hashtag,
        token._.is_user,
    ]
    for token in doc
]

pd.DataFrame(data, columns=columns)

Unnamed: 0,Text,UPOS Tag,Tag,Syntactics,Shape,Alpha Token,Stop Token,Hashtag,User
0,@amthor,PROPN,NE,ROOT,@xxxx,False,False,False,True
1,große,ADJ,ADJA,nk,xxxx,True,True,False,False
2,Koalition,NOUN,NN,ROOT,Xxxxx,True,False,False,False
3,Merkel,PROPN,NE,nk,Xxxxx,True,False,False,False
4,#GroKo,PROPN,NE,nk,#XxxXx,False,False,True,False
5,#CDU,PROPN,NE,ROOT,#XXX,False,False,True,False
6,#SPD,PROPN,NE,nk,#XXX,False,False,True,False


---
## 1.5 Candidate Selection
Extract terms that are used to find synonyms. The words to find synonyms for should be:
- verbs or nouns
- no hashtags or users
- only alphabet characters
- no e-mail, URLs or currencies

In [15]:
def select_candidate_terms(doc: spacy.tokens.doc.Doc, pos_tags):
    """
    Collect the surface forms of all tokens that qualify for a similar-term lookup.

    A token qualifies when all of the following hold:
    - its coarse part-of-speech tag (``token.pos_``) is contained in ``pos_tags``
    - it is flagged neither as hashtag nor as user (custom ``token._`` extensions)
    - it is alphabetic
    - it does not look like an e-mail address, a URL or a currency symbol

    Stop words are not filtered out, and the unlemmatized ``token.text`` is returned.

    Args:
        doc: The processed query.
        pos_tags: Container of accepted ``token.pos_`` values.

    Returns:
        List of token texts in document order.
    """
    def is_candidate(token) -> bool:
        # Checks run in the same order as listed in the docstring; the first failing one rejects the token.
        if token.pos_ not in pos_tags:
            return False
        # Identity comparison on purpose: only a literal True flag excludes the token.
        if token._.is_hashtag is True or token._.is_user is True:
            return False
        # Likewise, only a literal False rejects the token here.
        if token.is_alpha is False:
            return False
        return not (token.like_email or token.like_url or token.is_currency)

    return [token.text for token in doc if is_candidate(token)]

In [16]:
pos_tags = ["VERB", "NOUN", "PROPN", "ADJ"]
candidate_terms = select_candidate_terms(doc, pos_tags)

print(candidate_terms)

['große', 'Koalition', 'Merkel']


---
# 2. Word Embeddings
The following embeddings are applied to the selected terms
- FastText
- Word2Vec

In [17]:
# number of most similar words (synonyms) 
NUM_SIM_TERMS = 3

## 2.1 FastText

Load FastText model with **FastText**

In [18]:
import fasttext

ft_model = fasttext.load_model('data/fasttext/cc.de.300.bin')



In [21]:
# Map every candidate term to its NUM_SIM_TERMS nearest neighbours in the FastText model.
# Only the element at index 1 of each returned neighbour entry is kept.
ft_synonyms = {
    f"{term}": [neighbor[1] for neighbor in ft_model.get_nearest_neighbors(term, k=NUM_SIM_TERMS)]
    for term in candidate_terms
}

print(ft_synonyms)

{'große': ['größere', 'grosse', 'riesengroße'], 'Koalition': ['Regierungskoalition', 'Koalitionsrunde', 'Koalitionspartei'], 'Merkel': ['Kanzlerin', 'Merkels', 'Bundeskanzlerin']}


In [22]:
del ft_model

The FastText model gives convincing results. As expected, even out-of-vocabulary words are handled well.


---
## 2.2 Word2Vec


Load Word2Vec model via **Gensim**

In [24]:
from gensim.models import KeyedVectors

gensim_w2v_model = KeyedVectors.load_word2vec_format(fname="data/devmount/german.model", no_header=False, binary=True)

In [27]:
w2v_synonyms = {}

for term in candidate_terms:

    # Terms without an index in the model are reported and skipped.
    if not gensim_w2v_model.has_index_for(term):
        print(f"The word '{term}' does not appear in this model")
        continue

    # Request exactly NUM_SIM_TERMS neighbours via `topn`. Slicing the default
    # result list instead would silently cap the number of synonyms at the
    # library's default list length.
    synonyms = gensim_w2v_model.most_similar(term, topn=NUM_SIM_TERMS)

    # Keep only the element at index 0 of each neighbour entry (the word).
    w2v_synonyms[f"{term}"] = [neighbor[0] for neighbor in synonyms]


print(w2v_synonyms)

The word 'große' does not appear in this model
{'Koalition': ['Grosse_Koalition', 'Grossen_Koalition', 'Regierungskoalition'], 'Merkel': ['Kanzlerin_Merkel', 'Merkel_CDU', 'Bundeskanzlerin']}


In [29]:
del gensim_w2v_model

In [28]:
# TODO: lemmatize the terms 

The model seems to work properly. However, it is case-sensitive and may require lemmatizing the terms; otherwise the model can't find the correct word vector.

---
# 3. Elastic Search

Finally, the reformulated query is used to retrieve tweets from the Elasticsearch index.

## 3.1 Data Preparation
Obtain a list of Hashtags, Twitter Users and Entities that are included in the query.

In [30]:
hashtags = [t.text for t in doc if t._.is_hashtag ]

pd.DataFrame(hashtags, columns=["Hashtag"])

Unnamed: 0,Hashtag
0,#GroKo
1,#CDU
2,#SPD


In [31]:
users = [t.text for t in doc if t._.is_user ]

pd.DataFrame(users, columns=["User"])

Unnamed: 0,User
0,@amthor


In [32]:
entities = [ent.text for ent in doc.ents]

pd.DataFrame(entities, columns=["Entity"])

Unnamed: 0,Entity
0,Merkel


In [33]:
# for test purposes use a predefined list of synonyms
ft_synonyms = {'große': ['größere', 'grosse', 'riesengroße'], 'Koalition': ['Regierungskoalition', 'Koalitionsrunde', 'Koalitionspartei'], 'Merkel': ['Kanzlerin', 'Merkels', 'Bundeskanzlerin']}

---
## 3.2 Expansion Handling
Now, it must be determined which of the terms of the initial query should be replaced or used to expand the query.

In [None]:
# TODO: use co-occurrences to find suitable terms

# The first query variant is the unchanged list of candidate terms.
ft_queries = [list(candidate_terms)]

# Every further variant swaps exactly one term for one of its synonyms;
# the synonym is placed first, followed by the remaining original terms.
for term in candidate_terms:
    # Terms without an entry in the synonym dictionary contribute no variants
    # instead of aborting the cell with a KeyError.
    for synonym in ft_synonyms.get(term, []):
        ft_queries.append([synonym] + [t for t in candidate_terms if t != term])

ft_queries

---
## 3.3 Query Formulation 
Finally, the resulting terms must be arranged in an Elasticsearch query. Define a pattern to retrieve relevant tweets.

In [49]:
from src.utils import es_connect

import json
import configparser

config = configparser.ConfigParser()

# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed. Fail with a clear message here instead of an opaque
# KeyError on the "ELASTIC" section below.
if not config.read('auth/es-credentials.ini'):
    raise FileNotFoundError("Credentials file 'auth/es-credentials.ini' could not be read")

# Hand the [ELASTIC] section to the project helper that opens the connection.
es_client = es_connect(credentials=config["ELASTIC"])

Connecting to Elastic Search...
Successfully connected to https://localhost:9200


Develop a pattern for an Elastic Search query with
- boolean operators (`AND`, `OR`)
- boosting  `^`
- filter

The following hyperparameters are set in order to modify the query:

In [50]:
# Name of the Elastic Search index 
INDEX = "tweets_ngram"

# Number of tweets displayed as result
SIZE = 10

# Are retweets allowed?
RETWEET = False

# How much is the matching of hashtags boosted? 
HASHTAG_BOOST = None

# Range of Tweets to be included (FROM, TO)
TWEET_RANGE = ("2021-01-01", "2023-01-01")

In [39]:
# Load the pre-configured template for an Elasticsearch query.
# The context manager closes the file handle as soon as the JSON is parsed.
with open('config/es-query.conf') as template_file:
    query = json.load(template_file)['query']
query

{'bool': {'should': {'query_string': {'query': '',
    'fields': ['txt', 'hashtags']}},
  'must': {'match': {'hashtags': ''}},
  'must_not': {'term': {'txt': '_retweet_'}},
  'filter': [{'range': {'created_at': {'gte': ''}}},
   {'range': {'created_at': {'lte': ''}}}]}}

### Manipulate Query 

In [40]:
# set if retweets are allowed
if RETWEET:
    # Remove the whole `must_not` clause. Deleting only the inner `term` would
    # leave an empty `must_not: {}` object in the query, which Elasticsearch is
    # expected to reject as an empty clause (NOTE(review): confirm against the
    # cluster version in use). pop() with a default keeps the cell re-runnable.
    query['bool'].pop('must_not', None)

In [41]:
# TODO: for testing I use candidate terms which are not yet applied to word embeddings
query['bool']['should']['query_string']['query'] = ' '.join(candidate_terms)

In [42]:
# insert the hashtags if present in initial user query
if not hashtags:
    # no hashtag in the query: the mandatory hashtag match is dropped entirely
    del query['bool']['must']
else:
    # the first character of every hashtag (the '#') is cut off before matching
    query['bool']['must']['match']['hashtags'] = ' '.join(tag[1:] for tag in hashtags)

In [43]:
if HASHTAG_BOOST is not None:
    fields = query['bool']['should']['query_string']["fields"]
    # Rebuild the entry from its bare field name (everything before a "^"), so
    # that re-running this cell replaces the boost instead of appending a
    # second "^..." suffix to the already boosted field.
    fields[1] = f"{fields[1].split('^')[0]}^{HASHTAG_BOOST}"

In [44]:
# TODO
for entity in entities:
    ...

In [None]:
# set date range for tweets: first filter entry takes the lower bound (gte),
# second filter entry takes the upper bound (lte)
date_filters = query['bool']['filter']
date_filters[0]['range']['created_at']['gte'] = TWEET_RANGE[0]
date_filters[1]['range']['created_at']['lte'] = TWEET_RANGE[1]

In [46]:
# Final Query
query

{'bool': {'should': {'query_string': {'query': 'große Koalition Merkel',
    'fields': ['txt', 'hashtags']}},
  'must': {'match': {'hashtags': 'GroKo CDU SPD'}},
  'must_not': {'term': {'txt': '_retweet_'}},
  'filter': [{'range': {'created_at': {'gte': '2021-01-01'}}},
   {'range': {'created_at': {'lte': '2023-01-01'}}}]}}

### Execute the final Query

In [51]:
# Send the final query to the configured index and fetch at most SIZE tweets.
res = es_client.search(index=INDEX, size=SIZE, query=query)

print(f'Total of {res["hits"]["total"]["value"]} hits in {res["took"]}ms \n')

# The loop variable is named `hit`, not `doc`: `doc` holds the spaCy document
# that earlier cells read, and it must not be overwritten by a search result.
for i, hit in enumerate(res["hits"]["hits"]):
    print("Tweet", i, "\n", hit["_source"], "\n")


Total of 695 hits in 283ms 

Tweet 0 
 {'retweet_count': 0, 'reply_count': 0, 'like_count': 0, 'created_at': '2021-09-01T14:26:57+02:00', 'txt': 'Sätze die formal korrekt sind aber halt auch Bekenntnisse.\n\n“Und wir [Wer? Politik? Boomer?] müssen auch noch mehr [weil ihr das schon so viel tut?] mit der jüngeren Generation [weil nur die sich fürs Klima interessiert 🤔] sprechen.” [zuhören! Ernst nehmen! Handel!]\n#SPD #GroKo https://t.co/cA0tCDzFYA', 'hashtags': ['spd', 'groko'], 'word_count': 43} 

Tweet 1 
 {'retweet_count': 42, 'reply_count': 17, 'like_count': 350, 'created_at': '2021-08-29T15:25:10+02:00', 'txt': 'Von den letzten 16 Jahren hat die #SPD 12 Jahre mit der #CDU regiert. Die #SPD hat Scholz mit großem Getöse nicht zum Parteivorsitzenden gewählt,mit dem Argument,er stünde für die #GroKo Jetzt ist er Kanzlerkandidat und kokettiert offen damit merkellike zu sein.Die Wahrheit ist:', 'hashtags': ['spd', 'spd', 'cdu', 'groko'], 'word_count': 44} 

Tweet 2 
 {'retweet_count': 1