# Twitter Query Reformulation 
Step by step building a custom pipeline to handle queries for Twitter database.

In [1]:
import pandas as pd
import spacy

from spacy import displacy

Download one of the predefined German models.

In [2]:
# !python -m spacy download de_core_news_sm
# !python -m spacy download de_core_news_lg

In [3]:
# select german package
MODEL = 'de_core_news_lg'

In [4]:
# load german language model
nlp = spacy.load(MODEL)

Define a user query to test the whole pipeline

In [5]:
QUERY = "Ist die große Koalition gescheitert unter Merkel? #Groko#SPD#CDU"

## Investigate Tokens from SpaCy

In [6]:
doc = nlp(QUERY)

# displacy.render(doc, style="dep", jupyter=True)
print([token.text for token in doc])


['Ist', 'die', 'große', 'Koalition', 'gescheitert', 'unter', 'Merkel', '?', '#', 'Groko#SPD#CDU']


---
# 1. Preprocessing
Hashtags are treated poorly. Detect them and prevent the tokenizer from splitting them.
- don't split hashtag and it's text
- split compound hashtags
- mark hashtags in SpaCy

The user mentions are kept as one token. 
- mark them as well

## 1.1 Tokenizer
Modify the tokenizer such that hashtags are not split at `#`

In [7]:
from spacy.tokenizer import _get_regex_pattern
import re

# get default pattern for tokens that don't get split
re_token_match = _get_regex_pattern(nlp.Defaults.token_match)

# add your patterns (here: hashtags and in-word hyphens)
re_token_match = f"({re_token_match}|#\w+|\w+-\w+)"

# overwrite token_match function of the tokenizer
nlp.tokenizer.token_match = re.compile(re_token_match).match

In [8]:
print([token.text for token in nlp(QUERY)])

['Ist', 'die', 'große', 'Koalition', 'gescheitert', 'unter', 'Merkel', '?', '#Groko#SPD#CDU']


Then make sure the whitespaces are set correctly in between the hashtags.

In [9]:
def seperate_hashtags(text: str):
    """
    Insert a whitespace if hashtags are missing a gap in between.  
    """
    for i, j in enumerate(text):
        if (text[i] == "#" and i > 0):
            if text[i-1] != " ":
                    text = text[:i] + " " + text[i:]
                    i+=1
    return text

QUERY = seperate_hashtags(QUERY)

print([token.text for token in nlp(QUERY)])

['Ist', 'die', 'große', 'Koalition', 'gescheitert', 'unter', 'Merkel', '?', '#Groko', '#SPD', '#CDU']


---
## 1.2 Matcher
Customize the Matcher to handle Tweet-specific syntax - i.e. hashtags.
- Mark Hashtag (#)
- Mark Twitter User (@)

In [10]:
from src.hashtag_matcher import create_hashtag_matcher
from src.user_matcher import create_user_matcher

nlp.add_pipe("hashtag_matcher") 
nlp.add_pipe("user_matcher") 

<src.user_matcher.UserMatcher at 0x7fcfa5aeefb0>

In [11]:
doc = nlp(QUERY)
data = []

for token in doc:
    data.append([token, token._.is_hashtag])
pd.DataFrame(data, columns=["Text", "is_hashtag"])

Unnamed: 0,Text,is_hashtag
0,Ist,False
1,die,False
2,große,False
3,Koalition,False
4,gescheitert,False
5,unter,False
6,Merkel,False
7,?,False
8,#Groko,True
9,#SPD,True


In [12]:
data = []

for token in doc:
    data.append([token, token._.is_user])
pd.DataFrame(data, columns=["Text", "is_user"])

Unnamed: 0,Text,is_user
0,Ist,False
1,die,False
2,große,False
3,Koalition,False
4,gescheitert,False
5,unter,False
6,Merkel,False
7,?,False
8,#Groko,False
9,#SPD,False


---
## 1.3 Named Entities
How are named entities detected? Especially those that are hashtags.

In [13]:
doc = nlp(QUERY)
data = []

for ent in doc.ents:
    data.append([ent.text, spacy.explain(ent.label_)])
    
# displacy.render(doc, style="ent")
pd.DataFrame(data, columns=["Text", "NER Label"])

Unnamed: 0,Text,NER Label


It seems that named entities are treated not optimally. Sometimes named entities aren't detected or the corresponding tokens don't make sense. 

---
## 1.4 Part of Speech Tagging

In [14]:
data = []

for token in doc:
    data.append ([token.text, token.pos_, token.tag_, token.dep_, token.shape_, token.is_alpha, token.is_stop, token._.is_hashtag, token._.is_user])

pd.DataFrame(data, columns=["Text", "UPOS Tag", "Tag", "Syntactics", "Shape", "Alpha Token", "Stop Token", "Hashtag", "User"], index=None)

Unnamed: 0,Text,UPOS Tag,Tag,Syntactics,Shape,Alpha Token,Stop Token,Hashtag,User
0,Ist,AUX,VAFIN,ROOT,Xxx,True,True,False,False
1,die,DET,ART,nk,xxx,True,True,False,False
2,große,ADJ,ADJA,nk,xxxx,True,True,False,False
3,Koalition,NOUN,NN,sb,Xxxxx,True,False,False,False
4,gescheitert,VERB,VVFIN,pd,xxxx,True,False,False,False
5,unter,ADP,APPR,mo,xxxx,True,True,False,False
6,Merkel,PROPN,NE,nk,Xxxxx,True,False,False,False
7,?,PUNCT,$.,punct,?,False,False,False,False
8,#Groko,PROPN,NE,nk,#Xxxxx,False,False,True,False
9,#SPD,NOUN,NN,ROOT,#XXX,False,False,True,False


---
## 1.5 Candidate Selection
Extract terms that are used to find synonyms. The words to find synonyms for should be:
- verbs or nouns
- no hashtags or users
- only alphabet characters
- no e-mail, URLs or currencies

In [15]:
def select_candidate_terms(doc: spacy.tokens.doc.Doc, pos_tags):
    """
    Select the tokens that should be used for finding similar terms.
    """
    candidate_terms = []

    for token in doc:
        if token.pos_ not in pos_tags:
            continue

        if token._.is_hashtag is True:
            continue

        if token._.is_user is True:
            continue

        if token.is_alpha is False:
            continue

        if token.like_email:
            continue

        if token.like_url:
            continue

        if token.is_currency:
            continue

        # TODO: lemmatize token ?
        candidate_terms.append(token.text)
    
    return candidate_terms

In [16]:
pos_tags = ["VERB", "NOUN", "PROPN", "ADJ"]
candidate_terms = select_candidate_terms(doc, pos_tags)

print(candidate_terms)

['große', 'Koalition', 'gescheitert', 'Merkel']


---
# 2. Word Embeddings
The following embeddings are applied to the selected terms
- FastText
- Word2Vec

In [17]:
# number of most similar words (synonyms) 
NUM_SIM_TERMS = 3

## 2.1 FastText

Load FastText model with **FastText**

In [18]:
# Download german model from fasttext website
# !wget -P ./data/fasttext https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.de.300.bin.gz

In [19]:
# unzip the fasttext model
# !gunzip -d ./data/fasttext/cc.de.300.bin.gz

In [18]:
import fasttext
ft_model = fasttext.load_model('data/fasttext/cc.de.300.bin')



In [19]:
ft_synonyms = {}

# obtain candidate terms and store them in a json object
for term in candidate_terms:
    synonyms = ft_model.get_nearest_neighbors(term, k=NUM_SIM_TERMS)
    ft_synonyms[f"{term}"] = [n[1] for n in synonyms]
    
print(ft_synonyms)

{'große': ['größere', 'grosse', 'riesengroße'], 'Koalition': ['Regierungskoalition', 'Koalitionsrunde', 'Koalitionspartei'], 'gescheitert': ['scheitert', 'Gescheitert', 'gescheitert.'], 'Merkel': ['Kanzlerin', 'Merkels', 'Bundeskanzlerin']}


In [20]:
del ft_model

The FastText module gives pretty fancy results. Even out-of-vocabulary words are treated well as expected.


---
## 2.2 Word2Vec


Load Word2Vec model via **Gensim**

In [None]:
# Download german model from devmount website
!wget -P ./data/fasttext https://cloud.devmount.de/d2bc5672c523b086/german.model

In [12]:
from gensim.models import KeyedVectors

gensim_w2v_model = KeyedVectors.load_word2vec_format(fname="data/word2vec/german.model", no_header=False, binary=True)

In [90]:
w2v_synonyms = {}

# obtain candidate terms and store them in a json object
for term in candidate_terms:
    if not gensim_w2v_model.has_index_for(term):
        print(f"The word '{term}' does not appear in this model")
    else:
        synonyms = gensim_w2v_model.most_similar(term)[:NUM_SIM_TERMS]
        w2v_synonyms[f"{term}"] = [n[0].replace("_"," ") for n in synonyms]

print(w2v_synonyms)

The word 'groß' does not appear in this model
{'Koalition': ['Grosse Koalition', 'Grossen Koalition', 'Regierungskoalition'], 'scheitern': ['gescheitert', 'scheitert', 'platzen'], 'Merkel': ['Kanzlerin Merkel', 'Merkel CDU', 'Bundeskanzlerin']}


In [91]:
del gensim_w2v_model

Model seems to work properly. However, it is case-sensitive and requires to lemmatize the terms. Otherwise the model can't find the correct word vector.

---
# 3. Elastic Search

Finally, the obtained terms are used to retrieve Tweets from the Elastic Search index. Beforehand, the most relevant expansion terms must be determined. For this purpose, the [Adjacency Matrix Aggregations](https://www.elastic.co/guide/en/elasticsearch/reference/current/search-aggregations-bucket-adjacency-matrix-aggregation.html) are utilized. Afterwards, the hashtags, twitter users and entities are prepared. Given the final expansion terms, the Elastic Search template is loaded and the query is executed on the specified `INDEX`.

In [22]:
from src.utils import es_connect

import json
import configparser

config = configparser.ConfigParser()
config.read('auth/es-credentials.ini')

es_client = es_connect(credentials=config["ELASTIC"])

Connecting to Elastic Search...
Successfully connected to https://localhost:9200


In [23]:
# Name of the Elastic Search index 
INDEX = "tweets"

---
## 3.1 Aggregation Query
Now, it must be determined which of the terms of the initial query should be replaced or used to expand the query. For this purpose the co-occurrence of the expansion terms as well as initial terms are investigated. Terms that occur often together might be suitable expansions for the final query.

In [24]:
# load the predefined aggregation query
es_agg_query = json.load(open('config/es-adjacency-matrix.conf'))

In [28]:
filters = es_agg_query["aggs"]["interactions"]["adjacency_matrix"]["filters"]

# compose the aggregation query with the candidate terms
for term in candidate_terms:
    for synonym in ft_synonyms[term]:
        filters[term+"+"+synonym] = { "terms" : { "txt" : [term.lower(), synonym.lower()] }}

In [29]:
# execute the search aggregation query
res = es_client.search(index=INDEX, size=es_agg_query["size"], aggregations=es_agg_query["aggs"])

# get the aggregations and their score from the response
aggregations = [(t["key"], t["doc_count"]) for t in res["aggregations"]["interactions"]["buckets"]]

# sort the aggregations based on their score
aggregations.sort(key=lambda x:x[1], reverse=True)

# print the results
print("Took",res["took"],"ms\n")
pd.DataFrame(aggregations, columns=["Term Aggregation", "Document Count"])

Took 215 ms



Unnamed: 0,Term Aggregation,Document Count
0,Merkel+Kanzlerin,3420
1,Merkel+Bundeskanzlerin,3169
2,Merkel+Bundeskanzlerin&Merkel+Kanzlerin,2909
3,Merkel+Bundeskanzlerin&Merkel+Merkels,2908
4,Merkel+Kanzlerin&Merkel+Merkels,2908
5,Merkel+Merkels,2908
6,Koalition+Regierungskoalition,2594
7,Koalition+Koalitionspartei,2519
8,Koalition+Koalitionspartei&Koalition+Koalition...,2516
9,Koalition+Koalitionspartei&Koalition+Regierung...,2516


In [None]:
# choose the 'best' expansion terms
expansion_terms = []

## 3.2 Data Preparation
Obtain a list of Hashtags, Twitter Users and Entities that are included in the query.

In [30]:
hashtags = [t.text.lower() for t in doc if t._.is_hashtag ]

pd.DataFrame(hashtags, columns=["Hashtag"])

Unnamed: 0,Hashtag
0,#groko
1,#spd
2,#cdu


In [31]:
users = [t.text.lower() for t in doc if t._.is_user ]

pd.DataFrame(users, columns=["User"])

Unnamed: 0,User


In [32]:
entities = [ent.text.lower() for ent in doc.ents]

pd.DataFrame(entities, columns=["Entity"])

Unnamed: 0,Entity


---
## 3.3 Query Formulation 
Finally, the resulting terms must be arranged in an Elastic Search query. Define a pattern to retrieve relevant tweets.

Develop a pattern for an Elastic Search query with
- boolean operators (`AND`, `OR`)
- boosting  `^`
- filter

The following Hyperparameters are set in order to modify the query:

In [33]:
# Are retweets allowed?
RETWEET = False

# How much is the matching of hashtags boosted? 
HASHTAG_BOOST = 0.5

# Range of Tweets to be included (FROM, TO)
TWEET_RANGE = ("2021-01-01", "2023-01-01")

In [34]:
# Load the pre-configured template for an elastic search query
es_query = json.load(open('config/es-query.conf'))
query = es_query['query']

### Manipulate Query 

In [35]:
# set if retweets are allowed
if RETWEET:
    del query['bool']['must_not']['term']

In [36]:
# TODO: for testing I use candidate terms which are not yet applied to word embeddings
query['bool']['should'][0]['match']['txt']['query'] = ' '.join(candidate_terms)
query['bool']['should'][1]['terms']['hashtags'] = candidate_terms

In [37]:
# insert the hashtags if present in initial user query
if len(hashtags) > 0 :
    query['bool']['must']['terms_set']['hashtags']['terms'] = [h[1:].lower() for h in hashtags]
else:
    del query['bool']['must']

In [38]:
if HASHTAG_BOOST is not None:
    query['bool']['should'][1]['terms']["boost"] = HASHTAG_BOOST

In [39]:
# TODO
for entity in entities:
    ...

In [40]:
# set date range for tweets
query['bool']['filter'][0]['range']['created_at']['gte'] = TWEET_RANGE[0]
query['bool']['filter'][1]['range']['created_at']['lte'] = TWEET_RANGE[1]

In [41]:
# Final Query
es_query["query"]

{'bool': {'should': [{'match': {'txt': {'query': 'große Koalition gescheitert Merkel',
      'operator': 'OR'}}},
   {'terms': {'hashtags': ['große', 'Koalition', 'gescheitert', 'Merkel'],
     'boost': 0.5}}],
  'must': {'terms_set': {'hashtags': {'terms': ['groko', 'spd', 'cdu'],
     'minimum_should_match_script': {'source': 'Math.min(params.num_terms, 1)'}}}},
  'must_not': {'term': {'txt': '_retweet_'}},
  'filter': [{'range': {'created_at': {'gte': '2021-01-01'}}},
   {'range': {'created_at': {'lte': '2023-01-01'}}}]}}

### Execute the final Query

In [42]:
res = es_client.search(index=INDEX, size=es_query['size'], query=es_query["query"], aggregations=es_query["aggs"])

print(f'Total of {res["hits"]["total"]["value"]} hits in {res["took"]}ms \n')

for i, doc in enumerate(res["hits"]["hits"]):
    print("Tweet", i, "\n", doc["_source"], "\n")


Total of 4561 hits in 540ms 

Tweet 0 
 {'retweet_count': 28, 'reply_count': 7, 'like_count': 191, 'created_at': '2021-09-02T20:44:54+02:00', 'txt': 'Das Versagen der #GroKo #cdu #spd in einem Tweet \n👎🏼Parteitaktik über alles \n👎🏼in 4 J. keine wirkliche Reform hinbekommen\n👎🏼 Oppositionsvorschl. wie immer abgelehnt \n\n👎🏼👎🏼 Konsequenz: evtl über 900 MdB inkl. riesiger Kosten &amp; Chaos https://t.co/K9s1T8dVH5', 'hashtags': ['spd', 'cdu', 'groko'], 'word_count': 35} 

Tweet 1 
 {'retweet_count': 42, 'reply_count': 17, 'like_count': 350, 'created_at': '2021-08-29T15:25:10+02:00', 'txt': 'Von den letzten 16 Jahren hat die #SPD 12 Jahre mit der #CDU regiert. Die #SPD hat Scholz mit großem Getöse nicht zum Parteivorsitzenden gewählt,mit dem Argument,er stünde für die #GroKo Jetzt ist er Kanzlerkandidat und kokettiert offen damit merkellike zu sein.Die Wahrheit ist:', 'hashtags': ['spd', 'spd', 'cdu', 'groko'], 'word_count': 44} 

Tweet 2 
 {'retweet_count': 49, 'reply_count': 41, 'like_co