# Twitter Query Reformulation 
A step-by-step walkthrough of building a custom pipeline that reformulates queries for a Twitter database.

In [1]:
import pandas as pd
import spacy
from spacy.language import Language
from spacy.matcher import Matcher
from spacy.tokens import Token
from spacy import displacy

Download one of the predefined German models.

In [2]:
# !python -m spacy download de_core_news_sm
# !python -m spacy download de_core_news_lg

In [3]:
# name of the German spaCy pipeline to load (one of the models listed in the download cell above)
MODEL = 'de_core_news_lg'

In [4]:
# load german language model
nlp = spacy.load(MODEL)

Input a query for testing purposes...

In [5]:
QUERY = "gro√üe Koalition Merkel #GroKo #CDU"

---
## Investigate Tokens from SpaCy

In [6]:
# Parse the test query with the German pipeline.
doc = nlp(QUERY)

# Draw the dependency tree inline in the notebook.
displacy.render(doc, style="dep", jupyter=True)

# Print the tokens one per line to see how the tokenizer split the query.
for tok in doc:
    print(tok)

gro√üe
Koalition
Merkel
#
GroKo
#
CDU


Hashtags are treated poorly. How to detect them and prevent the tokenizer from splitting them?
- split compound hashtags
- mark hashtags in SpaCy

---
# 1. Preprocessing
First, make sure that every hashtag is separated from the preceding text by a whitespace.

In [7]:
def seperate_hashtags(text: str) -> str:
    """
    Insert a whitespace in front of every hashtag that is missing one.

    A single space is added before each "#" whose preceding character is not
    a space. A "#" at the very start of the text is left as it is.

    Args:
        text: The raw query string.

    Returns:
        The query with every "#" (except a leading one) preceded by a space.
    """
    # Build the result from pieces instead of editing the string while
    # iterating over it: inserting characters shifts all later positions, so
    # an index-based loop over the original length misses trailing hashtags.
    pieces = []
    previous = ""
    for char in text:
        if char == "#" and previous not in ("", " "):
            pieces.append(" ")
        pieces.append(char)
        previous = char
    return "".join(pieces)

# Normalise the test query; hashtags that already follow a space are left unchanged.
QUERY = seperate_hashtags(QUERY)

print(QUERY)

gro√üe Koalition Merkel #GroKo #CDU


---
## 1.1 Matcher
Customize the Matcher to handle Tweet-specific syntax - i.e. hashtags.
- Tokenize by Hashtag (#)
- by Twitter entity (@)

In [8]:
@Language.factory("hashtag_finder")
def create_hashtag_finder(nlp, name):
    """Factory registered as "hashtag_finder"; builds the component from the pipeline's vocab."""
    return HashtagFinder(nlp.vocab)

class HashtagFinder:
    """
    The purpose of this class is to detect hashtags and mark them.

    The token that directly follows a "#" token is flagged through the custom
    token extension ``token._.hashtag`` (default ``False``). The "#" token
    itself is not flagged.
    """
    def __init__(self, vocab):
        # A single "#" token is the only pattern that is matched.
        patterns = [ [{"ORTH": "#"}] ]

        # Register the token extension only once: set_extension raises when
        # the name already exists, e.g. if this cell is executed a second time
        # or a second component instance is created.
        if not Token.has_extension("hashtag"):
            Token.set_extension("hashtag", default=False)
        self.matcher = Matcher(vocab)
        self.matcher.add("hashtag_finder", patterns)

    def __call__(self, doc):
        # This method is invoked when the component is called on a Doc
        matches = self.matcher(doc)
        spans = []  # Collect the matched spans here

        for match_id, start, end in matches:
            # TODO: what happens if whitespace after hashtag?
            # The match covers the "#" itself, so the span shifted by one is
            # the token right after it; a trailing "#" has no successor.
            if (end < len(doc)):
                spans.append(doc[start+1:end+1])

        with doc.retokenize() as retokenizer:
            for span in spans:
                retokenizer.merge(span)
                for token in span:
                    token._.hashtag = True  # Mark token as hashtag
        return doc

# Add the component to the pipeline only if it is not part of it yet, so that
# re-running this cell does not fail on a duplicate component name.
if "hashtag_finder" not in nlp.pipe_names:
    nlp.add_pipe("hashtag_finder", before="ner")

<__main__.HashtagFinder at 0x7f913a1776d0>

In [9]:
# Run the pipeline again so that the hashtag component is applied to the query.
doc = nlp(QUERY)

# One row per token: the token itself and its hashtag flag.
data = [[tok, tok._.hashtag] for tok in doc]
pd.DataFrame(data, columns=["Text", "Hashtag"])

Unnamed: 0,Text,Hashtag
0,gro√üe,False
1,Koalition,False
2,Merkel,False
3,#,False
4,GroKo,True
5,#,False
6,CDU,True


---
## 1.2 Named Entities
How are named entities detected? Especially those that are hashtags.

In [10]:
doc = nlp(QUERY)

# One row per recognised entity: its text and the explanation of its label.
data = [[entity.text, spacy.explain(entity.label_)] for entity in doc.ents]

displacy.render(doc, style="ent")
pd.DataFrame(data, columns=["Text", "NER Label"])

Unnamed: 0,Text,NER Label
0,Merkel,Named person or family.


It seems that named entities as well as hashtags are treated correctly. However, named entities are sometimes not detected. Next, let's look at the part-of-speech tags to see which terms are relevant.

---
## 1.3 Part of Speech Tagging

In [11]:
# Collect the linguistic attributes of every token, one row per token.
data = [
    [tok.text, tok.pos_, tok.tag_, tok.dep_, tok.shape_, tok.is_alpha, tok.is_stop, tok._.hashtag]
    for tok in doc
]

pos_columns = ["Text", "UPOS Tag", "Tag", "Syntactics", "Shape", "Alpha Token", "Stop Token", "Hashtag"]
pd.DataFrame(data, columns=pos_columns, index=None)

Unnamed: 0,Text,UPOS Tag,Tag,Syntactics,Shape,Alpha Token,Stop Token,Hashtag
0,gro√üe,ADJ,ADJA,nk,xxxx,True,True,False
1,Koalition,NOUN,NN,ROOT,Xxxxx,True,False,False
2,Merkel,PROPN,NE,nk,Xxxxx,True,False,False
3,#,ADP,NN,nk,#,False,False,False
4,GroKo,PROPN,NN,nk,XxxXx,True,False,True
5,#,NOUN,NN,ROOT,#,False,False,False
6,CDU,PROPN,NE,nk,XXX,True,False,True


---
## 1.4 Candidate Selection
Extract terms that are used to find synonyms. The words to find synonyms for should be:
- verbs or nouns
- no hashtags or entities
- only alphabet characters
- no e-mail, URLs or currencies

In [12]:
def select_candidate_terms(doc: spacy.tokens.doc.Doc, pos_tags):
    """
    Collect the texts of all tokens that qualify for the synonym lookup.

    A token qualifies when its UPOS tag (``token.pos_``) is listed in
    ``pos_tags``, it is not flagged as a hashtag, it is not flagged as
    non-alphabetic, and it does not look like an e-mail address, a URL or a
    currency symbol. Stop words are not filtered out.

    Returns:
        The ``text`` of every qualifying token, in document order.
    """
    def is_candidate(token):
        # Checks are evaluated in order and stop at the first one that fails.
        return (
            token.pos_ in pos_tags
            and token._.hashtag is not True
            and token.is_alpha is not False
            and not token.like_email
            and not token.like_url
            and not token.is_currency
        )

    # Open question: lemmatize the tokens here (token.lemma_) or leave that to
    # Elasticsearch? Lemmas would mainly matter for the embedding lookup.
    return [token.text for token in doc if is_candidate(token)]

In [13]:
# UPOS tags (token.pos_) a token must carry to be kept as a synonym candidate
pos_tags = ["VERB", "NOUN", "PROPN", "ADJ"]
candidate_terms = select_candidate_terms(doc, pos_tags)

# candidate_terms holds plain strings (the token texts), not spaCy tokens
print(candidate_terms)

['gro√üe', 'Koalition', 'Merkel']


---
# 2. Word Embeddings
The following embeddings are applied to the selected terms
- FastText
- Word2Vec

In [14]:
# number of most similar words (synonyms) 
NUM_SIM_TERMS = 3

## 2.1 FastText

### Load FastText model with **SpaCy**
If a large German pipeline is loaded, word vectors are already available. Otherwise, a model can be loaded manually.

In [11]:
# To load the German FastText model run the following command
# !python -m spacy init vectors de ../data/fasttext/cc.de.300.zip ../models/de-fasttext-10000 --prune 10000

In [15]:
nlp_ft = spacy.load("../models/de-fasttext-10000/")

In [16]:
# Process four single-word texts with the FastText-backed pipeline.
doc1, doc2, doc3, doc4 = (nlp_ft(word) for word in ("Haus", "Katze", "Hund", "Bude"))

# Print the similarity of selected pairs as returned by Doc.similarity.
for label, left, right in (
    ("Haus | Katze", doc1, doc2),
    ("Katze | Hund", doc2, doc3),
    ("Haus | Bude", doc1, doc4),
):
    print(f"{label} -> {left.similarity(right)}")

Haus | Katze -> 0.367066624402335
Katze | Hund -> 0.7219703586509103
Haus | Bude -> 0.6441379922375441


In [None]:
# make sure to free up space afterwards
del nlp_ft

Since SpaCy does not offer a way to find the most similar terms for a given term, I turn to Gensim and FastText and load the word vectors there.

---
### Load FastText model with **Gensim**

In [None]:
from gensim.models.fasttext import load_facebook_vectors, load_facebook_model

# model too big to load
gensim_ft_model = load_facebook_vectors("../data/fasttext/cc.de.300.bin")

Find most similar terms for a given term:

In [49]:
gensim_ft_model.most_similar("merkel")

[('merkels', 0.7103838920593262),
 ('gauck', 0.6791979670524597),
 ('sch√§uble', 0.6752827167510986),
 ('Merkel', 0.6560235023498535),
 ('kanzlerin', 0.6355306506156921),
 ('bundeskanzlerin', 0.626380980014801),
 ('westerwelle', 0.6188872456550598),
 ('steinbr√ºck', 0.6188532710075378),
 ('cdu', 0.6108233332633972),
 ('steinmeier', 0.6054732203483582)]

In [None]:
del gensim_ft_model

Loading a FastText model via Gensim takes an eternity. Also, out-of-vocabulary words aren't handled.
Thus, this approach is dropped in favor of Facebook's FastText Python module.

---
### Load FastText model with **FastText**

In [13]:
import fasttext

ft_model = fasttext.load_model('../data/fasttext/cc.de.300.bin')



In [14]:
# Map every candidate term to its NUM_SIM_TERMS nearest FastText neighbours.
ft_synonyms = {}

# candidate_terms contains plain strings (select_candidate_terms appends
# token.text), so each term is used directly; calling `.text` on it would
# raise an AttributeError.
for term in candidate_terms:
    synonyms = ft_model.get_nearest_neighbors(term, k=NUM_SIM_TERMS)
    # Keep only the second item of each neighbour entry (the word).
    ft_synonyms[term] = [n[1] for n in synonyms]

print(ft_synonyms)

{'gro√üe': ['gr√∂√üere', 'grosse', 'riesengro√üe'], 'Koalition': ['Regierungskoalition', 'Koalitionsrunde', 'Koalitionspartei'], 'Merkel': ['Kanzlerin', 'Merkels', 'Bundeskanzlerin']}


In [15]:
del ft_model

The FastText module gives pretty good results. Even out-of-vocabulary words are handled well, as expected.


---
## 2.2 Word2Vec


### Load Word2Vec model via **Gensim**

In [16]:
from gensim.models import KeyedVectors

gensim_w2v_model = KeyedVectors.load_word2vec_format(fname="../data/devmount/german.model", no_header=False, binary=True)

In [23]:
# Map every candidate term known to the Word2Vec model to its most similar words.
w2v_synonyms = {}

# candidate_terms contains plain strings (select_candidate_terms appends
# token.text), so each term is used directly; calling `.text` on it would
# raise an AttributeError.
for term in candidate_terms:

    # The lookup is exact, so terms missing from the vocabulary are reported and skipped.
    if not gensim_w2v_model.has_index_for(term):
        print(f"The word '{term}' does not appear in this model")
        continue

    # Request exactly NUM_SIM_TERMS neighbours instead of slicing the default
    # result list, which would silently cap the number of synonyms.
    synonyms = gensim_w2v_model.most_similar(term, topn=NUM_SIM_TERMS)
    # Keep only the first item of each neighbour entry (the word).
    w2v_synonyms[term] = [n[0] for n in synonyms]


print(w2v_synonyms)

The word 'gro√üe' does not appear in this model
{'Koalition': ['Grosse_Koalition', 'Grossen_Koalition', 'Regierungskoalition'], 'Merkel': ['Kanzlerin_Merkel', 'Merkel_CDU', 'Bundeskanzlerin']}


In [None]:
del gensim_w2v_model

The model seems to work properly. However, it is case-sensitive and may require lemmatizing the terms; otherwise the model can't find the correct word vector.

---
# 3. Elastic Search

Finally, the reformulated query is used to retrieve Tweets from the Elastic Search index.

## 3.1 Data Preparation
Obtain a list of Hashtags and Entities that are included in the query.

In [14]:
# Texts of all tokens that the pipeline flagged as hashtags.
hashtags = [token.text for token in doc if token._.hashtag]

pd.DataFrame(hashtags, columns=["Hashtag"])

Unnamed: 0,Hashtag
0,GroKo
1,CDU


In [15]:
# Texts of all named entities recognised in the query.
entities = [entity.text for entity in doc.ents]

pd.DataFrame(entities, columns=["Entity"])

Unnamed: 0,Entity
0,Merkel


In [37]:
# for test purposes use a predefined list of synonyms
ft_synonyms = {'gro√üe': ['gr√∂√üere', 'grosse', 'riesengro√üe'], 'Koalition': ['Regierungskoalition', 'Koalitionsrunde', 'Koalitionspartei'], 'Merkel': ['Kanzlerin', 'Merkels', 'Bundeskanzlerin']}

---
## 3.2 Term Replacement
Now, it must be determined which of the terms of the initial query should be replaced.

In [17]:
# The first query is the unchanged list of candidate terms. candidate_terms
# contains plain strings, so the terms are used directly; calling `.text` on
# them would raise an AttributeError.
ft_queries = [list(candidate_terms)]

# Every further query swaps one term for one of its synonyms: the synonym
# comes first, followed by the remaining original terms.
for term in candidate_terms:
    remaining_terms = [other for other in candidate_terms if other != term]
    # Terms without an entry in ft_synonyms simply produce no variants.
    for synonym in ft_synonyms.get(term, []):
        ft_queries.append([synonym] + remaining_terms)

ft_queries

[['gro√üe', 'Koalition', 'Merkel'],
 ['gr√∂√üere', 'Koalition', 'Merkel'],
 ['grosse', 'Koalition', 'Merkel'],
 ['riesengro√üe', 'Koalition', 'Merkel'],
 ['Regierungskoalition', 'gro√üe', 'Merkel'],
 ['Koalitionsrunde', 'gro√üe', 'Merkel'],
 ['Koalitionspartei', 'gro√üe', 'Merkel'],
 ['Kanzlerin', 'gro√üe', 'Koalition'],
 ['Merkels', 'gro√üe', 'Koalition'],
 ['Bundeskanzlerin', 'gro√üe', 'Koalition']]

---
## 3.3 Query Formulation 
Finally, the resulting terms must be arranged in an Elastic Search query. Define a pattern to retrieve relevant tweets.

In [16]:
from src.utils import es_connect

import json
import configparser

config = configparser.ConfigParser()
config.read('auth/es-credentials.ini')

es_client = es_connect(credentials=config["ELASTIC"])

Connecting to Elastic Search...
Successfully connected to https://localhost:9200


Develop a pattern for an Elastic Search query with
- boolean operators (`AND`, `OR`)
- boosting  `^`
- filter

The following Hyperparameters are set in order to modify the query:

In [28]:
# Name of the Elastic Search index 
INDEX = "tweets_35"

# Number of tweets displayed as result
SIZE = 10

# Are retweets allowed?
RETWEET = False

# How much is the matching of hashtags boosted? 
HASHTAG_BOOST = 1

# How much boosting for a simple match in the text?
TEXT_BOOST = 1

# Range of Tweets to be included (FROM, TO)
TWEET_RANGE = ("2021-01-01", "2023-01-01")

In [32]:
# Load the pre-configured template for an elastic search query.
# The context manager closes the file handle again, and the encoding is set
# explicitly so the result does not depend on the platform default.
with open('config/es-query.conf', encoding='utf-8') as template_file:
    query = json.load(template_file)['query']
query

{'bool': {'should': {'query_string': {'query': 'philipp CDU',
    'fields': ['txt', 'hashtags']}},
  'must': {'match': {'hashtags': ''}},
  'must_not': {'term': {'txt': '_retweet_'}},
  'filter': [{'range': {'created_at': {'gte': '2021-01-01'}}},
   {'range': {'created_at': {'lte': '2023-01-01'}}}]}}

### Manipulate Query 

In [33]:
# set if retweets are allowed
if RETWEET:
    # Drop the whole must_not clause. Deleting only its inner 'term' entry
    # would leave an empty `must_not: {}` object in the query, which
    # Elasticsearch may reject as a malformed (empty) clause. This mirrors how
    # the 'must' clause is removed for queries without hashtags. pop() with a
    # default keeps the cell safe to re-run.
    query['bool'].pop('must_not', None)

query

{'bool': {'should': {'query_string': {'query': 'philipp CDU',
    'fields': ['txt', 'hashtags']}},
  'must': {'match': {'hashtags': ''}},
  'must_not': {'term': {'txt': '_retweet_'}},
  'filter': [{'range': {'created_at': {'gte': '2021-01-01'}}},
   {'range': {'created_at': {'lte': '2023-01-01'}}}]}}

In [34]:
# TODO: for testing I use candidate terms which are not yet applied to word embeddings
query['bool']['should']['query_string']['query'] = ' '.join(candidate_terms)
query

{'bool': {'should': {'query_string': {'query': 'gro√üe Koalition Merkel',
    'fields': ['txt', 'hashtags']}},
  'must': {'match': {'hashtags': ''}},
  'must_not': {'term': {'txt': '_retweet_'}},
  'filter': [{'range': {'created_at': {'gte': '2021-01-01'}}},
   {'range': {'created_at': {'lte': '2023-01-01'}}}]}}

In [35]:
# Require the hashtags of the user query if there are any; without hashtags
# the must clause of the template is removed.
if not hashtags:
    del query['bool']['must']
else:
    query['bool']['must']['match']['hashtags'] = ' '.join('#' + tag for tag in hashtags)

query

{'bool': {'should': {'query_string': {'query': 'gro√üe Koalition Merkel',
    'fields': ['txt', 'hashtags']}},
  'must': {'match': {'hashtags': '#GroKo #CDU'}},
  'must_not': {'term': {'txt': '_retweet_'}},
  'filter': [{'range': {'created_at': {'gte': '2021-01-01'}}},
   {'range': {'created_at': {'lte': '2023-01-01'}}}]}}

In [36]:
# Attach the boost factors to the searched fields (first field: text, second
# field: hashtags). Any boost that is already present is stripped first, so
# re-running this cell does not pile up suffixes such as "txt^1^1".
boost_fields = query['bool']['should']['query_string']["fields"]
boost_fields[0] = f"{boost_fields[0].split('^')[0]}^{TEXT_BOOST}"
boost_fields[1] = f"{boost_fields[1].split('^')[0]}^{HASHTAG_BOOST}"
query

{'bool': {'should': {'query_string': {'query': 'gro√üe Koalition Merkel',
    'fields': ['txt^1', 'hashtags^1']}},
  'must': {'match': {'hashtags': '#GroKo #CDU'}},
  'must_not': {'term': {'txt': '_retweet_'}},
  'filter': [{'range': {'created_at': {'gte': '2021-01-01'}}},
   {'range': {'created_at': {'lte': '2023-01-01'}}}]}}

In [25]:
# TODO
for entity in entities:
    ...

In [37]:
# Apply the configured date range: the first filter entry carries the lower
# bound (gte), the second one the upper bound (lte).
date_filters = query['bool']['filter']
date_filters[0]['range']['created_at']['gte'] = TWEET_RANGE[0]
date_filters[1]['range']['created_at']['lte'] = TWEET_RANGE[1]
query

{'bool': {'should': {'query_string': {'query': 'gro√üe Koalition Merkel',
    'fields': ['txt^1', 'hashtags^1']}},
  'must': {'match': {'hashtags': '#GroKo #CDU'}},
  'must_not': {'term': {'txt': '_retweet_'}},
  'filter': [{'range': {'created_at': {'gte': '2021-01-01'}}},
   {'range': {'created_at': {'lte': '2023-01-01'}}}]}}

### Execute the final Query

In [38]:
# Send the assembled query to the index and request at most SIZE tweets.
res = es_client.search(index=INDEX, size=SIZE, query=query)

print("Total Hits:", res["hits"]["total"]["value"], "\n")

# The loop variable is called `hit` on purpose: naming it `doc` would
# overwrite the spaCy Doc that earlier cells read hashtags and entities from.
for i, hit in enumerate(res["hits"]["hits"]):
    print("Tweet", i, "\n", hit["_source"], "\n")


Total Hits: 1070 

Tweet 0 
 {'retweet_count': 0, 'reply_count': 0, 'like_count': 3, 'created_at': '2021-10-09T11:34:54+02:00', 'txt': 'Gr√ºne u rote Jugend wissen was sie wollen. Eine üö•-Koalition oder eine Bundesregierung, die gemeinsam gestaltet, geh√∂ren wohl nicht dazu. Ach, bleibt doch bei der #Groko, dann k√∂nnt ihr weiter nach Herzenslust gegen was auch immer protestieren.', 'hashtags': ['groko'], 'word_count': 37} 

Tweet 1 
 {'retweet_count': 11, 'reply_count': 2, 'like_count': 109, 'created_at': '2021-09-11T15:40:52+02:00', 'txt': 'Die #Ehefueralle /  Gesetzentwurf hatte ich als Vorsitzende Rechtsausschuss 29 x i d vorletzten WP auf die TO gesetzt. 29x vertagt durch #GroKo. Bis klar war, dass es scharfes  Wahlkampfthema/ Bedingung wird, Merkel es frei geben wollte nach der Wahl. Zack, wir waren schneller.', 'hashtags': ['groko', 'ehefueralle'], 'word_count': 46} 

Tweet 2 
 {'retweet_count': 3, 'reply_count': 3, 'like_count': 68, 'created_at': '2021-09-12T20:34:38+02:00', 