# Twitter Query Reformulation
...

## 1 - SpaCy TextProcessingPipeline 

In [1]:
# download the respective SpaCy model
# !python -m spacy download de_core_news_sm

In [2]:
from pipeline.text_processing import TextProcessingPipeline

# Name of the spaCy model handed to the pipeline (the download command is in the cell above).
MODEL = "de_core_news_sm"
# Single pipeline instance shared by the rest of the notebook; later cells call its
# invoke, get_filtered_tokens and trim_symbols methods.
pipe = TextProcessingPipeline(model=MODEL)

In [3]:
from pipeline.utils import load_queries

QUERIES = load_queries()

# run every loaded query through the spaCy pipeline, keeping the input order
docs = [pipe.invoke(query) for query in QUERIES]

In [14]:
# settings that decide which tokens of a query are selected for expansion
EMBEDDING_PARAMS = dict(
    pos_list=["NOUN", "ADJ", "VERB"],
    entity=True,
    hashtag=False,
    user=False,
    N=3,
)

# one filtered token list per processed query, in the same order as docs
query_tokens = [pipe.get_filtered_tokens(doc, EMBEDDING_PARAMS) for doc in docs]

query_tokens

[[@amthor, Ist, große, Koalition, gescheitert, #Groko],
 [#Lauterbach, Held, Virus]]

---
## 2 - Word Embedding 

In [5]:
# Embedding backend selector. The loading cell accepts "WORD2VEC" or "FASTTEXT"
# and raises ValueError for any other value.
EMBEDDING = "WORD2VEC"  # alternative: "FASTTEXT"

# Model file paths, one per backend; only the one matching EMBEDDING is loaded.
W2V_MODEL = 'data/word2vec/german.model'
FT_MODEL = 'data/fasttext/cc.de.300.bin'

In [6]:
from pipeline.embedding import Word2Vec
from pipeline.embedding import FastText

# load the embedding model selected by EMBEDDING
if EMBEDDING == "WORD2VEC":
    model = Word2Vec(W2V_MODEL)

elif EMBEDDING == "FASTTEXT":
    model = FastText(FT_MODEL)

else:
    raise ValueError("Invalid Embedding")


# For each query, look up similar terms for its symbol-trimmed tokens.
# The term count is read from EMBEDDING_PARAMS: the previously used name PARAMS
# is never defined in this notebook and raised NameError on a fresh kernel.
similar_terms = []

for tokens in query_tokens:
    similar_terms.append(model.get_similar_terms(pipe.trim_symbols(tokens), EMBEDDING_PARAMS["N"]))

# drop the reference to the embedding model once all lookups are done
del model

In [7]:
similar_terms

[{'Ist': ['War', 'Waere', 'Sind'],
  'Koalition': ['Grosse Koalition',
   'Grossen Koalition',
   'Regierungskoalition'],
  'gescheitert': ['scheitert', 'Gescheitert', 'scheitern']},
 {'Lauterbach': ['Alsfeld', 'Buedingen', 'Hardt'],
  'Held': ['Traeumer', 'Supermann', 'Protagonist'],
  'Virus': ['Erreger', 'Viren', 'Infektion']}]

---
## 3 - Elasticsearch

In [8]:
from pipeline.elasticsearch import ElasticsearchClient
import configparser

INDEX = "tweets"
ES_CREDENTIALS_PATH = 'auth/es-credentials.ini'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list of
# files it actually parsed. Fail here with a clear message instead of letting a
# missing file surface later as KeyError: 'ELASTIC'.
if not config.read(ES_CREDENTIALS_PATH):
    raise FileNotFoundError(
        f"Elasticsearch credentials file not found or unreadable: {ES_CREDENTIALS_PATH}"
    )

# the [ELASTIC] section supplies the client settings; its PWD entry is the password
# handed to connect()
es_client = ElasticsearchClient(credentials=config["ELASTIC"], index=INDEX)
es_client.connect(config["ELASTIC"]["PWD"])

Successfully connected to https://localhost:9200


In [9]:
# one Elasticsearch co-occurrence lookup per query, fed with that query's similar terms
co_occurrences = [es_client.get_co_occurring_terms(terms) for terms in similar_terms]

In [10]:
from pipeline.utils import get_expansion_terms

ALPHA = 0.1
query_terms = []

# final term list per query: the symbol-trimmed original tokens followed by the
# expansion terms chosen from the similar terms and their co-occurrences
for idx in range(len(QUERIES)):
    added_terms = get_expansion_terms(
        pipe.trim_symbols(query_tokens[idx]),
        similar_terms[idx],
        co_occurrences[idx],
        ALPHA,
    )
    original_terms = pipe.trim_symbols(query_tokens[idx])
    query_terms.append(original_terms + added_terms)

query_terms

[['amthor',
  'Ist',
  'große',
  'Koalition',
  'gescheitert',
  'Groko',
  'Gescheitert'],
 ['Lauterbach', 'Held', 'Virus', 'Infektion']]

TODO:
- Check the co-occurrences once: if the co-occurrence count is high, the term is a good expansion.
- If a term occurs frequently on its own, it is also a good expansion.

In [13]:
parameters = []

# assemble one search-parameter dict per query
for i in range(len(QUERIES)):
    current_tokens = query_tokens[i]

    params = {
        # fixed search settings shared by all queries
        "retweet": False,
        "hashtag_boost": 0.5,
        "tweet_range": ("2021-01-01", "2023-01-01"),
        # original + expansion terms of this query
        "terms": query_terms[i],
        # hashtags are trimmed of their symbols and lower-cased
        "hashtags": [
            tag.lower()
            for tag in pipe.trim_symbols([tok for tok in current_tokens if tok._.is_hashtag])
        ],
        "users": pipe.trim_symbols([tok for tok in current_tokens if tok._.is_user]),
        "entities": pipe.trim_symbols([tok for tok in current_tokens if tok.ent_type_]),
    }

    parameters.append(params)

# run the search for the first query only
es_client.get_tweets(parameters[0])

{'size': 10, 'query': {'bool': {'should': [{'match': {'txt': {'query': 'amthor Ist große Koalition gescheitert Groko Gescheitert', 'operator': 'OR'}}}, {'terms': {'hashtags': ['groko'], 'boost': 0.5}}], 'must': {'terms_set': {'hashtags': {'terms': ['groko'], 'minimum_should_match_script': {'source': 'Math.min(params.num_terms, 1)'}}}}, 'must_not': {'term': {'txt': '_retweet_'}}, 'filter': [{'range': {'created_at': {'gte': '2021-01-01'}}}, {'range': {'created_at': {'lte': '2023-01-01'}}}]}}, 'aggs': {'sample': {'sampler': {'shard_size': 500}, 'aggs': {'keywords': {'significant_terms': {'field': 'hashtags'}}}}}, 'collapse': {}, 'sort': {}}


{'hits': 353,
 'took': 101,
 'tweets': [{'_index': 'tweets',
   '_id': '1435174959677124609',
   '_score': 19.941412,
   '_source': {'retweet_count': 27,
    'reply_count': 21,
    'like_count': 291,
    'created_at': '2021-09-07T11:35:49+02:00',
    'txt': '#Laschet hält im Bundestag eine Rede darüber, wie großartig die Große Koalition ist. #Groko Mehr muss man eigentlich nicht hören. #btw21',
    'hashtags': ['laschet', 'btw21', 'groko'],
    'word_count': 21}},
  {'_index': 'tweets',
   '_id': '1441036904540033031',
   '_score': 19.691784,
   '_source': {'retweet_count': 0,
    'reply_count': 1,
    'like_count': 6,
    'created_at': '2021-09-23T15:49:05+02:00',
    'txt': '@calvinamfreitag Scholz hat eine Groko nie final ausgeschlossen. Darum geht es auch nicht. Ausgeschlossen habt ihr die #Groko schon beim letzten Mal und was ist daraus geworden...? Aus Berlin hört man, dass viele aus der Bundestagsfraktion im Zweifel auch wieder mit der Union koalieren würden.',
    'hashtags': [