# Twitter Query Reformulation
...

## 1 - spaCy Pipeline

In [None]:
# Download the spaCy model "de_core_news_sm" (one-time setup; the same model
# name is passed to the Pipeline in the next cell).
# NOTE(review): `!python` runs whichever `python` is first on PATH, which is
# not necessarily the interpreter backing this kernel — confirm the model is
# installed into the kernel's environment.
!python -m spacy download de_core_news_sm

In [1]:
from pipeline.pipeline import Pipeline

# Name of the spaCy model handed to the pipeline; it must match the model
# downloaded in the setup cell above.
MODEL = "de_core_news_sm"
# Shared pipeline instance used by the following cells to process queries and
# to filter their tokens.
pipe = Pipeline(model=MODEL)

In [2]:
# Raw tweet-style queries to reformulate.
QUERIES = [
    "@amthor Ist die große Koalition gescheitert unter Merkel? #Groko#SPD #CDU",
]

# Run every query through the pipeline; DOCS[i] is the result for QUERIES[i].
DOCS = [pipe.invoke(raw_query) for raw_query in QUERIES]

In [3]:
# Selection criteria passed to pipe.get_filtered_tokens to decide which terms
# are expanded.
# NOTE(review): the meaning of each key is defined by the Pipeline
# implementation — the descriptions below are presumed from the key names.
PARAMS = {
    "pos_list": ["NOUN","ADJ","VERB"],  # presumably the part-of-speech tags to keep
    "entity": False,                    # presumably: include named entities
    "hashtag": True,                    # presumably: include hashtags
    "user": False,                      # presumably: include @user mentions
    "N": 3                              # passed to get_similar_terms in the embedding cells
}

# Filter each processed query with PARAMS; TOKENS[i] holds the tokens selected
# from DOCS[i].
TOKENS = [pipe.get_filtered_tokens(processed, PARAMS) for processed in DOCS]

## 2 - Word Embedding 

In [None]:
from pipeline.embedding import Word2Vec

WORD2VEC_MODEL = 'data/word2vec/german.model'

# Instantiate the Word2Vec wrapper with the model path.
w2v = Word2Vec(WORD2VEC_MODEL)

# One result per query: look up similar terms for the text of every filtered
# token, passing PARAMS["N"] as the second argument.
w2v_similar_terms = [
    w2v.get_similar_terms([tok.text for tok in query_tokens], PARAMS["N"])
    for query_tokens in TOKENS
]

# Display the collected results.
w2v_similar_terms

In [None]:
# Free space: drop the reference to the Word2Vec wrapper so its memory can be
# reclaimed. The results in w2v_similar_terms are unaffected by this.
del w2v

In [None]:
from pipeline.embedding import FastText

FT_MODEL = 'data/fasttext/cc.de.300.bin'

# Instantiate the FastText wrapper with the model path.
ft = FastText(FT_MODEL)

# One result per query, mirroring the structure of the Word2Vec cell.
# The previous version re-assigned ft_similar_terms inside the loop, which
# discarded the list and kept only the result for the last query.
ft_similar_terms = [
    ft.get_similar_terms([tok.text for tok in query_tokens], PARAMS["N"])
    for query_tokens in TOKENS
]

# Display the collected results.
ft_similar_terms

In [5]:
# Free space: drop the reference to the FastText wrapper so its memory can be
# reclaimed. The results in ft_similar_terms are unaffected by this.
del ft

## 3 - Elasticsearch

In [6]:
from pipeline.elasticsearch import ElasticsearchClient
import configparser

# Location of the INI file holding the Elasticsearch credentials; it must
# contain an [ELASTIC] section.
ES_CREDENTIALS_FILE = 'auth/es-credentials.ini'

config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed. Fail here with a clear message instead of with
# an opaque KeyError on config["ELASTIC"] below.
if not config.read(ES_CREDENTIALS_FILE):
    raise FileNotFoundError(
        "Could not read Elasticsearch credentials file: {!r}".format(ES_CREDENTIALS_FILE)
    )

# Client for the "tweets" index, configured from the [ELASTIC] section.
es_client = ElasticsearchClient(credentials=config["ELASTIC"], index="tweets")

In [7]:
# Connect the client, passing the PWD entry of the [ELASTIC] section from the
# loaded credentials config, so the password is not hardcoded in the notebook.
es_client.connect(config["ELASTIC"]["PWD"])

Successfully connected to https://localhost:9200


In [None]:
# Look up co-occurring terms for the Word2Vec expansions, using the
# configuration file 'config/es-adjacency-matrix.conf'.
# NOTE: this depends on w2v_similar_terms, so the Word2Vec cell in section 2
# must have been executed first. The FastText results are not used here.
# NOTE(review): judging by the file name, this presumably runs an Elasticsearch
# adjacency-matrix aggregation — confirm in ElasticsearchClient.
es_client.get_cooccurring_terms('config/es-adjacency-matrix.conf', w2v_similar_terms)