# Elastic Search Playground
First, load the credentials needed to connect the Elasticsearch client and the PostgreSQL database.

In [2]:
import os

# Workaround: change the working directory to the root of the project, because
# the paths used further down are written relative to it.
# The target is resolved to an absolute path only once and then remembered, so
# re-running this cell does not climb one directory higher on every execution.
if "PROJECT_ROOT" not in globals():
    PROJECT_ROOT = os.path.abspath("../")
os.chdir(PROJECT_ROOT)
print(os.getcwd())

/home/jsonpy/Projects/Practical/twitter-query-expansion


In [6]:
import psycopg2
import sys
import json

from pipeline.src.utils import es_connect
from elasticsearch import Elasticsearch
import configparser

# Location of the credentials file, relative to the current working directory.
CREDENTIALS_FILE = 'auth/es-credentials.ini'

config = configparser.ConfigParser()
# ConfigParser.read() skips files it cannot open and returns the list of files
# it actually parsed, so an empty list means the credentials were not loaded.
loaded_files = config.read(CREDENTIALS_FILE)
if not loaded_files:
    raise FileNotFoundError(
        f"Could not read credentials file '{CREDENTIALS_FILE}'; "
        "check that the working directory is the project root."
    )
loaded_files

['auth/es-credentials.ini']

Check whether the Elasticsearch instance is running by using the Elasticsearch Python library.

In [7]:
# Build the client from the ELASTIC section of the loaded credentials, then
# request the cluster info to confirm that the instance answers.
elastic_credentials = config['ELASTIC']
es_client = es_connect(credentials=elastic_credentials)
es_client.info()

Connecting to Elastic Search...
Successfully connected to https://localhost:9200


ObjectApiResponse({'name': 'f6240d32ea65', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'YIiFu2p-QOWJhSPb-Zcavw', 'version': {'number': '8.5.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': 'a846182fa16b4ebfcc89aa3c11a11fd5adf3de04', 'build_date': '2022-11-17T18:56:17.538630285Z', 'build_snapshot': False, 'lucene_version': '9.4.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

---
## Create or Delete Index

In [6]:
# Create an empty scratch index named "test".
# The server rejects a create request for an index that already exists, so the
# index is only created when it is missing; this keeps the cell re-runnable.
if es_client.indices.exists(index="test"):
    print('Index "test" already exists, skipping creation.')
else:
    print(es_client.indices.create(index="test"))

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'test'})

In [7]:
# Remove the scratch index again. ignore_unavailable=True lets the request
# succeed when the index is already gone, so the cell can be re-run safely.
es_client.indices.delete(index="test", ignore_unavailable=True)

ObjectApiResponse({'acknowledged': True})

---
## Using Analyzer and Settings

In [8]:
# Load the index configuration. The context manager closes the file handle,
# which a bare open() call passed straight to json.load() would leave open.
with open('config/es-config.conf') as config_file:
    es_config = json.load(config_file)

In [None]:
# Create the "test" index using the "settings" and "mappings" sections of the
# loaded index configuration.
index_settings = es_config["settings"]
index_mappings = es_config["mappings"]
es_client.indices.create(
    index="test",
    settings=index_settings,
    mappings=index_mappings,
)

In [55]:
# Sample tweet text for the analyzer: it mixes a retweet marker, a mention,
# numbers, a currency amount, quoted and HTML-tagged words, chained hashtags,
# a time of day and several inflected German words.
txt = (
    " RT @PeterPan 500 Millionen und 17,30€ 'bin' <b>Merkel's</b> als "
    "#Leben#SPD_VM #liebte#um 19:30 Uhr Millionen  möchte liebten liebte lieb"
)

In [56]:
# Run the sample text through the "tweet_analyzer" analyzer of the "tweets_kw"
# index, then print the number of emitted tokens followed by each token.
res = es_client.indices.analyze(index="tweets_kw", analyzer="tweet_analyzer", text=txt)
tokens = res["tokens"]
print(len(tokens))
for token_info in tokens:
    print(token_info["token"])

148
_re
_ret
_retw
_retwe
_retwee
_retweet
ret
retw
retwe
retwee
retweet
retweet_
etw
etwe
etwee
etweet
etweet_
twe
twee
tweet
tweet_
wee
weet
weet_
eet
eet_
et_
_retweet_
_us
_use
_user
_user_
_user_p
_user_pe
use
user
user_
user_p
user_pe
user_pet
ser
ser_
ser_p
ser_pe
ser_pet
ser_pete
er_
er_p
er_pe
er_pet
er_pete
er_peter
r_p
r_pe
r_pet
r_pete
r_peter
r_peterp
_pe
_pet
_pete
_peter
_peterp
_peterpa
pet
pete
peter
peterp
peterpa
peterpan
ete
eter
eterp
eterpa
eterpan
ter
terp
terpa
terpan
erp
erpa
erpan
rpa
rpan
pan
_user_peterpan
500
mil
mill
milli
millio
million
ill
illi
illio
illion
lli
llio
llion
lio
lion
ion
one
onen
nen
17
30€
mer
merk
merkel
erk
erkel
rke
rkel
kel
leb
ebe
eben
ben
spd
spd_
spd_v
spd_vm
pd_
pd_v
pd_vm
d_v
d_vm
_vm
lie
lieb
liebt
ieb
iebt
ebt
bte
19
30
uhr
moc
moch
mocht
och
ocht
cht
hte
bten
ten


Now we want to feed data from the Twitter PostgreSQL database into Elasticsearch. To do so, use the script provided in the `src` folder.

---
## Search data

In [52]:
# Load the example query. The context manager closes the file handle, which a
# bare open() call passed straight to json.load() would leave open.
with open('config/example-query.conf') as query_file:
    es_query = json.load(query_file)
es_query

{'size': 10,
 'query': {'bool': {'should': {'match': {'txt': {'query': 'große Koalition gescheitert Merkel',
      'operator': 'OR'}}},
   'must': {'terms_set': {'hashtags': {'terms': ['cdu', 'groko'],
      'minimum_should_match_script': {'source': 'Math.min(params.num_terms, 1)'}}}},
   'must_not': {'term': {'txt': '_retweet_'}}}},
 'aggs': {'sample': {'sampler': {'shard_size': 500},
   'aggs': {'keywords': {'significant_terms': {'field': 'hashtags'}}}}},
 'collapse': {},
 'sort': {}}

In [53]:
# Run the loaded query and aggregation against the "tweets_kw" index, asking
# for five hits.
res = es_client.search(
    index="tweets_kw",
    size=5,
    query=es_query["query"],
    aggregations=es_query["aggs"],
)

# Report the total match count, then show the returned documents.
hits = res["hits"]
print("Total Hits:", hits["total"]["value"])
hits["hits"]

Total Hits: 1555


[{'_index': 'tweets_kw',
  '_id': '1433501203418259460',
  '_score': 20.296646,
  '_source': {'retweet_count': 28,
   'reply_count': 7,
   'like_count': 191,
   'created_at': '2021-09-02T20:44:54+02:00',
   'txt': 'Das Versagen der #GroKo #cdu #spd in einem Tweet \n👎🏼Parteitaktik über alles \n👎🏼in 4 J. keine wirkliche Reform hinbekommen\n👎🏼 Oppositionsvorschl. wie immer abgelehnt \n\n👎🏼👎🏼 Konsequenz: evtl über 900 MdB inkl. riesiger Kosten &amp; Chaos https://t.co/K9s1T8dVH5',
   'hashtags': ['spd', 'cdu', 'groko'],
   'word_count': 35}},
 {'_index': 'tweets_kw',
  '_id': '1431971185521217542',
  '_score': 19.17094,
  '_source': {'retweet_count': 42,
   'reply_count': 17,
   'like_count': 350,
   'created_at': '2021-08-29T15:25:10+02:00',
   'txt': 'Von den letzten 16 Jahren hat die #SPD 12 Jahre mit der #CDU regiert. Die #SPD hat Scholz mit großem Getöse nicht zum Parteivorsitzenden gewählt,mit dem Argument,er stünde für die #GroKo Jetzt ist er Kanzlerkandidat und kokettiert offen dam

---
## Data Aggregation

In [38]:
# Reload the example query from disk. The context manager closes the file
# handle, which a bare open() call passed to json.load() would leave open.
with open('config/example-query.conf') as query_file:
    es_query = json.load(query_file)
es_query

{'size': 10,
 'query': {'bool': {'should': {'query_string': {'query': 'corona',
     'fields': ['hashtags', 'txt'],
     'default_operator': 'OR'}},
   'must_not': {'term': {'txt': '_retweet_'}}}},
 'aggs': {'sample': {'sampler': {'shard_size': 500},
   'aggs': {'keywords': {'significant_terms': {'field': 'hashtags'}}}}},
 'collapse': {},
 'sort': {}}

In [39]:
# Run the loaded query together with its aggregation against "tweets_kw".
res = es_client.search(index="tweets_kw", query=es_query["query"], aggregations=es_query["aggs"])

# The buckets come from the "keywords" aggregation nested under "sample".
# They are aggregation results, not a hit count, so they are labelled as such.
keyword_buckets = res["aggregations"]["sample"]["keywords"]["buckets"]
print("Keyword buckets:", keyword_buckets)

Total Hits: [{'key': 'corona', 'doc_count': 534, 'score': 8.000855488631489, 'bg_count': 2331}, {'key': 'coronaschutzimpfung', 'doc_count': 50, 'score': 0.9772123602892833, 'bg_count': 169}, {'key': 'coronavirus', 'doc_count': 48, 'score': 0.7320431844660195, 'bg_count': 206}, {'key': 'impfung', 'doc_count': 64, 'score': 0.4892907342627798, 'bg_count': 526}, {'key': 'coronakrise', 'doc_count': 16, 'score': 0.32564294017094014, 'bg_count': 52}, {'key': 'diesmalnpd', 'doc_count': 9, 'score': 0.31949082352941177, 'bg_count': 17}, {'key': 'coronapolitik', 'doc_count': 6, 'score': 0.30340800000000007, 'bg_count': 8}, {'key': 'coronapk', 'doc_count': 4, 'score': 0.2705848888888889, 'bg_count': 4}, {'key': 'füreuchgemeinsamstark', 'doc_count': 8, 'score': 0.23755693827160493, 'bg_count': 18}, {'key': 'delta', 'doc_count': 8, 'score': 0.22477323976608188, 'bg_count': 19}]
