In [7]:
import copy
import json
import logging
import os

import requests
import urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

In [8]:
# Elasticsearch connection settings shared by every request in this notebook.
# Values are taken from the environment when present so that credentials do
# not have to live in the notebook; the fallbacks keep the previous local
# development defaults.
ES_HOST = os.environ.get("ES_HOST", "https://127.0.0.1:9200")
auth = (
    os.environ.get("ES_USERNAME", "elastic"),
    os.environ.get("ES_PASSWORD", "elastic"),
)
headers = {'Content-Type': 'application/json'}

In [9]:
def scroll_fetch_all(index, page_size=1000, scroll_ttl="2m"):
    """Fetch every document of an index through the scroll API.

    Args:
        index: Name of the index to read.
        page_size: Number of hits requested per scroll page.
        scroll_ttl: Keep-alive sent with every scroll request (e.g. ``"2m"``).

    Returns:
        dict mapping each hit's ``_id`` to its ``_source``.

    Raises:
        requests.HTTPError: If the search or a scroll request returns an
            HTTP error status.
    """
    url = f"{ES_HOST}/{index}/_search?scroll={scroll_ttl}"
    query = {"size": page_size, "query": {"match_all": {}}}
    # TLS certificate verification is disabled on every request (verify=False).
    response = requests.get(url, headers=headers, auth=auth, json=query, verify=False)
    # Fail with the real HTTP error instead of a KeyError on the error payload.
    response.raise_for_status()
    res_json = response.json()

    scroll_id = res_json.get("_scroll_id")
    hits = res_json["hits"]["hits"]
    all_documents = {}

    try:
        # An empty page marks the end of the scroll.
        while hits:
            all_documents.update({hit["_id"]: hit["_source"] for hit in hits})

            scroll_response = requests.post(
                f"{ES_HOST}/_search/scroll",
                headers=headers,
                auth=auth,
                json={"scroll": scroll_ttl, "scroll_id": scroll_id},
                verify=False
            )
            scroll_response.raise_for_status()
            scroll_json = scroll_response.json()
            scroll_id = scroll_json.get("_scroll_id", scroll_id)
            hits = scroll_json["hits"]["hits"]
    finally:
        # Release the server-side scroll context instead of waiting for the
        # keep-alive to expire. Best effort: a failed cleanup must not hide
        # the documents already fetched or an error raised above.
        if scroll_id:
            try:
                requests.delete(
                    f"{ES_HOST}/_search/scroll",
                    headers=headers,
                    auth=auth,
                    json={"scroll_id": scroll_id},
                    verify=False
                )
            except requests.RequestException:
                pass

    return all_documents

In [10]:
# Name of the index that the merged documents are written to.
new_index = "final_data"

# Load both source indices completely into memory.
moutput_data = scroll_fetch_all("moutputdata")
sentiment_data = scroll_fetch_all("sentiment_status")

# Request body used when the target index is created; the relative path is
# resolved against the kernel's working directory.
with open('mapping.json', mode='r', encoding='utf-8') as mapping_file:
    mapping = json.loads(mapping_file.read())

In [11]:
# Recreate the target index from scratch so that it only ever contains the
# documents written by this run.
delete_response = requests.delete(f"{ES_HOST}/{new_index}", auth=auth, verify=False)
# A 404 only means the index did not exist yet; anything else is a real failure.
if delete_response.status_code != 404:
    delete_response.raise_for_status()

create_response = requests.put(f"{ES_HOST}/{new_index}", headers=headers, auth=auth, json=mapping, verify=False)
# Stop here if the index (and its mapping) could not be created, rather than
# bulk-loading into an index that does not match mapping.json.
create_response.raise_for_status()
create_response

<Response [200]>

In [14]:
import os

# Show the directory the kernel is running in; relative paths used in this
# notebook are resolved against it.
working_dir = os.getcwd()
print(working_dir)

/Users/fayezhao/PycharmProjects/comp90024_team_72


In [12]:
def _merge_sentiment(moutput_doc, sentiment_doc):
    """Return a deep copy of ``moutput_doc`` enriched with sentiment data.

    When ``sentiment_doc`` is falsy the copy is returned unchanged. Otherwise
    the author's status receives the sentiment score and label, every
    engagement entry receives a status label, and the interaction counts are
    recomputed from the lengths of the engagement lists.
    """
    merged_doc = copy.deepcopy(moutput_doc)
    if not sentiment_doc:
        return merged_doc

    sentiment = sentiment_doc["sentiment"]
    author_status = merged_doc.setdefault("author", {}).setdefault("status", {})
    author_status["score"] = sentiment["score"]
    author_status["label"] = sentiment["label"]

    if merged_doc.get("engagement") is None:
        merged_doc["engagement"] = {"favorites": [], "replies": [], "reblogs": []}
    engagement = merged_doc["engagement"]

    # `or []` also covers a key that is present but null.
    favorites = engagement.get("favorites") or []
    replies = engagement.get("replies") or []
    reblogs = engagement.get("reblogs") or []

    for favorite in favorites:
        favorite.setdefault("status", {})["label"] = sentiment["label"]

    for reply in replies:
        # NOTE(review): replies get an empty list as label while favorites get
        # the post's label and reblogs get "unknown" -- confirm this is intended.
        reply.setdefault("status", {})["label"] = []

    for reblog in reblogs:
        reblog.setdefault("status", {})["label"] = "unknown"

    counts = engagement.setdefault("interaction_counts", {})
    counts["favorites_count"] = len(favorites)
    counts["reblogs_count"] = len(reblogs)
    counts["replies_count"] = len(replies)

    return merged_doc


def _send_bulk(bulk_lines):
    """POST NDJSON action/document lines to ``_bulk`` and report failures.

    Raises ``requests.HTTPError`` when the request itself fails. Item-level
    failures are logged as a warning instead of being silently dropped.
    """
    response = requests.post(f"{ES_HOST}/_bulk", headers=headers, auth=auth, data="\n".join(bulk_lines) + "\n", verify=False)
    response.raise_for_status()

    result = response.json()
    # The bulk endpoint answers 200 even when individual items were rejected;
    # the per-item outcome is only visible in the response body.
    if result.get("errors"):
        failed = [
            item["index"]
            for item in result.get("items", [])
            if "error" in item.get("index", {})
        ]
        logging.warning(
            "Bulk indexing into %s: %d document(s) failed; first failure: %s",
            new_index,
            len(failed),
            failed[0] if failed else None,
        )


bulk_data = []

for doc_id, moutput_doc in moutput_data.items():
    merged_doc = _merge_sentiment(moutput_doc, sentiment_data.get(doc_id))

    # Each document contributes two NDJSON lines: the action and the source.
    bulk_data.append(json.dumps({"index": {"_index": new_index, "_id": doc_id}}))
    bulk_data.append(json.dumps(merged_doc))

    # Flush once 1000 documents (2000 lines) are buffered.
    if len(bulk_data) >= 1000 * 2:
        _send_bulk(bulk_data)
        bulk_data = []

# Send whatever is left after the last full batch.
if bulk_data:
    _send_bulk(bulk_data)