In [6]:
import json
import logging
import os
import re

import emoji
import requests
import urllib3
from tqdm import tqdm
from transformers import pipeline

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

In [7]:
# Connection settings for the Elasticsearch search endpoint.
# Values are read from the environment so real credentials never need to be
# committed with the notebook; the fallbacks match the local development setup.
url = os.environ.get("ES_SEARCH_URL", "https://127.0.0.1:9200/final_data/_search")
auth = (
    os.environ.get("ES_USER", "elastic"),
    os.environ.get("ES_PASSWORD", "elastic"),
)
headers = {'Content-Type': 'application/json'}

In [8]:
# Zero-shot classification pipeline used to label each reply against the
# candidate stances; the model is loaded once here and pinned to the CPU.
classifier = pipeline(
    task="zero-shot-classification",
    model="joeddav/xlm-roberta-large-xnli",
    device="cpu",
)

# Maps each candidate label shown to the classifier onto the short stance
# label stored on a reply.
label_mapping = {
    "agrees with the main post (oppose tariff)": "oppose tariff",
    "disagrees with the main post (support tariff)": "support tariff",
    "neutral or irrelevant": "unknown",
}

# Candidate labels passed to the classifier, in the mapping's insertion order.
labels = list(label_mapping)

Device set to use cpu


In [9]:
def text_preprocessing(text):
    """Clean a raw post or reply before it is passed to the classifier.

    Args:
        text: Raw text. Any value that is not a ``str`` is treated as missing.

    Returns:
        The cleaned text with markup, URLs and mentions removed, hashtags
        reduced to their word, emojis replaced by their text names and
        whitespace collapsed. Returns ``""`` for non-string input.
    """
    if not isinstance(text, str):
        return ""
    # Structural markup must be stripped BEFORE bare URLs: the URL pattern
    # consumes every non-space character, so running it first would swallow the
    # closing ")" of a Markdown link or the '">' of an HTML tag and leave
    # fragments such as "[docs](" or '<a href="' in the text.
    # remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    # remove Markdown images and links (images first: they start with "!")
    text = re.sub(r'!\[.*?\]\(.*?\)', '', text)
    text = re.sub(r'\[.*?\]\(.*?\)', '', text)
    # remove remaining bare URLs
    text = re.sub(r'http\S+|www\S+', '', text)
    # keep hashtag words, remove the "#" symbol
    text = re.sub(r'#(\w+)', r'\1', text)
    # remove @someone
    text = re.sub(r'@\w+', '', text)
    # convert emojis to text descriptions
    text = emoji.demojize(text)
    # collapse every whitespace run (including line breaks) into one space
    text = re.sub(r'\s+', ' ', text)
    # remove extra whitespace from start and end of text
    return text.strip()

In [10]:
# Search body: up to 10000 documents whose author status label is
# "oppose tariff" and whose author status score is greater than 0.9.
author_label_clause = {"term": {"author.status.label": "oppose tariff"}}
author_score_clause = {"range": {"author.status.score": {"gt": 0.9}}}

query = {
    "size": 10000,
    "query": {"bool": {"must": [author_label_clause, author_score_clause]}},
}

# Send the search body to Elasticsearch.
# verify=False disables TLS certificate verification for this request.
# An explicit (connect, read) timeout in seconds is set because requests
# otherwise waits indefinitely on a stalled connection.
response = requests.get(
    url,
    headers=headers,
    auth=auth,
    json=query,
    verify=False,
    timeout=(10, 300),
)

In [13]:
def run_sentiment_pipeline(data, output_path="oppose_data.json"):
    """Label every reply in a search response and write the documents to JSON.

    Each reply of each document gets a ``status`` entry of the form
    ``{"label": ...}``. Replies whose cleaned text is empty are labelled
    ``"unknown"`` without calling the classifier; the others are classified
    together with the cleaned main post text and the top-ranked candidate label
    is translated through ``label_mapping``.

    Args:
        data: Parsed search response; documents are read from
            ``data["hits"]["hits"][i]["_source"]`` and modified in place.
        output_path: File the labelled documents are written to. Defaults to
            ``"oppose_data.json"``.

    Raises:
        KeyError: If the response lacks ``hits``/``_source``, or a document
            that has replies lacks ``content``.
    """
    documents = [hit["_source"] for hit in data["hits"]["hits"]]

    for doc in tqdm(documents, desc="Processing documents"):
        # `or` fallbacks also cover fields that are present but null, which
        # a plain .get(key, default) would pass through as None.
        replies = (doc.get("engagement") or {}).get("replies") or []
        if not replies:
            continue

        # Only clean the main post once we know there is a reply to compare it to.
        main_text = text_preprocessing(doc["content"].get("text", ""))

        for reply in replies:
            if not isinstance(reply, dict):
                # A null/non-object entry has nowhere to store a status.
                continue

            reply_text = text_preprocessing(reply.get("content", ""))

            if not reply_text:
                reply["status"] = {"label": "unknown"}
                continue

            sequence_to_classify = f"Main post: {main_text} Reply: {reply_text}"

            result = classifier(sequence_to_classify, labels)
            # result['labels'][0] is taken as the predicted label.
            predicted_label = result['labels'][0]
            reply["status"] = {"label": label_mapping[predicted_label]}

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(documents, f, ensure_ascii=False, indent=2)

In [14]:
if __name__ == "__main__":
    # Anything other than HTTP 200 is reported and the pipeline is skipped.
    if response.status_code != 200:
        print(f"Query failed: {response.status_code}, {response.text}")
    else:
        data = response.json()
        run_sentiment_pipeline(data)

Processing documents: 100%|██████████| 2021/2021 [15:07<00:00,  2.23it/s]  
