In [71]:
import glob
import hashlib
import itertools
import json
import os
from typing import List

import pandas as pd
import requests

In [45]:
# Experiment grid: the generation cell iterates over every combination of the
# lists defined here, repeated `x` times.

# Model back-ends to query; each entry names a provider and a model.
integrations: List[dict] = [
    dict(provider="huggingFace", model="mistralai/Mistral-7B-Instruct-v0.2"),
    # OpenAI is switched off for the first test run:
    # dict(provider="OpenAI", model="gpt-3.5-turbo"),
]

# Every persona is wrapped in its own single-element list.
personas: List[List[str]] = [
    [persona_name] for persona_name in ("liberal", "conservative", "alt_right")
]

languages: List[str] = [
    "English",
    "German",
    "Dutch",
]

networks: List[str] = [
    "Twitter",
    "Reddit",
]

topics: List[str] = [
    "Ukraine War",
    "Covid-19",
]

# How many times the complete grid is run through.
x: int = 1

In [83]:
# Request one generated post for every (iteration, integration, persona,
# language, network, topic) combination and store each successful result as
# ./raw/<id>.json, where <id> is a SHAKE-256 digest of the sample itself.

# (connect, read) timeouts in seconds, so a stalled connection cannot hang the
# kernel forever. The read timeout is generous on the assumption that text
# generation can be slow; tune as needed.
REQUEST_TIMEOUT = (10, 300)

# Make sure the output directory exists so the cell also runs on a fresh checkout.
os.makedirs('./raw', exist_ok=True)

# A single flat product (iteration first, so the order matches a nested loop)
# lets one `break` abort the whole run. With nested loops the `break` would only
# leave the inner loop and the failing API would be hit again for every
# remaining iteration.
for _, integration, persona, language, network, topic in itertools.product(
    range(x), integrations, personas, languages, networks, topics
):

    payload: dict = {
        "personas": persona,
        "integration": integration,
        "language": language,
        "network": network,
        "topic": topic
    }

    try:
        response = requests.post(
            'https://api.twon.uni-trier.de/generate/',
            json=payload,
            timeout=REQUEST_TIMEOUT,
        )

    except requests.RequestException as e:
        # Network failure or timeout: report it and stop instead of crashing.
        print(e)
        break

    if response.status_code == 500 or response.status_code == 502:
        print("500/502: Connection Error, too many request, try again later.")
        break

    try:
        # Also guards the key lookup: a JSON body without "response" (e.g. an
        # error payload) is reported like any other unusable answer.
        text = response.json()["response"]

    except Exception as e:
        print(e, ':', response)
        break

    sample: dict = {
        "agent": persona[0],
        "model": integration["model"],
        "topic": topic,
        "style": network,
        "language": language,
        "text": text,
        # Placeholders for a later annotation step.
        "annotation": {
            "topic": None,
            "persona": None,
            "authenticity": None
        }
    }

    # The id is derived from the sample *before* the id key is added, so
    # identical samples map to the same file name.
    sample['id'] = hashlib.shake_256(str.encode(json.dumps(sample))).hexdigest(24)

    # Context manager guarantees the file is flushed and closed.
    with open(f'./raw/{sample["id"]}.json', 'w') as handle:
        handle.write(json.dumps(sample))

500/502: Connection Error, too many request, try again later.


In [88]:
# Load every sample stored under ./raw, flatten the nested "annotation" dict
# into dotted columns, index by sample id and persist the result as parquet.

# Read the files one by one inside a context manager so every handle is closed.
# Paths are sorted because glob returns them in arbitrary order, which would
# otherwise make the row order differ between runs.
records: List[dict] = []
for path in sorted(glob.glob('./raw/*.json')):
    with open(path, 'r') as handle:
        records.append(json.load(handle))

dataset: pd.DataFrame = (
    pd.json_normalize(records)
    .set_index('id')
    # Low-cardinality string columns are stored as categoricals.
    .astype({
        'agent': 'category', 'model': 'category',
        'topic': 'category', 'style': 'category',
        'language': 'category'
    })
)
dataset.to_parquet('dataset.parquet')
dataset

Unnamed: 0_level_0,agent,model,topic,style,language,text,annotation.topic,annotation.persona,annotation.authenticity
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4fb4d34ef4731c6fbab113c642756ea9115570b4f409ccba,liberal,gpt-3.5-turbo,Ukraine War,Twitter,English,Heartbroken and deeply concerned about the ong...,,,
7afecff1da3ceda35aa5ac7df8a93dbc9fca15c7561a0f64,liberal,gpt-3.5-turbo,Covid-19,Twitter,English,🌍🌡️🦠 Hey world! COVID-19 has undeniably tested...,,,
b0537c9d1c187ca01e3ce3c605eab5fe39e11e1f383ad0e1,liberal,gpt-3.5-turbo,Ukraine War,Reddit,English,Hey fellow Redditors! Just wanted to take a mo...,,,
2f472f3e336b51d9b1c5c3a6f4f6932340a015911ac4cd31,liberal,mistralai/Mistral-7B-Instruct-v0.2,Ukraine War,Twitter,Dutch,💔 Mijn hart zit vol met tranen voor het lijden...,,,
a1b1cd27765d3f9d1a541c7c4d3894feb1e0123c9a1f0871,liberal,mistralai/Mistral-7B-Instruct-v0.2,Covid-19,Twitter,German,"😷 ist eine globale Herausforderung, die uns a...",,,
89b3f93b9e4dc33aee2284d9c8e4713112bb110cb3a8c6a0,liberal,mistralai/Mistral-7B-Instruct-v0.2,Covid-19,Reddit,English,"😷💉 In these challenging times, it's more impor...",,,
319f4af62a1bde8446ef0823c8d2d3e6fd6f539caec85dbf,liberal,gpt-3.5-turbo,Covid-19,Twitter,English,"""🌍✊ Solidarity in the face of is essential no...",,,
b6ce3b48bd93d8738ecc4cbe645f1763958d31a6ddf5aa5d,liberal,gpt-3.5-turbo,Covid-19,Twitter,English,🌍🌡️🦠 Hey world! COVID-19 has undeniably tested...,,,
7cd082f9d26f68c5129989a7a7dbfc9189995367bb3bac1d,liberal,mistralai/Mistral-7B-Instruct-v0.2,Ukraine War,Twitter,English,💔 Heartbreaking to witness the ongoing conflic...,,,
faaa7730b7f8b190dddf1f3047d8f1f177fdb0d67aae9e86,liberal,gpt-3.5-turbo,Covid-19,Twitter,German,"""Die Covid-19-Pandemie hat uns alle vor große ...",,,


In [89]:
# Export all raw samples as a single JSON array in dataset.json.

# Each input file is opened in a context manager so its handle is closed.
# Paths are sorted because glob returns them in arbitrary order, which would
# otherwise make the exported array differ between runs.
records = []
for path in sorted(glob.glob('./raw/*.json')):
    with open(path, 'r') as handle:
        records.append(json.load(handle))

# Closing the output file via `with` guarantees the data is flushed to disk.
with open('dataset.json', 'w') as handle:
    n_chars = handle.write(json.dumps(records))

# Display the number of characters written, as the cell did before.
n_chars

52944