<a href="https://colab.research.google.com/github/GryffindorafAviator/verifiable_ai_oracle_for_prediction_markets/blob/main/pm_data_clean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the input and output paths used by
# the cells below all live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Whitelist of top-level fields kept on each event record. Keys are matched
# exactly against the incoming JSON, and the filtered output lists its fields
# in this order.
EVENTS_KEYS = [
    "id",
    "title",
    "subtitle",
    "description",
    "resolutionSource",
    "startDate",
    "creationDate",
    "endDate",
    "liquidity",
    # Was misspelled "volumn"; a misspelled key never matches the payload,
    # so the field would be silently dropped from every filtered event.
    "volume",
    "openInterest",
    "category",
    "subcategory",
    "volume24hr",
    "volume1wk",
    "volume1mo",
    "volume1yr",
    "parentEvent",
    "liquidityAmm",
    "liquidityClob",
    "negRisk",
    "negRiskMarketID",
    "subEvents",
    "markets",
    "series",
    "categories",
    "tags",
    "cyom",
    "closedTime",
    "automaticallyResolved",
    "eventDate",
    "startTime",
    "eventWeek",
    "ended",
    "finishedTimestamp",
    "eventCreators",
    "estimateValue",
    "cantEstimate",
    "estimatedValue",
]

# Whitelist of fields kept on each market object nested inside an event.
# The order below is the order the fields appear in the filtered output.
MARKETS_KEYS = [
    "id", "question", "conditionId", "resolutionSource", "endDate",
    "category", "ammType", "liquidity", "startDate", "description",
    "outcomes", "outcomePrices", "volume", "marketType",
    "marketMakerAddress", "resolvedBy", "marketGroup",
    "groupItemTitle", "groupItemThreshold", "questionID",
    "umaEndDate", "umaResolutionStatus",
    "volumeNum", "liquidityNum", "hasReviewedDates",
    "volume24hr", "volume1wk", "volume1mo", "volume1yr",
    "fpmmLive",
    "volume24hrAmm", "volume1wkAmm", "volume1moAmm", "volume1yrAmm",
    "volume24hrClob", "volume1wkClob", "volume1moClob", "volume1yrClob",
    "volumeAmm", "volumeClob",
    "liquidityAmm", "liquidityClob",
    "events", "categories", "tags",
    "competitive", "automaticallyResolved",
    "lastTradePrice", "bestBid", "bestAsk",
    "negRiskOther", "umaResolutionStatuses",
]

# Whitelist of fields kept on a series object.
SERIES_KEYS = [
    "id", "title", "subtitle", "seriesType", "description",
    "pythTokenID", "events", "categories", "tags",
]

# Whitelist of fields kept on each category object.
CATEGORIES_KEYS = ["id", "label", "parentCategory"]

# Whitelist of fields kept on each tag object.
TAGS_KEYS = ["id", "label"]

In [None]:
def filter_dict(obj: dict, allowed_keys: list) -> dict:
    """Project ``obj`` onto the whitelist ``allowed_keys``.

    The result lists keys in the order they occur in ``allowed_keys``; keys
    that ``obj`` does not contain are left out rather than filled with a
    placeholder. Note that despite the annotation, a non-dict ``obj``
    produces ``None``.
    """
    if isinstance(obj, dict):
        kept = {}
        for key in allowed_keys:
            if key in obj:
                kept[key] = obj[key]
        return kept
    return None

def filter_list(items, allowed_keys):
    """Apply ``filter_dict`` to every element of ``items``.

    Returns one result per element, in order (so non-dict elements show up as
    whatever ``filter_dict`` returns for them). If ``items`` is not a list,
    an empty list is returned.
    """
    if isinstance(items, list):
        filtered = []
        for entry in items:
            filtered.append(filter_dict(entry, allowed_keys))
        return filtered
    return []

In [None]:
import json

def _filter_series(series, series_keys):
    """Filter a ``series`` payload that is either one object or a list of them."""
    # NOTE(review): the upstream payload appears to deliver series as a list
    # in some cases - confirm against the data. filter_dict alone would turn
    # such a list into None and lose the data.
    if isinstance(series, list):
        return filter_list(series, series_keys)
    return filter_dict(series, series_keys)


def _filter_nested(source, target, series_keys, categories_keys, tags_keys):
    """Replace series/categories/tags in ``target`` with filtered copies from ``source``."""
    if "series" in source:
        target["series"] = _filter_series(source["series"], series_keys)
    if "categories" in source:
        target["categories"] = filter_list(source["categories"], categories_keys)
    if "tags" in source:
        target["tags"] = filter_list(source["tags"], tags_keys)


def filter_events_jsonl(
    input_file: str,
    output_file: str,
    EVENTS_KEYS: list,
    MARKETS_KEYS: list,
    SERIES_KEYS: list,
    CATEGORIES_KEYS: list,
    TAGS_KEYS: list,
):
    """Copy a JSONL file of events, keeping only whitelisted fields.

    Each non-blank input line is parsed as one JSON event. The event, its
    nested markets, and the series/categories/tags found on the event and on
    each market are reduced to the corresponding key whitelist, and the
    result is written as one JSON object per line.

    Args:
        input_file: Path of the JSONL file to read.
        output_file: Path of the JSONL file to write (overwritten).
        EVENTS_KEYS: Fields kept on each event.
        MARKETS_KEYS: Fields kept on each market in ``event["markets"]``.
        SERIES_KEYS: Fields kept on series objects.
        CATEGORIES_KEYS: Fields kept on category objects.
        TAGS_KEYS: Fields kept on tag objects.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    filtered_count = 0
    skipped_count = 0

    with open(input_file, "r", encoding="utf-8") as fin, \
         open(output_file, "w", encoding="utf-8") as fout:

        for line in fin:
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue

            event = json.loads(line)

            # Filter event-level fields
            filtered_event = filter_dict(event, EVENTS_KEYS)
            if filtered_event is None:
                # The line held valid JSON that is not an object; there is
                # nothing to filter, so leave it out of the output.
                skipped_count += 1
                continue

            # Filter nested markets
            markets = event.get("markets")
            if isinstance(markets, list):
                filtered_markets = []
                for market in markets:
                    filtered_market = filter_dict(market, MARKETS_KEYS)
                    # A non-dict market filters to None; keep the placeholder
                    # (as filter_list does) instead of failing on it.
                    if filtered_market is not None:
                        _filter_nested(
                            market, filtered_market,
                            SERIES_KEYS, CATEGORIES_KEYS, TAGS_KEYS,
                        )
                    filtered_markets.append(filtered_market)
                filtered_event["markets"] = filtered_markets

            # Filter event-level series, categories, and tags
            _filter_nested(
                event, filtered_event,
                SERIES_KEYS, CATEGORIES_KEYS, TAGS_KEYS,
            )

            # Write to a new file (JSONL format)
            fout.write(json.dumps(filtered_event) + "\n")
            filtered_count += 1

    print(f"Filtering completed. Processed {filtered_count} events in total.")
    if skipped_count:
        print(f"Skipped {skipped_count} lines that were not JSON objects.")
    print(f"Output file saved to: {output_file}")

In [None]:
# Run the whitelist filter over the raw closed-events dump.
INPUT_FILE = "/content/drive/MyDrive/AI_Oracle/all_closed_events.jsonl"
OUTPUT_FILE = "/content/drive/MyDrive/AI_Oracle/all_closed_events_filtered.jsonl"

filter_events_jsonl(
    INPUT_FILE,
    OUTPUT_FILE,
    EVENTS_KEYS,
    MARKETS_KEYS,
    SERIES_KEYS,
    CATEGORIES_KEYS,
    TAGS_KEYS,
)

In [None]:
import json

# Convert the filtered JSONL file into a single indented JSON array so it is
# easier to inspect by eye.
input_file = "/content/drive/MyDrive/AI_Oracle/all_closed_events_filtered.jsonl"
output_file = "/content/drive/MyDrive/AI_Oracle/all_closed_events_filtered_pretty.json"

data = []
with open(input_file, "r", encoding="utf-8") as f:
    # One JSON document per line; the whole file is held in memory.
    data.extend(json.loads(line) for line in f)

with open(output_file, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=2, ensure_ascii=False)

print("Saved pretty JSON file")

In [None]:
import json
from collections import defaultdict

def collect_event_metadata(input_file):
    """
    Collect unique event-level categories, tags, and series
    from a JSONL file.

    Each non-blank line is parsed as one JSON event. From every event that
    is a JSON object this gathers:

    * ``category`` values (``null`` is ignored),
    * the ``label`` of each object in ``tags``,
    * the ``title`` of the ``series`` object, or of each object when
      ``series`` is a list.

    The three sets are printed, then returned together with the number of
    parsed lines.

    Args:
        input_file: Path of the JSONL file to read.

    Returns:
        dict with keys ``total_events`` (int) and ``categories``, ``tags``,
        ``series`` (sets).

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    categories = set()
    tags = set()
    series = set()

    total_events = 0

    with open(input_file, "r", encoding="utf-8") as fin:
        for line in fin:
            # Blank lines (e.g. a trailing newline) are not records.
            if not line.strip():
                continue

            event = json.loads(line)
            total_events += 1

            if not isinstance(event, dict):
                continue

            # ---- Categories (event level) ----
            category = event.get("category")
            if category is not None:
                categories.add(category)

            # ---- Tags (event level) ----
            event_tags = event.get("tags")
            if isinstance(event_tags, list):
                for tag in event_tags:
                    if isinstance(tag, dict) and tag.get("label") is not None:
                        tags.add(tag["label"])

            # ---- Series (event level) ----
            event_series = event.get("series")
            if isinstance(event_series, dict):
                # Treat a single series object like a one-element list.
                event_series = [event_series]
            if isinstance(event_series, list):
                for entry in event_series:
                    if isinstance(entry, dict) and entry.get("title") is not None:
                        series.add(entry["title"])

    print(categories)
    print(tags)
    print(series)

    return {
        "total_events": total_events,
        "categories": categories,
        "tags": tags,
        "series": series
    }

In [None]:
# Summarize the metadata found in the raw (unfiltered) closed-events dump.
INPUT_FILE = "/content/drive/MyDrive/AI_Oracle/all_closed_events.jsonl"

result = collect_event_metadata(INPUT_FILE)

print(f"Total events processed: {result['total_events']}")
for label in ("categories", "tags", "series"):
    print(f"Unique {label}: {len(result[label])}")