In [1]:
import lmstudio as lms

import os
import json

model = lms.llm("gemma-3-4b-persian-v0")

In [2]:
with open("./cities.csv", "r") as file:
    cities = file.read()

with open("./cities.json", "r") as file:
    cities_dict = json.load(file)

questions = [
    {
        "field": "location.province",
        "question": "استان مربوط به این رویداد جیست؟ (فقط نام استان)",
    },
]

In [3]:
def update(event: dict, cities, questions: dict, model):
    description = event["description"]

    for question_dict in questions:
        fields = question_dict["field"].split(".")
        question = question_dict["question"]

        answer = model.respond(f"{cities}. {description}. {question}").parsed

        obj = event
        for field in fields[:-1]:
            obj = obj[field]
        obj[fields[-1]] = answer
    return event

In [4]:
def get_city(event: dict, cities, model):
    if event["location"]["city"] not in ["نامعلوم", "<TBD>"]:
        return event

    question = "شهر مربوط به این رویداد جیست؟ (فقط نام شهر که باید در لیست باشد)"
    answer = model.respond(f"{cities}. {event["description"]}. {question}").parsed

    event["location"]["city"] = answer

    return event

In [5]:
def get_latitude_longitude(event: dict, cities, cities_dict, model):
    if (
        event["location"]["coordinates"]["latitude"] != "<TBD"
        and event["location"]["coordinates"]["longitude"] != "<TBD>"
    ):
        return event

    question = "مرکز استان مربوط به این رویداد چیست؟ (فقط نام شهر)"
    answer = model.respond(f"{cities}. {event["description"]}. {question}").parsed

    if answer not in cities_dict:
        return event

    latitude = cities_dict[answer]["coordinates"]["latitude"]
    longitude = cities_dict[answer]["coordinates"]["longitude"]

    event["location"]["coordinates"]["latitude"] = latitude
    event["location"]["coordinates"]["longitude"] = longitude

    return event

In [8]:
INPUT_ROOT = "./DATASET-V1"
OUTPUT_ROOT = "./DATASET-V2"

os.makedirs(OUTPUT_ROOT, exist_ok=True)

for dirpath, dirnames, filenames in os.walk(INPUT_ROOT):
    relative_path = os.path.relpath(dirpath, INPUT_ROOT)
    output_dirpath = os.path.join(OUTPUT_ROOT, relative_path)

    os.makedirs(output_dirpath, exist_ok=True)

    for filename in filenames:
        input_path = os.path.join(dirpath, filename)

        with open(input_path, "r", encoding="utf-8") as file:
            event = json.load(file)

        event = update(event, "", questions, model)
        event = get_city(event, "", model)
        event = get_latitude_longitude(event, "", cities_dict, model)

        output_path = os.path.join(output_dirpath, filename)
        with open(output_path, "w", encoding="utf-8") as file:
            json.dump(event, file, ensure_ascii=False, indent=4)

        print(f"✅ successfully processed {output_path}")

✅ successfully processed ./DATASET-V2/BIRTH/115.json
✅ successfully processed ./DATASET-V2/BIRTH/050.json
✅ successfully processed ./DATASET-V2/BIRTH/142.json
✅ successfully processed ./DATASET-V2/BIRTH/007.json
✅ successfully processed ./DATASET-V2/BIRTH/011.json
✅ successfully processed ./DATASET-V2/BIRTH/103.json
✅ successfully processed ./DATASET-V2/BIRTH/046.json
✅ successfully processed ./DATASET-V2/BIRTH/085.json
✅ successfully processed ./DATASET-V2/BIRTH/093.json
✅ successfully processed ./DATASET-V2/BIRTH/139.json
✅ successfully processed ./DATASET-V2/BIRTH/119.json
✅ successfully processed ./DATASET-V2/BIRTH/031.json
✅ successfully processed ./DATASET-V2/BIRTH/066.json
✅ successfully processed ./DATASET-V2/BIRTH/089.json
✅ successfully processed ./DATASET-V2/BIRTH/123.json
✅ successfully processed ./DATASET-V2/BIRTH/070.json
✅ successfully processed ./DATASET-V2/BIRTH/135.json
✅ successfully processed ./DATASET-V2/BIRTH/027.json
✅ successfully processed ./DATASET-V2/BIRTH/02