In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import openai
from openai import OpenAI

import os
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): override=True presumably lets .env values replace variables
# that are already set in the environment — confirm against python-dotenv.
load_dotenv(override=True)
# Read the key from the environment and set it on the openai module.
openai.api_key = os.getenv("OPENAI_API_KEY")
# The client is built with no explicit arguments.
# NOTE(review): presumably it picks up OPENAI_API_KEY on its own — confirm.
client = OpenAI()

In [106]:
import re

In [49]:
import ast

In [3]:
# Notebook-wide settings; 'dataset_dir' is the folder prefix (with trailing
# slash) that the CSV file names are appended to when loading data.
config = dict(dataset_dir='../../Datasets/')

In [4]:
# Research-grant recipients table.
df_penelitian = pd.read_csv(
    f"{config['dataset_dir']}Daftar Penerima Pendanaan Program Penelitian Tahun Anggaran 2024.csv"
)

# Community-service recipients table; its 'Institusi' column is renamed in the
# same chain so both frames expose the institution under 'Nama Institusi'.
df_abdimas = pd.read_csv(
    f"{config['dataset_dir']}Penerima Pendanaan Program Pengabdian kepada Masyarakat Batch II  Tahun  Anggaran 2025.csv"
).rename(columns={'Institusi': 'Nama Institusi'})

In [5]:
# Stack the institution/title columns of both tables into a single frame
# with a fresh 0..n-1 index.
df = pd.concat(
    [frame[['Nama Institusi', 'Judul']] for frame in (df_penelitian, df_abdimas)],
    ignore_index=True,
)

In [27]:
def batch_generator(data, batch_size=50):
    """Yield consecutive slices of ``data`` with at most ``batch_size`` items each.

    Parameters
    ----------
    data : sequence
        Any object supporting ``len()`` and slicing (list, str, ...).
    batch_size : int, optional
        Maximum number of items per yielded slice. Must be >= 1.

    Yields
    ------
    sequence
        ``data[i:i + batch_size]`` for ``i = 0, batch_size, 2 * batch_size, ...``;
        the last slice may be shorter. Nothing is yielded for empty ``data``.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1. Because this is a generator, the
        error surfaces when iteration starts, not when the function is called.
    """
    # A negative step would make range() empty and silently drop every item,
    # and a zero step fails with an unrelated-looking range() error.
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")
    for start in range(0, len(data), batch_size):
        yield data[start:start + batch_size]

In [54]:
# Distinct institution names, in order of first appearance, as a plain list.
all_institusi = [*df['Nama Institusi'].unique()]

In [59]:
def _parse_location_reply(content):
    """Extract and validate the list of 3-field entries from the model's raw reply text."""
    # Take the outermost [...] span so that code fences or explanatory prose
    # around the list do not break parsing.
    start, end = content.find("["), content.rfind("]")
    if start == -1 or end < start:
        raise ValueError(f"No list found in model reply: {content[:200]!r}")
    try:
        # literal_eval only accepts Python literals, so the reply is never executed.
        parsed = ast.literal_eval(content[start:end + 1])
    except (SyntaxError, ValueError, TypeError) as exc:
        raise ValueError(
            f"Could not parse model reply as a Python literal: {content[:200]!r}"
        ) from exc
    if not isinstance(parsed, (list, tuple)):
        raise ValueError(f"Expected a list of tuples, got {type(parsed).__name__}")

    locations = []
    for entry in parsed:
        # Callers index entries as t[0], t[1], t[2]; fail here with a clear message instead.
        if not isinstance(entry, (list, tuple)) or len(entry) != 3:
            raise ValueError(f"Expected (nama_institusi, kota, provinsi) but got: {entry!r}")
        locations.append(tuple(entry))
    return locations


def get_location(list_institusi, model="gpt-4o-2024-08-06"):
    """Ask the chat model for the city/regency and province of each institution.

    Parameters
    ----------
    list_institusi : list of str
        Institution names; the list is interpolated into the prompt as-is.
    model : str, optional
        Chat-completion model name passed to the API.

    Returns
    -------
    list of tuple
        ``(nama_institusi, kabupaten_atau_kota, provinsi)`` entries parsed from
        the model's reply. The names come from the reply, so they may not match
        the input spelling exactly. An empty input returns ``[]`` without
        calling the API.

    Raises
    ------
    ValueError
        If the reply contains no parseable list, or an entry does not have
        exactly three fields.
    """
    if len(list_institusi) == 0:
        return []

    prompt = f"Saya mempunyai list yang berisi nama-nama institusi pendidikan di Indonesia sebagai berikut {list_institusi}\n\n"
    prompt += "Berdasarkan list nama-nama institusi tersebut, cari tahu masing-masing kabupaten atau kota dan provinsi tempat institusi itu berada.\n\n"
    prompt += "Jawab dalam format list yang berisi tuple-tuple seperti ini: [(nama_institusi,kabupaten atau kota,provinsi),(nama_institusi,kabupaten atau kota,provinsi), ...]. Jika salah satu tidak diketahui maka isi dengan 'unknown'."

    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": 'Kamu adalah asisten yang mencari tahu di kota/kabupaten dan provinsi mana suatu institusi berada. Kamu hanya menjawab dalam format list yang berisi tuple'},
            {"role": "user", "content": prompt}
        ],
        temperature=0
    )
    # The message content may be None; treat that as an empty reply so the
    # parser reports it as a ValueError.
    content = (response.choices[0].message.content or "").strip()
    return _parse_location_reply(content)

In [39]:
# Scratch cell: take only the first batch of up to 50 names for a trial run.
gen = batch_generator(all_institusi, 50)
# A generator is its own iterator, so next() can be applied to it directly.
nyoba = next(gen)

In [88]:
# First pass: look up every distinct institution, 50 names per request, and
# collect all returned tuples in one flat list.
gen = batch_generator(all_institusi, 50)
list_tuple_all = []
for list_institusi in gen:
    list_tuple_all.extend(get_location(list_institusi))

In [89]:
# Number of tuples collected from the lookup loop.
len(list_tuple_all)

1285

In [90]:
# Index the lookup results by institution name (first tuple field); when a
# name occurs more than once, the later tuple wins.
institusi_dict = {
    t[0]: {'kota': t[1], 'provinsi': t[2]}
    for t in list_tuple_all
}

In [117]:
def get_kota(x):
    """Return the 'kota' stored for institution ``x`` in ``institusi_dict``.

    Falls back to the string 'unknown' when ``x`` has no entry in the dict.
    """
    lokasi = institusi_dict.get(x, 'unknown')
    return lokasi if lokasi == 'unknown' else lokasi['kota']

In [93]:
# Attach the looked-up city/regency to every row ('unknown' when missing).
df['kota'] = df['Nama Institusi'].map(get_kota)

In [94]:
# Distinct institution names whose 'kota' is still 'unknown' after the first pass.
all_institusi2 = list(df.loc[df['kota'] == 'unknown', 'Nama Institusi'].unique())

In [95]:
# Second pass: retry only the still-unresolved names, again 50 per request.
# This rebinds list_tuple_all, so it now holds second-pass results only.
gen = batch_generator(all_institusi2, 50)
list_tuple_all = []
for list_institusi in gen:
    list_tuple_all.extend(get_location(list_institusi))

In [96]:
# Merge the current lookup results into the existing lookup, overwriting any
# entry stored under the same name.
institusi_dict.update(
    {t[0]: {'kota': t[1], 'provinsi': t[2]} for t in list_tuple_all}
)

In [97]:
# Recompute the column now that institusi_dict has been extended.
df['kota'] = df['Nama Institusi'].map(get_kota)

In [98]:
# Display the 'kota' column as it currently stands.
df['kota']

0           Kota Bogor
1           Kota Bogor
2           Kota Bogor
3           Kota Bogor
4           Kota Bogor
             ...      
12485        Pekanbaru
12486        Pekanbaru
12487        Pekanbaru
12488    Pasaman Barat
12489            Batam
Name: kota, Length: 12490, dtype: object

In [100]:
# Count the rows (not distinct institutions) whose 'kota' is still 'unknown'.
sum(df['kota'] == 'unknown')

633

In [101]:
# Distinct institution names whose 'kota' is currently 'unknown'.
all_institusi3 = list(df.loc[df['kota'] == 'unknown', 'Nama Institusi'].unique())

In [103]:
# Size of the first-pass unresolved list.
# NOTE(review): this inspects all_institusi2 although the preceding cell
# builds all_institusi3 — confirm which list was meant.
len(all_institusi2)

202

In [109]:
# Collapse every run of whitespace in the institution name to a single space
# and trim both ends. This overwrites the column in place.
df['Nama Institusi'] = df['Nama Institusi'].map(
    lambda nama: re.sub(r'\s+', ' ', nama).strip()
)

In [118]:
# Recompute the column from the current 'Nama Institusi' values.
df['kota'] = df['Nama Institusi'].map(get_kota)

In [119]:
# Distinct institution names whose 'kota' is currently 'unknown'.
all_institusi3 = list(df.loc[df['kota'] == 'unknown', 'Nama Institusi'].unique())

In [120]:
# Number of distinct institutions that are still unresolved.
len(all_institusi3)

105

In [127]:
from json import dump,load
import json

In [128]:
# Persist the institution -> {kota, provinsi} lookup as pretty-printed UTF-8
# JSON, keeping non-ASCII characters unescaped.
# NOTE(review): the file name is spelled "insitusi" — confirm it is intended
# before renaming, since other code may read this exact path.
with open("insitusi.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(institusi_dict, ensure_ascii=False, indent=4))