## Libraries and Helper Functions

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import string
import nltk
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.decomposition import NMF, LatentDirichletAllocation, TruncatedSVD

from umap.umap_ import UMAP
from hdbscan import HDBSCAN
from bertopic import BERTopic
from bertopic.representation import KeyBERTInspired,MaximalMarginalRelevance
from bertopic.representation import OpenAI as OpenAI_BERTopic
from sentence_transformers import SentenceTransformer

from gensim.corpora import Dictionary
from gensim.models import CoherenceModel

import os
from dotenv import load_dotenv

import optuna

nltk.download('punkt')
nltk.download('stopwords')

import torch
import random
import os

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hanif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hanif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
import torch
import torchvision

In [3]:
load_dotenv(override=True)

True

In [4]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.StopWordRemover.StopWordRemover import StopWordRemover

In [92]:
import os
import json
import re
from openai import OpenAI
import openai

openai.api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI()

def summarize_topic(keywords, docs):
    prompt = f"Saya mempunyai sebuah topik terkait penelitian ilmiah atau pengabdian masyarakat yang dapat di-deskripsikan dengan beberapa kata kunci berikut: {keywords}\n\n"
    prompt += "Di dalam topik ini, dokumen-dokumen berikut adalah judul penelitian atau pengabdian masyarakat yang merupakan sebagian kecil namun representatif dari semua dokumen dalam topik ini:\n\n"
    prompt += "\n\n".join(docs[:5])  # ambil 5 dokumen saja
    prompt += 'Berdasarkan informasi di atas, deskripsikan topik ini dan berikan label berupa satu kalimat yang representatif terhadap topik ini dengan format JSON berikut:\n {"label_topik":<label> , "deskripsi":<deskripsi>}'

    response = client.chat.completions.create(
        model="gpt-4o-2024-08-06",
        messages=[
            {"role": "system", "content": 'Kamu adalah asisten akademik yang merangkum topik. Anda adalah model yang hanya menjawab dalam format JSON valid. Jangan beri penjelasan. Berikan hanya output seperti ini: {"label_topik": "...", "deskripsi": "..."}'},
            {"role": "user", "content": prompt}
        ],
        temperature=0.5,
        max_tokens=300
    )
    content = response.choices[0].message.content.strip()
    try:
        match = re.search(r'\{.*?\}', content, re.DOTALL)
        json_str = match.group(0)
        result = json.loads(json_str)
        return result
    except json.JSONDecodeError:
        print("JSON tidak valid:", content)


In [6]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ['PYTHONHASHSEED'] = str(seed)
    
set_seed()
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hanif\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\hanif\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [7]:
import time
class StopWatch():
    def __init__(self):
        self.__start_time = 0
        self.__end_time = 0
        self.__elapsed_time = 0
    def start(self):
        self.__start_time = time.time()
        self.__end_time = 0
    def stop(self):
        self.__end_time = time.time()
        self.__elapsed_time = self.__end_time - self.__start_time
    def print(self):
        detik = self.__elapsed_time

        jam = detik // 3600
        detik = detik % 3600

        menit = detik // 60
        detik = detik % 60

        print('Waktu eksekusi: ',end='')
        if jam != 0:
            print(int(jam),'jam ',end='')
        if menit != 0:
            print(int(menit),'menit',end='')
        print(f' {detik:.2f}','detik.')
    def get_elapsed_time(self):
        return self.__elapsed_time

In [8]:
factory = StopWordRemoverFactory()
stopword_remover = factory.create_stop_word_remover()  
repeated_phrases = ["kecerdasan buatan"]

additional_stop_words = ['ai']

stop_words = set(stopwords.words('english'))
for word in additional_stop_words:
    stop_words.add(word)

def preprocess_text(text):
    # Lowercasing
    text = text.lower()
    
    # URL removal
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Punctuation removal
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Remove repeated phrases
    for phrase in repeated_phrases:
        text = re.sub(phrase,"",text)

    # Remove duplicate white space
    text = re.sub(r'\s+', ' ', text)
    
    text = stopword_remover.remove(text) 

     # Tokenization
    tokens = word_tokenize(text)
    
    # Stopword removal
    tokens = [word for word in tokens if word not in stop_words]
    
    return ' '.join(tokens)

In [9]:
def trim(x):
    if x[-8:]=='Collapse':
        return x[:-8]
    elif x[-9:] == ' … Expand':
        return x[:-9]
    else:
        return x

In [10]:
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"\nTopic #{topic_idx + 1}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

In [11]:
def get_topics_words(model, vectorizer):
    feature_names = vectorizer.get_feature_names_out()
    topics_words = []
    for topic_idx, topic in enumerate(model.components_):
        top_features_ind = topic.argsort()[:-11:-1]
        top_words = [feature_names[i] for i in top_features_ind]
        topics_words.append(top_words)

    return topics_words

def get_bert_topics_words(topic_model):
    bert_topics_words = []
    for topic_id in topic_model.get_topics().keys():
        if topic_id == -1:
            continue
        topic_words = [word for word, _ in topic_model.get_topic(topic_id)]
        bert_topics_words.append(topic_words)
    return bert_topics_words

def get_coherence(topics_words):
    sw = StopWatch()
    sw.start()
    
    coherence = CoherenceModel(
                    topics=topics_words,
                    texts=tokenized_docs,
                    dictionary=dictionary,
                    coherence='c_v'
                ).get_coherence()

    sw.stop()
    sw.print()

    return coherence

In [12]:
config = {
    'dataset_dir' : '../../Datasets/'
}

## EDA

In [13]:
df_penelitian = pd.read_csv(config['dataset_dir']+'Daftar Penerima Pendanaan Program Penelitian Tahun Anggaran 2024.csv')
df_abdimas = pd.read_csv(config['dataset_dir']+'Penerima Pendanaan Program Pengabdian kepada Masyarakat Batch II  Tahun  Anggaran 2025.csv')

In [14]:
df_penelitian

Unnamed: 0,No,Kategori Institusi,Nama Institusi,NIDN,Nama,Judul,Ruang Lingkup
0,1,PTNBH,Institut Pertanian Bogor,27046503,Achmad Farajallah,Pola Distribusi Macrobrachium Sintangense di J...,PPS- PTM
1,2,PTNBH,Institut Pertanian Bogor,24129002,Adisti Permatasari Putri Hartoyo,APLIKASI BIO-NANOFERTILIZERS DAN DRONE SEEDING...,PFR
2,3,PTNBH,Institut Pertanian Bogor,24129002,Adisti Permatasari Putri Hartoyo,APLIKASI SEED BOMB TECHNOLOGY DAN NANO- CS/PMA...,PPS- PTM
3,4,PTNBH,Institut Pertanian Bogor,2076607,Agus Buono,Optimasi Model Hybrid Convolutional Vision Tra...,PPS- PDD
4,5,PTNBH,Institut Pertanian Bogor,18096208,Agus Hikmat,Pengembangan Sistem Agroforestri dan Produk Ar...,PFR
...,...,...,...,...,...,...,...
11989,11990,LLDIKTI Wilayah XVI,Universitas Trinita,912086503,Yongker Baali,Pemanfaatan Limbah Tempurung Kelapa untuk pros...,PDP Reguler
11990,11991,LLDIKTI Wilayah XVI,Universitas Widya Nusantara,1613039801,Hardianti,Nuget Daun Krokot Dan Tepung Bekatul Sebagai A...,PDP Reguler
11991,11992,LLDIKTI Wilayah XVI,Universitas Widya Nusantara,917068903,Matius Paundanan,"Studi Kandungan Logam Berat (Hg, Pb, Cd dan Cu...",PDP Reguler
11992,11993,LLDIKTI Wilayah XVI,Universitas Widya Nusantara,1602099801,Nuristha Febrianti,Pemanfaatan Pangan Lokal Stik Dangke Kelor Seb...,PDP Reguler


In [15]:
df_abdimas = df_abdimas.rename(columns={'Institusi':'Nama Institusi'})

In [16]:
df = pd.concat([df_penelitian[['Nama Institusi','Judul']],df_abdimas[['Nama Institusi','Judul']]]).reset_index(drop=True)

In [17]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12490 entries, 0 to 12489
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Nama Institusi  12490 non-null  object
 1   Judul           12490 non-null  object
dtypes: object(2)
memory usage: 195.3+ KB


## Preprocessing and Feature Engineering #1

In [18]:
df_tm = df[['Judul']].copy()

In [19]:
sw = StopWatch()
sw.start()
df_tm['processed_judul'] = df_tm['Judul'].apply(preprocess_text)
sw.stop()
sw.print()

Waktu eksekusi:  2.73 detik.


In [20]:
df_tm.head()

Unnamed: 0,Judul,processed_judul
0,Pola Distribusi Macrobrachium Sintangense di J...,pola distribusi macrobrachium sintangense jawa...
1,APLIKASI BIO-NANOFERTILIZERS DAN DRONE SEEDING...,aplikasi bionanofertilizers drone seeding sist...
2,APLIKASI SEED BOMB TECHNOLOGY DAN NANO- CS/PMA...,aplikasi seed bomb technology nano cspmaanpkmi...
3,Optimasi Model Hybrid Convolutional Vision Tra...,optimasi model hybrid convolutional vision tra...
4,Pengembangan Sistem Agroforestri dan Produk Ar...,pengembangan sistem agroforestri produk berbas...


## Topic Modelling

In [21]:
docs = df_tm['processed_judul'].to_list()

In [22]:
def get_top_keyword(vectorizer,model):
    componenets = model.components_
    feature_names = vectorizer.get_feature_names_out()
    top_n = 10
    keywords_per_topic = [
        [feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]] for topic in componenets
    ]
    return keywords_per_topic

def select_representative_docs(model,topics):
    topics_docs = {}
    for topic_idx in range(model.n_components):
        topic_strength = topics[:,topic_idx]
        top_doc_indices = topic_strength.argsort()[::-1][:5]
        topics_docs[topic_idx] = [docs[i] for i in top_doc_indices]
    return topics_docs

In [23]:
def summarize(model, topics, vectorizer):
    r_docs = select_representative_docs(model,topics)
    top_key = get_top_keyword(vectorizer,model)
    topic_desc = {}
    for topic_idx, keywords in enumerate(top_key):
        docs_sample = r_docs[topic_idx]
        result = summarize_topic(keywords, docs_sample)
        topic_desc[topic_idx] = result
    return topic_desc

In [24]:
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf = tfidf_vectorizer.fit_transform(df_tm['processed_judul'])

count_vectorizer = CountVectorizer(max_features=1000)
count = count_vectorizer.fit_transform(df_tm['processed_judul'])

### NMF

In [25]:
sw = StopWatch()
sw.start()
nmf_model = NMF(n_components=10, random_state=42)
nmf_topics = nmf_model.fit_transform(tfidf)
sw.stop()
sw.print()

Waktu eksekusi:  0.83 detik.


In [26]:
df_tm['NMF_topic'] = np.argmax(nmf_topics,axis=1)

In [27]:
df_tm['NMF_topic'].value_counts()

NMF_topic
4    1559
8    1558
6    1543
7    1495
3    1365
5    1361
2    1056
0     966
1     901
9     686
Name: count, dtype: int64

### LDA

In [28]:
sw = StopWatch()
sw.start()
lda_model = LatentDirichletAllocation(n_components=10, random_state=42)
lda_topics = lda_model.fit_transform(count)
sw.stop()
sw.print()

Waktu eksekusi:  24.57 detik.


In [29]:
df_tm['LDA_topic'] = np.argmax(lda_topics,axis=1)

In [30]:
df_tm['LDA_topic'].value_counts()

LDA_topic
5    2031
1    1864
4    1734
3    1319
8    1082
9    1042
0     939
7     899
6     795
2     785
Name: count, dtype: int64

### LSA

In [31]:
sw = StopWatch()
sw.start()
lsa_model = TruncatedSVD(n_components=10, random_state=42)
lsa_topics = lsa_model.fit_transform(tfidf)
sw.stop()
sw.print()

Waktu eksekusi:  0.06 detik.


In [32]:
df_tm['LSA_topics'] = np.argmax(lsa_topics,axis=1)

In [33]:
df_tm['LSA_topics'].value_counts()

LSA_topics
0    7477
2    1526
5     927
6     808
7     592
9     414
4     210
8     201
1     188
3     147
Name: count, dtype: int64

### BERTopic

In [34]:
# Prepare sub-models
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
umap_model = UMAP(n_components=5, n_neighbors=50, random_state=42, metric="cosine", verbose=True)
hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, prediction_data=True, min_cluster_size=20)
vectorizer_model = CountVectorizer(ngram_range=(1, 3), min_df=5)

In [35]:
# Representation models
representation_models = {
    "KeyBERTInspired": KeyBERTInspired()
}

# Fit BERTopic
sw = StopWatch()
sw.start()

topic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_models,
        verbose=True
).fit(docs)

sw.stop()
sw.print()

2025-07-28 07:08:39,683 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2025-07-28 07:08:50,582 - BERTopic - Embedding - Completed ✓
2025-07-28 07:08:50,583 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


UMAP(angular_rp_forest=True, metric='cosine', n_components=5, n_jobs=1, n_neighbors=50, random_state=42, verbose=True)
Mon Jul 28 07:08:50 2025 Construct fuzzy simplicial set
Mon Jul 28 07:08:50 2025 Finding Nearest Neighbors
Mon Jul 28 07:08:50 2025 Building RP forest with 11 trees
Mon Jul 28 07:09:03 2025 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	Stopping threshold met -- exiting after 3 iterations
Mon Jul 28 07:09:41 2025 Finished Nearest Neighbor Search
Mon Jul 28 07:09:46 2025 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Mon Jul 28 07:10:01 2025 Finished embedding


2025-07-28 07:10:01,201 - BERTopic - Dimensionality - Completed ✓
2025-07-28 07:10:01,203 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-28 07:10:01,956 - BERTopic - Cluster - Completed ✓
2025-07-28 07:10:01,964 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-28 07:10:05,440 - BERTopic - Representation - Completed ✓


Waktu eksekusi: 1 menit 26.21 detik.


In [36]:
topic_info = topic_model.get_topic_info()  
id_to_name = topic_info.set_index("Topic")["Name"].to_dict()

In [37]:
id_to_name

{-1: '-1_berbasis_pengembangan_model_indonesia',
 0: '0_menggunakan_deep_deteksi_sistem',
 1: '1_wisata_tourism_desa wisata_destinasi',
 2: '2_bakteri_tanaman_endofit_senyawa',
 3: '3_ikan_perikanan_perairan_pakan',
 4: '4_budaya_adat_masyarakat_tradisi',
 5: '5_marketing_digital marketing_digital_pemasaran',
 6: '6_pangan_ketahanan pangan_ketahanan_petani',
 7: '7_listrik_energi_pembangkit_surya',
 8: '8_reality_augmented_augmented reality_virtual',
 9: '9_matematika_kemampuan_siswa_pembelajaran',
 10: '10_inggris_bahasa inggris_bahasa_membaca',
 11: '11_kanker_payudara_sel_antikanker',
 12: '12_bencana_mitigasi_banjir_tsunami',
 13: '13_green_hijau_green economy_economy',
 14: '14_antioksidan_ekstrak_daun_etanol',
 15: '15_ion_sensor_material_oksida',
 16: '16_berpikir_keterampilan_berpikir kritis_kritis',
 17: '17_sapi_daging_ayam_telur',
 18: '18_diabetes_tikus_ekstrak_daun',
 19: '19_nanopartikel_luka_ekstrak_delivery',
 20: '20_ibu_bayi_stunting_pencegahan',
 21: '21_katalis_baha

In [38]:
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,KeyBERTInspired,Representative_Docs
0,-1,6692,-1_berbasis_pengembangan_model_indonesia,"[berbasis, pengembangan, model, indonesia, men...","[pembelajaran, learning, pendidikan, sekolah d...",[konstruktivisme model pembelajaran cognitive ...
1,0,586,0_menggunakan_deep_deteksi_sistem,"[menggunakan, deep, deteksi, sistem, learning,...","[machine learning, neural network, artificial ...",[pengklasifikasi bibit kelapa menggunakan algo...
2,1,455,1_wisata_tourism_desa wisata_destinasi,"[wisata, tourism, desa wisata, destinasi, desa...","[tourism, desa wisata, wisatawan, wisata, kota...",[optimalisasi desa wisata berbasis umkm melalu...
3,2,362,2_bakteri_tanaman_endofit_senyawa,"[bakteri, tanaman, endofit, senyawa, daun, baw...","[bakteri, senyawa bioaktif, infeksi, bioaktif,...",[analisis senyawa bioaktif identifikasi moleku...
4,3,208,3_ikan_perikanan_perairan_pakan,"[ikan, perikanan, perairan, pakan, dna, endemi...","[perikanan, ikan, ekosistem, wilayah pesisir, ...",[rekonstruksi modernisasi perikanan tangkap tr...
5,4,180,4_budaya_adat_masyarakat_tradisi,"[budaya, adat, masyarakat, tradisi, suku, iden...","[masyarakat adat, masyarakat suku, budaya loka...",[elaborasi studi antropologi hukum antropoling...
6,5,173,5_marketing_digital marketing_digital_pemasaran,"[marketing, digital marketing, digital, pemasa...","[digital marketing, pemasaran digital, marketi...",[kpopfikasi digital marketing akselerasi visib...
7,6,169,6_pangan_ketahanan pangan_ketahanan_petani,"[pangan, ketahanan pangan, ketahanan, petani, ...","[ketahanan pangan, mitigasi perubahan iklim, p...",[transformasi lumbung pangan desa berketahanan...
8,7,169,7_listrik_energi_pembangkit_surya,"[listrik, energi, pembangkit, surya, tenaga, p...","[pembangkit listrik tenaga, energi terbarukan,...",[optimalisasi energi angin pembangkit listrik ...
9,8,162,8_reality_augmented_augmented reality_virtual,"[reality, augmented, augmented reality, virtua...","[berbasis augmented reality, augmented reality...",[optimalisasi pembelajaran vokasional berbasis...


In [39]:
sw = StopWatch()
sw.start()

bert_topics, probs = topic_model.transform(docs)

sw.stop()
sw.print()

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2025-07-28 07:10:16,367 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-07-28 07:10:16,404 - BERTopic - Dimensionality - Completed ✓
2025-07-28 07:10:16,405 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-07-28 07:10:17,105 - BERTopic - Cluster - Completed ✓


Waktu eksekusi:  11.08 detik.


In [40]:
df_tm['BERT_topic'] = bert_topics

### Evaluating #1

In [41]:
# Tokenisasi dokumen
tokenized_docs = [doc.split() for doc in docs]

dictionary = Dictionary(tokenized_docs)
corpus = [dictionary.doc2bow(text) for text in tokenized_docs]

In [42]:
nmf_topics_words = get_topics_words(nmf_model,tfidf_vectorizer)
lda_topics_words = get_topics_words(lda_model,count_vectorizer)
lsa_topics_words = get_topics_words(lsa_model,tfidf_vectorizer)

In [43]:
bert_topics_words = get_bert_topics_words(topic_model)

In [44]:
sw.start()

coherence_dict = {
    'Model' : [
        'NMF',
        'LDA',
        'LSA',
        'BERTopic',
    ],
    'Coherence':[
        get_coherence(nmf_topics_words),
        get_coherence(lda_topics_words),
        get_coherence(lsa_topics_words),
        get_coherence(bert_topics_words),
    ]
}

sw.stop()
sw.print()

Waktu eksekusi:  11.16 detik.
Waktu eksekusi:  11.04 detik.
Waktu eksekusi:  10.84 detik.
Waktu eksekusi:  11.67 detik.
Waktu eksekusi:  44.71 detik.


In [45]:
coherence_df = pd.DataFrame(coherence_dict)

In [46]:
coherence_df

Unnamed: 0,Model,Coherence
0,NMF,0.651512
1,LDA,0.508846
2,LSA,0.459366
3,BERTopic,0.626531


### Hyperparameter Tuning

In [47]:
optuna.__version__

'4.4.0'

In [48]:
def nmf_objective(trial):
    params = {
        'n_components': trial.suggest_int('n_components', 3, 100)
    }

    model = NMF(**params, random_state=42)
    topics = model.fit_transform(tfidf)

    topics_words = get_topics_words(model, tfidf_vectorizer)
    return get_coherence(topics_words)

def lda_objective(trial):
    params = {
        'n_components': trial.suggest_int('n_components', 3, 30)
    }

    model = LatentDirichletAllocation(**params, random_state=42)
    topics = model.fit_transform(count)

    topics_words = get_topics_words(model, count_vectorizer)
    return get_coherence(topics_words)

def lsa_objective(trial):
    params = {
        'n_components': trial.suggest_int('n_components', 3, 30)
    }

    model = TruncatedSVD(**params, random_state=42)
    topics = model.fit_transform(tfidf)

    topics_words = get_topics_words(model, tfidf_vectorizer)
    return get_coherence(topics_words)

def bert_objective(trial):
    embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
    umap_model = UMAP(n_components=5, n_neighbors=50, random_state=42, metric="cosine", verbose=True)
    hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, prediction_data=True, min_cluster_size=trial.suggest_int('min_cluster_size', 3, 30))
    vectorizer_model = CountVectorizer(ngram_range=(1, 3), min_df=5)

    representation_models = {
        "KeyBERTInspired": KeyBERTInspired()
    }

    topic_model= BERTopic(
            embedding_model=embedding_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            vectorizer_model=vectorizer_model,
            representation_model=representation_models,
            verbose=True
    ).fit(docs)

    bert_topics_words = get_bert_topics_words(topic_model)
    return get_coherence(bert_topics_words)

In [49]:
# sw.start()

# nmf_study = optuna.create_study(direction='maximize')
# nmf_study.optimize(nmf_objective,n_trials=50)

# sw.stop()
# sw.print()

In [50]:
nmf_parmas = {'n_components': 10}

In [51]:
# sw.start()

# lda_study = optuna.create_study(direction='maximize')
# lda_study.optimize(lda_objective,n_trials=20)

# sw.stop()
# sw.print()

In [52]:
lda_params = {'n_components': 13}

In [53]:
# sw.start()

# lsa_study = optuna.create_study(direction='maximize')
# lsa_study.optimize(lsa_objective,n_trials=20)

# sw.stop()
# sw.print()

In [54]:
lsa_params = {'n_components': 4}

In [55]:
# sw.start()

# bert_study = optuna.create_study(direction='maximize')
# bert_study.optimize(bert_objective,n_trials=5)

# sw.stop()
# sw.print()

In [56]:
# bert_params = {'min_cluster_size': 22}

In [57]:
nmf_model = NMF(**nmf_parmas, random_state=42)
nmf_topics = nmf_model.fit_transform(tfidf)

lda_model = LatentDirichletAllocation(**lda_params, random_state=42)
lda_topics = lda_model.fit_transform(count)

lsa_model = TruncatedSVD(**lsa_params, random_state=42)
lsa_topics = lsa_model.fit_transform(tfidf)

In [65]:
# Prepare sub-models
embedding_model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
umap_model = UMAP(n_components=5, n_neighbors=50, random_state=42, metric="cosine", verbose=True)
hdbscan_model = HDBSCAN(min_samples=20, gen_min_span_tree=True, prediction_data=True, min_cluster_size=20)
vectorizer_model = CountVectorizer(ngram_range=(1, 3), min_df=5)

# Representation models
representation_models = {
    "KeyBERTInspired": KeyBERTInspired(),
    "MMR": MaximalMarginalRelevance(diversity=0.3),
    "KeyBERT + MMR": [KeyBERTInspired(), MaximalMarginalRelevance(diversity=0.3)]
}

# Fit BERTopic
sw = StopWatch()
sw.start()

topic_model= BERTopic(
        embedding_model=embedding_model,
        umap_model=umap_model,
        hdbscan_model=hdbscan_model,
        vectorizer_model=vectorizer_model,
        representation_model=representation_models,
        verbose=True
).fit(docs)

sw.stop()
sw.print()

2025-07-28 07:13:50,307 - BERTopic - Embedding - Transforming documents to embeddings.


Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2025-07-28 07:14:00,228 - BERTopic - Embedding - Completed ✓
2025-07-28 07:14:00,229 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm


UMAP(angular_rp_forest=True, metric='cosine', n_components=5, n_jobs=1, n_neighbors=50, random_state=42, verbose=True)
Mon Jul 28 07:14:00 2025 Construct fuzzy simplicial set
Mon Jul 28 07:14:00 2025 Finding Nearest Neighbors
Mon Jul 28 07:14:00 2025 Building RP forest with 11 trees
Mon Jul 28 07:14:00 2025 NN descent for 14 iterations
	 1  /  14
	 2  /  14
	 3  /  14
	Stopping threshold met -- exiting after 3 iterations
Mon Jul 28 07:14:06 2025 Finished Nearest Neighbor Search
Mon Jul 28 07:14:06 2025 Construct embedding


Epochs completed:   0%|            0/200 [00:00]

	completed  0  /  200 epochs
	completed  20  /  200 epochs
	completed  40  /  200 epochs
	completed  60  /  200 epochs
	completed  80  /  200 epochs
	completed  100  /  200 epochs
	completed  120  /  200 epochs
	completed  140  /  200 epochs
	completed  160  /  200 epochs
	completed  180  /  200 epochs
Mon Jul 28 07:14:17 2025 Finished embedding


2025-07-28 07:14:17,404 - BERTopic - Dimensionality - Completed ✓
2025-07-28 07:14:17,406 - BERTopic - Cluster - Start clustering the reduced embeddings
2025-07-28 07:14:17,984 - BERTopic - Cluster - Completed ✓
2025-07-28 07:14:17,990 - BERTopic - Representation - Fine-tuning topics using representation models.
2025-07-28 07:14:28,051 - BERTopic - Representation - Completed ✓


Waktu eksekusi:  38.18 detik.


In [66]:
nmf_topics_words = get_topics_words(nmf_model,tfidf_vectorizer)
lda_topics_words = get_topics_words(lda_model,count_vectorizer)
lsa_topics_words = get_topics_words(lsa_model,tfidf_vectorizer)

In [67]:
bert_topics_words = get_bert_topics_words(topic_model)

In [68]:
get_coherence(bert_topics_words)

Waktu eksekusi:  11.65 detik.


0.6265311567790711

In [69]:
sw.start()

coherence_dict = {
    'Model' : [
        'NMF',
        'LDA',
        'LSA',
        'BERTopic',
    ],
    'Coherence':[
        get_coherence(nmf_topics_words),
        get_coherence(lda_topics_words),
        get_coherence(lsa_topics_words),
        get_coherence(bert_topics_words),
    ]
}

sw.stop()
sw.print()

Waktu eksekusi:  16.42 detik.
Waktu eksekusi:  10.54 detik.
Waktu eksekusi:  10.32 detik.
Waktu eksekusi:  11.15 detik.
Waktu eksekusi:  48.43 detik.


In [70]:
coherence_df = pd.DataFrame(coherence_dict)
coherence_df

Unnamed: 0,Model,Coherence
0,NMF,0.651512
1,LDA,0.57652
2,LSA,0.601848
3,BERTopic,0.626531


## Conclusion

In [80]:
# topic_model.save("BERTopic_Penelitian")

In [81]:
topic_model = BERTopic.load("BERTopic_Penelitian")

Mon Jul 28 07:20:20 2025 Building and compiling search function


In [76]:
def get_top_topics(topics):
    topic_labels = np.argmax(topics,axis=1)
    values,counts = np.unique(topic_labels,return_counts=True)

    sorted_indices = np.argsort(-counts)

    sorted_values = values[sorted_indices]
    sorted_counts = counts[sorted_indices]

    return sorted_values,sorted_counts

In [77]:
def get_sorted_top_keyword(vectorizer,model,sorted_topics):
    componenets = model.components_[sorted_topics]
    feature_names = vectorizer.get_feature_names_out()
    top_n = 10
    keywords_per_topic = [
        [feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]] for topic in componenets
    ]
    return keywords_per_topic

In [78]:
topic_info = topic_model.get_topic_info()

In [82]:
topics,prob = topic_model.transform(docs)

Batches:   0%|          | 0/391 [00:00<?, ?it/s]

2025-07-28 07:20:52,865 - BERTopic - Dimensionality - Reducing dimensionality of input embeddings.
2025-07-28 07:20:52,899 - BERTopic - Dimensionality - Completed ✓
2025-07-28 07:20:52,900 - BERTopic - Clustering - Approximating new points with `hdbscan_model`
2025-07-28 07:20:53,466 - BERTopic - Cluster - Completed ✓


In [84]:
df['BERTopic'] = topics
df = df[df['BERTopic'] != -1]

In [93]:
def summarize_bert(topics_info):
    r_docs = topics_info['Representative_Docs'].to_list()
    top_key = topics_info['KeyBERTInspired'].to_list()
    topic_desc = {}
    for topic_idx, keywords in enumerate(top_key):
        docs_sample = r_docs[topic_idx]
        result = summarize_topic(keywords, docs_sample)
        topic_desc[topic_idx] = result
    return topic_desc

In [102]:
topic_info = topic_info.iloc[1:]

In [104]:
bert_sum = summarize_bert(topic_info)

In [105]:
bert_sum

{0: {'label_topik': 'Penerapan Machine Learning dan Neural Network dalam Sistem Cerdas',
  'deskripsi': 'Topik ini mencakup penelitian dan pengabdian masyarakat yang berfokus pada penggunaan machine learning, neural network, dan artificial intelligence untuk mengembangkan sistem cerdas. Penelitian ini melibatkan algoritma klasifikasi dan optimisasi dalam berbagai aplikasi, termasuk pengklasifikasi bibit kelapa menggunakan deep learning, deteksi dini penyakit melalui sensor IoT, dan sistem otentikasi berbasis suara.'},
 1: {'label_topik': 'Pengembangan Desa Wisata Berbasis Kearifan Lokal',
  'deskripsi': 'Topik ini membahas tentang strategi dan implementasi pengembangan desa wisata dengan memanfaatkan potensi lokal dan kearifan lokal untuk menciptakan destinasi wisata yang berkelanjutan. Fokusnya adalah pada optimalisasi sumber daya lokal, termasuk UMKM dan ekonomi kreatif, serta penerapan teknologi cerdas untuk pengelolaan wisata. Studi kasus yang relevan melibatkan desa wisata seperti

In [99]:
(df['BERTopic']==0).sum()

586

In [1]:
with open("institusi.json", "r", encoding="utf-8") as f:
    data = json.load(f)

FileNotFoundError: [Errno 2] No such file or directory: 'institusi.json'