In [1]:
# import libraries
import pandas as pd
import sys
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
import warnings
import json

# suppress warnings in notebook
warnings.filterwarnings('ignore')

# add the parent directory to sys.path so the `src` package is importable
sys.path.append('../')

from src.models.llm_handler import LLMFactory
from src.models.extraction.bible_progress_extractor import BibleProgressExtractor

# Report which accelerator this kernel can see.
# `torch.cuda.get_device_name(0)` raises when no CUDA device is present, so the
# device name is only queried when CUDA is available; otherwise this cell would
# abort on a CPU-only machine. Output on a CUDA machine is unchanged.
cuda_available = torch.cuda.is_available()
print('CUDA Available:', cuda_available)
print('CUDA Version:', torch.version.cuda)
gpu_name = torch.cuda.get_device_name(0) if cuda_available else 'none (CUDA not available)'
print('GPU:', gpu_name)

  from .autonotebook import tqdm as notebook_tqdm


CUDA Available: True
CUDA Version: 13.0
GPU: NVIDIA GeForce RTX 4070 Laptop GPU


In [2]:
# Read the processed message CSV into a DataFrame, report its dimensions,
# and display the first ten rows as the cell's output.
messages_csv = "../data/processed/cleaned_messages.csv"
df = pd.read_csv(filepath_or_buffer=messages_csv)

print("Dataframe shape:", df.shape)
df.head(n=10)

Dataframe shape: (18949, 3)


Unnamed: 0,sender,message,timestamp
0,"dr. Andreas C.N., Fp.B.",Siap terimakasih sudah diadd di grup üôèüèª,2020-08-02 11:52:41
1,Lenny Pandjidharma,sami2 ...,2020-08-02 11:52:51
2,Lenny Pandjidharma,untuk peraturan group dan tata pelaksanaan ......,2020-08-02 11:53:17
3,Oma Lisa,Thanks Lenny.t Lisa ikut ya spytdk cpt pikun a...,2020-08-02 12:20:41
4,Mfitri,"Ok,makasih ci üôè",2020-08-02 12:22:35
5,Sim Ay Tjan,Thanks Len.üôèüèº,2020-08-02 12:51:17
6,Tjunfebelyana,Thanks Lenüôè,2020-08-02 13:01:32
7,Oma Lisa,Mulak kapan dan jam brp Lenny.Gbu txs,2020-08-02 14:22:31
8,Lenny Pandjidharma,"Dimulainya besok, Tante Lisa.",2020-08-02 14:26:58
9,Oma Lisa,Ok,2020-08-02 14:37:45


In [3]:
# Flatten the `message` column into a plain list of strings.
# Missing values are replaced with "" first: `astype(str)` on its own turns NaN
# into the literal text "nan", which would then be passed on as if it were a
# real message. The list keeps the same length and ordering as `df`, so
# positional slices of `messages` still line up with DataFrame rows.
messages = df["message"].fillna("").astype(str).tolist()

In [4]:
# model_name = "suayptalha/Komodo-7B-Instruct"

# tokenizer = AutoTokenizer.from_pretrained(model_name)
# tokenizer.pad_token = tokenizer.eos_token
# tokenizer.padding_side = "left"

# model = AutoModelForCausalLM.from_pretrained(
#     model_name,
#     device_map="auto",
#     trust_remote_code=True,
#     dtype="auto"
# )

In [5]:
# model.config.pad_token_id = tokenizer.pad_token_id
# model.generation_config.pad_token_id = tokenizer.pad_token_id

# text_gen = pipeline("text-generation", model=model, tokenizer=tokenizer)

In [6]:
# example_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

# ### Instruction:
# {}
# ### Input:
# {}
# ### Response:
# """

In [7]:
# System prompt (in Indonesian) for the extraction task: it asks the model to
# return a JSON array of Bible references found in the text, shows the expected
# object format, and gives few-shot examples, including the empty-array case.
# Each example separates input from expected output with the arrow "→"; the
# separators had been stored as the mis-decoded byte sequence for that arrow,
# so they are restored here. All other prompt text is unchanged.
system_instruction = """
Ekstrak referensi Alkitab dari teks. Output JSON array saja, tanpa penjelasan.

Format: [{"book_text":"...","start_chapter":N,"end_chapter":N,"raw_text":"...","confidence":1.0,"source":"llm"}]

Contoh:
"Kej 3-5" → [{"book_text":"Kej","start_chapter":3,"end_chapter":5,"raw_text":"Kej 3-5","confidence":1.0,"source":"llm"}]

"1 Korintus 12 sampai 14" → [{"book_text":"1 Korintus","start_chapter":12,"end_chapter":14,"raw_text":"1 Korintus 12 sampai 14","confidence":1.0,"source":"llm"}]

"Wahyu 19-20 done, Wahyu 21-22 done" → [
{"book_text":"Wahyu","start_chapter":19,"end_chapter":20,"raw_text":"Wahyu 19-20","confidence":1.0,"source":"llm"},
{"book_text":"Wahyu","start_chapter":21,"end_chapter":22,"raw_text":"Wahyu 21-22","confidence":1.0,"source":"llm"}
]

Tidak ada referensi → []
"""

In [8]:
# message_subset = messages[30:50]

# outputs = []
# # Example: simplified prompt
# for user_text in message_subset:
#     # Combine instructions + input directly
#     prompt = example_prompt.format(
#         system_instruction,
#         user_text,
#         ""
#     )
    
#     raw_out = text_gen(
#         prompt,
#         return_full_text=False,
#         max_new_tokens=256,
#         do_sample=False,
#         repetition_penalty=1.1,
#     )

#     outputs.append(raw_out[0]["generated_text"])    

# # %%
# # Step 7: Show results
# for inp, out in zip(message_subset, outputs):
#     print(f"Input: {inp}")
#     print(f"Output: {out}\n")

In [9]:
# Ask the project's LLM factory for the handler registered under 'komodo'.
# NOTE(review): presumably this loads the Komodo-7B-Instruct model referenced in
# the commented-out cells above — confirm in LLMFactory.
model_key = 'komodo'
handler = LLMFactory.create_handler(model_key)

Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:09<00:00,  4.79s/it]
Device set to use cuda:0


In [10]:
# Smoke-test the extraction prompt on a 20-message slice before a larger run.
test_messages = messages[30:50]

extraction_request = {
    "prompts": test_messages,
    "system_message": system_instruction,
    "mode": 'extraction',
}
test_outputs = handler.generate_batch(**extraction_request)

# Show the first five input/output pairs for a manual quality check.
preview_pairs = zip(test_messages[:5], test_outputs[:5])
for message_text, model_output in preview_pairs:
    print(f"Input: {message_text}")
    print(f"Output: {model_output}\n")

Processing batches:   0%|          | 0/20 [00:00<?, ?msg/s]The following generation flags are not valid and may be ignored: ['temperature', 'top_p', 'top_k']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Processing batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [01:12<00:00,  3.61s/msg]

Input: Kej 1-2 done
Output: [{"book_text":"Kej","start_chapter":1,"end_chapter":2,"raw_text":"Kej 1-2","confidence":1.0,"source":"llm"}]

Input: Kej 1-2 done
Output: [{"book_text":"Kej","start_chapter":1,"end_chapter":2,"raw_text":"Kej 1-2","confidence":1.0,"source":"llm"}]

Input: Kej 1-2 done
Output: [{"book_text":"Kej","start_chapter":1,"end_chapter":2,"raw_text":"Kej 1-2","confidence":1.0,"source":"llm"}]

Input: Kej 1-2 done
Output: [{"book_text":"Kej","start_chapter":1,"end_chapter":2,"raw_text":"Kej 1-2","confidence":1.0,"source":"llm"}]

Input: Kej 1- 2 selesai.üôè
Output: [{"book_text":"Kej","start_chapter":1,"end_chapter":2,"raw_text":"Kej 1- 2 selesai.üôè","confidence":1.0,"source":"llm"}]






In [15]:
# Show the Python type of the first extraction output.
first_output = test_outputs[0]
print(type(first_output))

<class 'str'>


In [18]:
# System prompt (in Indonesian) for the classification task: it asks the model
# to decide whether a message is a Bible-reading progress report and to answer
# with a single JSON object holding `is_progress_report` and `confidence`.
# Each few-shot example separates input from expected output with the arrow
# "→"; the separators had been stored as the mis-decoded byte sequence for that
# arrow, so they are restored here. All other prompt text is unchanged.
system_instruction_2 = """
Tentukan apakah pesan berisi laporan bacaan Alkitab atau bukan. Output JSON object saja, tanpa penjelasan.

Format: {"is_progress_report":true/false,"confidence":0.0-1.0}

Laporan bacaan = menyebutkan kitab/pasal Alkitab yang sudah/akan dibaca

Contoh:
"Kej 1-3 done" → {"is_progress_report":true,"confidence":1.0}
"Wahyu 21-22 selesai dibaca" → {"is_progress_report":true,"confidence":1.0}
"Apa arti Yohanes 3:16?" → {"is_progress_report":false,"confidence":0.95}
"Halo apa kabar?" → {"is_progress_report":false,"confidence":1.0}
"Hari ini belum baca" → {"is_progress_report":false,"confidence":0.9}
"""

In [19]:
# Run the progress-report classification prompt over the same sample slice.
# NOTE(review): mode='extraction' is reused for this classification prompt —
# confirm the handler has no dedicated mode for it.
classification_request = {
    "prompts": test_messages,
    "system_message": system_instruction_2,
    "mode": 'extraction',
}
test_outputs_2 = handler.generate_batch(**classification_request)

# Print every input next to the model's verdict for a manual quality check.
for message_text, verdict in zip(test_messages, test_outputs_2):
    print(f"Input: {message_text}")
    print(f"Output: {verdict}\n")

Processing batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 20/20 [01:01<00:00,  3.09s/msg]

Input: Kej 1-2 done
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej 1-2 done
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej 1-2 done
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej 1-2 done
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej 1- 2 selesai.üôè
Output: {"is_progress_report":true,"confidence":0.9}

Input: Kej 1-2 done
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej 1-2 done
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej 1-2 selesai
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej1-2 done
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej 1-2 done
Output: {"is_progress_report":true,"confidence":1.0}

Input: _Kej 1-2_ ‚úì
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej 1-2‚úì
Output: {"is_progress_report":true,"confidence":1.0}

Input: Kej 1-2 done
Output: {"is_progress_report":true,"confidence":1.0}

Input: https://youtu.be/6t3I


