In [1]:
# Import libraries
from pathlib import Path
import pandas as pd
import warnings
import json
import sys

# suppress warnings in notebook
warnings.filterwarnings('ignore')

# add src to path
sys.path.append(str(Path('../').resolve()))

from annotation.bible_reference_annotator import BibleReferenceAnnotator

In [2]:

df = pd.read_csv('..\data\processed\cleaned_messages.csv')

print(f'Total messages: {len(df)}')
print(f'\nDataFrame shape: {df.shape}')
print(f'\nColumns: {df.columns.tolist()}')

Total messages: 17191

DataFrame shape: (17191, 3)

Columns: ['timestamp', 'sender', 'message']


In [3]:
df.head(15)

Unnamed: 0,timestamp,sender,message
0,2020-08-02 11:52:41,"dr. Andreas C.N., Fp.B.",Siap terimakasih sudah diadd di grup üôèüèª
1,2020-08-02 11:52:51,Lenny Pandjidharma,sami2 ...
2,2020-08-02 11:53:17,Lenny Pandjidharma,untuk peraturan group dan tata pelaksanaan ......
3,2020-08-02 12:20:41,Oma Lisa,Thanks Lenny.t Lisa ikut ya spytdk cpt pikun a...
4,2020-08-02 12:22:35,Mfitri,"Ok,makasih ci üôè"
5,2020-08-02 12:51:17,Sim Ay Tjan,Thanks Len.üôèüèº
6,2020-08-02 13:01:32,Tjunfebelyana,Thanks Lenüôè
7,2020-08-02 14:22:31,Oma Lisa,Mulak kapan dan jam brp Lenny.Gbu txs
8,2020-08-02 14:26:58,Lenny Pandjidharma,"Dimulainya besok, Tante Lisa."
9,2020-08-02 15:03:11,üéç,"Baik, tks. Gbuüôè"


In [4]:
with open("../data/bible_references.json", "r", encoding="utf-8") as f:
    bible_books = json.load(f)

annotator = BibleReferenceAnnotator(bible_books)

annotated_df = annotator.annotate_dataframe(df, text_column='message')

In [5]:
annotated_df.head(15)

Unnamed: 0,timestamp,sender,message,bible_references,bible_ref_count,ner_spans,labels
0,2020-08-02 11:52:41,"dr. Andreas C.N., Fp.B.",Siap terimakasih sudah diadd di grup üôèüèª,[],0,[],False
1,2020-08-02 11:52:51,Lenny Pandjidharma,sami2 ...,[],0,[],False
2,2020-08-02 11:53:17,Lenny Pandjidharma,untuk peraturan group dan tata pelaksanaan ......,[],0,[],False
3,2020-08-02 12:20:41,Oma Lisa,Thanks Lenny.t Lisa ikut ya spytdk cpt pikun a...,[],0,[],False
4,2020-08-02 12:22:35,Mfitri,"Ok,makasih ci üôè",[],0,[],False
5,2020-08-02 12:51:17,Sim Ay Tjan,Thanks Len.üôèüèº,[],0,[],False
6,2020-08-02 13:01:32,Tjunfebelyana,Thanks Lenüôè,[],0,[],False
7,2020-08-02 14:22:31,Oma Lisa,Mulak kapan dan jam brp Lenny.Gbu txs,[],0,[],False
8,2020-08-02 14:26:58,Lenny Pandjidharma,"Dimulainya besok, Tante Lisa.",[],0,[],False
9,2020-08-02 15:03:11,üéç,"Baik, tks. Gbuüôè",[],0,[],False


In [10]:
annotated_df['bible_ref_count'].value_counts()

bible_ref_count
2     13958
0      1022
4       978
3       724
1       247
6       106
8        59
5        30
10       22
7        15
12        8
11        7
9         7
14        4
21        1
20        1
18        1
17        1
Name: count, dtype: int64

In [6]:
annotated_df[annotated_df["bible_ref_count"] == 0]

Unnamed: 0,timestamp,sender,message,bible_references,bible_ref_count,ner_spans,labels
0,2020-08-02 11:52:41,"dr. Andreas C.N., Fp.B.",Siap terimakasih sudah diadd di grup üôèüèª,[],0,[],False
1,2020-08-02 11:52:51,Lenny Pandjidharma,sami2 ...,[],0,[],False
2,2020-08-02 11:53:17,Lenny Pandjidharma,untuk peraturan group dan tata pelaksanaan ......,[],0,[],False
3,2020-08-02 12:20:41,Oma Lisa,Thanks Lenny.t Lisa ikut ya spytdk cpt pikun a...,[],0,[],False
4,2020-08-02 12:22:35,Mfitri,"Ok,makasih ci üôè",[],0,[],False
...,...,...,...,...,...,...,...
17185,2022-06-23 09:18:28,susianawati309,Baik,[],0,[],False
17187,2022-07-01 15:14:59,Vik. Dessy Waiman,"Selamta Sore bpk ibu, krn Group BRC3 sdh dibua...",[],0,[],False
17188,2022-07-01 15:18:49,Andrie HG,~ Andrie HG ditambahkan,[],0,[],False
17189,2022-07-01 15:15:16,Andrie HG,Terima kasih bu dessy üôèüèΩüôèüèΩüôèüèΩ,[],0,[],False


In [7]:
annotated_df[annotated_df["bible_ref_count"] == 1]

Unnamed: 0,timestamp,sender,message,bible_references,bible_ref_count,ner_spans,labels
15,2020-08-02 21:39:18,Vik. Dessy Waiman,1 Raja dan 2 Raja,"[{'book_start': 'dan', 'start_chapter': 2, 'bo...",1,"[{'start': 7, 'end': 10, 'label': 'BOOK', 'tex...",True
18,2020-08-03 03:48:55,Melanie Chandra,Kej 1-2 done,"[{'book_start': 'Kej', 'start_chapter': 1, 'bo...",1,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text...",True
19,2020-08-03 04:03:51,Lindawati Haryanto,Kej 1-2 done,"[{'book_start': 'Kej', 'start_chapter': 1, 'bo...",1,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text...",True
20,2020-08-03 04:08:44,Sherly Cahyadi,Kej 1-2 done,"[{'book_start': 'Kej', 'start_chapter': 1, 'bo...",1,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text...",True
21,2020-08-03 04:32:19,Seto Ninik,Kej 1-2 done,"[{'book_start': 'Kej', 'start_chapter': 1, 'bo...",1,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text...",True
...,...,...,...,...,...,...,...
17142,2022-06-21 23:23:42,Andrie HG,Wahyu 21 - 22 selesai,"[{'book_start': 'Wahyu', 'start_chapter': 21, ...",1,"[{'start': 0, 'end': 5, 'label': 'BOOK', 'text...",True
17160,2022-06-22 00:39:26,"dr. Andreas C.N., Sp.B.",Wahyu 21 - 22 Done üòä\nTerimakasih Bu Dessy unt...,"[{'book_start': 'Wahyu', 'start_chapter': 21, ...",1,"[{'start': 0, 'end': 5, 'label': 'BOOK', 'text...",True
17164,2022-06-22 08:18:52,Yozef Tjandra,Wahyu 19-22 done,"[{'book_start': 'Wahyu', 'start_chapter': 19, ...",1,"[{'start': 0, 'end': 5, 'label': 'BOOK', 'text...",True
17169,2022-06-22 11:15:14,Darius Handoko,Kejadian 1-2 done,"[{'book_start': 'Kejadian', 'start_chapter': 1...",1,"[{'start': 0, 'end': 8, 'label': 'BOOK', 'text...",True


In [13]:
annotated_df[annotated_df["bible_ref_count"] > 2]

Unnamed: 0,timestamp,sender,message,bible_references,bible_ref_count,intent,ner_spans
104,2020-08-04 15:21:49,Tjunfebelyana,Kej 3-4 selesai.\nJojo Kej 3-4 selesai,"[{'book_start': 'Kej', 'start_chapter': 3, 'bo...",4,PROGRESS_REPORT,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text..."
122,2020-08-04 21:11:48,Tan,Kej 1 - 2 ‚úÖ\nKej 3 - 4 ‚úÖ,"[{'book_start': 'Kej', 'start_chapter': 1, 'bo...",4,PROGRESS_REPORT,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text..."
159,2020-08-05 15:57:58,Tjunfebelyana,Kej 5-6 selesai\nJojo kej 5-6 selesai,"[{'book_start': 'Kej', 'start_chapter': 5, 'bo...",4,PROGRESS_REPORT,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text..."
176,2020-08-05 21:12:35,Yoshua,Kej 1 - 2 ‚úÖ\nKej 3 - 4 ‚úÖ,"[{'book_start': 'Kej', 'start_chapter': 1, 'bo...",4,PROGRESS_REPORT,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text..."
248,2020-08-06 21:05:59,Tjunfebelyana,Kej 7-8 done\nJojo kej 7-8 done,"[{'book_start': 'Kej', 'start_chapter': 7, 'bo...",4,PROGRESS_REPORT,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text..."
...,...,...,...,...,...,...,...
16870,2022-06-13 18:05:52,BL katering,1yoh 1-5 done\n2yoh 1 done\n3yoh 1 done,"[{'book_start': '1yoh', 'start_chapter': 1, 'b...",4,PROGRESS_REPORT,"[{'start': 0, 'end': 4, 'label': 'BOOK', 'text..."
16882,2022-06-14 04:45:54,BL katering,Yudas 1 done\nWahyu 1-10 done,"[{'book_start': 'Wahyu', 'start_chapter': 1, '...",3,PROGRESS_REPORT,"[{'start': 13, 'end': 18, 'label': 'BOOK', 'te..."
16998,2022-06-17 21:30:07,Tjunfebelyana,Jojo 1 Yoh 4-Why 12 done,"[{'book_start': '1 Yoh', 'start_chapter': 4, '...",3,PROGRESS_REPORT,"[{'start': 5, 'end': 10, 'label': 'BOOK', 'tex..."
17051,2022-06-20 07:22:13,Helen Fransiska Margarita,Wahyu 19-20 done\nWahyu 21- 22 FINISH ü•∞üôè,"[{'book_start': 'Wahyu', 'start_chapter': 19, ...",4,PROGRESS_REPORT,"[{'start': 0, 'end': 5, 'label': 'BOOK', 'text..."


### Intent Classification

In [None]:
intent_df = annotated_df[['message', 'labels']]

In [None]:
print(f'\nintent_df Columns: {intent_df.columns.tolist()}')


intent_df Columns: ['message', 'labels']


In [None]:
intent_df.to_csv('..\data\processed\intent_messages_labeled.csv', index=False, encoding="utf-8-sig")

### NER

In [18]:
# Split into single-ref and multi-ref groups
single_ref_df = annotated_df[annotated_df["bible_ref_count"] == 1]
multi_ref_df  = annotated_df[annotated_df["bible_ref_count"] >= 2]

# Sample 750 from each group
single_sample = single_ref_df.sample(n=750, random_state=42)
multi_sample  = multi_ref_df.sample(n=750, random_state=42)

# Combine into one dataset
balanced_1500 = pd.concat([single_sample, multi_sample])

# Shuffle final result
balanced_1500 = balanced_1500.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep only needed columns for NER
ner_df = balanced_1500[["message", "ner_spans"]]

print(len(ner_df))


1500


In [21]:
import json

def to_label_studio_format(df):
    tasks = []

    for _, row in df.iterrows():
        text = row["message"]
        spans = row["ner_spans"]

        results = []
        for span in spans:
            results.append({
                "from_name": "label",   # must match your Label Studio config
                "to_name": "text",      # must match your Label Studio config
                "type": "labels",
                "value": {
                    "start": span["start"],
                    "end": span["end"],
                    "text": span["text"],
                    "labels": [span["label"]]
                }
            })

        task = {
            "data": {
                "text": text
            },
            "predictions": [
                {
                    "result": results
                }
            ]
        }

        tasks.append(task)

    return tasks


# Convert ner_df ‚Üí Label Studio JSON tasks
label_studio_tasks = to_label_studio_format(ner_df)

# Save into file
with open("../data/processed/label_studio_ner.json", "w", encoding="utf-8") as f:
    json.dump(label_studio_tasks, f, ensure_ascii=False, indent=2)

print("Saved: label_studio_ner.json")


Saved: label_studio_ner.json


### LLM Finetuning

In [None]:
finetuning_df = annotated_df[['message', 'bible_references', 'bible_ref_count']]

In [None]:
finetuning_df.rename(columns={
    "message" : "input",
    "bible_references" : "response"}, inplace=True)

In [None]:
print(f'finetuning_df Columns: {finetuning_df.columns.tolist()}')

inetuning_df Columns: ['input', 'response', 'bible_ref_count']


In [None]:
def make_strata(x):
    if x == 0:
        return "zero"
    elif x == 1:
        return "one"
    else:
        return "more_than_one"
    
finetuning_df["strata"] = finetuning_df["bible_ref_count"].apply(make_strata)

In [None]:
strata_counts = finetuning_df["strata"].value_counts(normalize=True)
strata_counts

strata
one              0.803212
zero             0.126502
more_than_one    0.070285
Name: proportion, dtype: float64

In [None]:
train_samples = []
test_samples = []

train_size = 1000
test_size = 250

for strata, proporation in strata_counts.items():
    strata_df = finetuning_df[finetuning_df["strata"] == strata].sample(frac=1, random_state=42)

    n_train = round(proporation * train_size)
    n_test = round(proporation * test_size)

    train_samples.append(strata_df.iloc[:n_train])
    test_samples.append(strata_df.iloc[n_train:n_train+n_test])


train_df = pd.concat(train_samples).sample(frac=1, random_state=42).reset_index(drop=True)
test_df  = pd.concat(test_samples).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
test_df

Unnamed: 0,input,response,bible_ref_count,strata
0,Kej 17 - 20 done,"[{'book_start': 'Kej', 'start_chapter': 17, 'b...",1,one
1,Maz 27-28 done,"[{'book_start': 'Maz', 'start_chapter': 27, 'b...",1,one
2,Maz 96-97 done,"[{'book_start': 'Maz', 'start_chapter': 96, 'b...",1,one
3,Filipi 1-2 done,"[{'book_start': 'Filipi', 'start_chapter': 1, ...",1,one
4,2 sam 1-1raja-raja 4 done,"[{'book_start': '2 sam', 'start_chapter': 1, '...",1,one
...,...,...,...,...
246,Yes 23-24 done,"[{'book_start': 'Yes', 'start_chapter': 23, 'b...",1,one
247,Kis 11-22 done,"[{'book_start': 'Kis', 'start_chapter': 11, 'b...",1,one
248,2 Raj 8 - 11 done,"[{'book_start': '2 Raj', 'start_chapter': 8, '...",1,one
249,Bil 12-13 done,"[{'book_start': 'Bil', 'start_chapter': 12, 'b...",1,one


In [None]:
instruction_text = (
    "Ekstrak semua referensi Alkitab dari teks berikut. "
    "Keluarkan hanya JSON array tanpa penjelasan. "
    "Jika tidak ada referensi, keluarkan array kosong."
)

train_df["instruction"] = instruction_text

train_df = train_df[["instruction", "input", "response"]]
test_df = test_df[["input", "response"]]

In [None]:
test_df

Unnamed: 0,input,response
0,Kej 17 - 20 done,"[{'book_start': 'Kej', 'start_chapter': 17, 'b..."
1,Maz 27-28 done,"[{'book_start': 'Maz', 'start_chapter': 27, 'b..."
2,Maz 96-97 done,"[{'book_start': 'Maz', 'start_chapter': 96, 'b..."
3,Filipi 1-2 done,"[{'book_start': 'Filipi', 'start_chapter': 1, ..."
4,2 sam 1-1raja-raja 4 done,"[{'book_start': '2 sam', 'start_chapter': 1, '..."
...,...,...
246,Yes 23-24 done,"[{'book_start': 'Yes', 'start_chapter': 23, 'b..."
247,Kis 11-22 done,"[{'book_start': 'Kis', 'start_chapter': 11, 'b..."
248,2 Raj 8 - 11 done,"[{'book_start': '2 Raj', 'start_chapter': 8, '..."
249,Bil 12-13 done,"[{'book_start': 'Bil', 'start_chapter': 12, 'b..."


In [None]:
train_df.to_json("../data/processed/finetuning/bible_ref_train.jsonl", 
                 orient="records", lines=True, force_ascii=False)
test_df.to_json("../data/processed/bible_ref_test.jsonl", 
                orient="records", lines=True, force_ascii=False)