# 📖 Bible Reading Progress Tracker — Data Preparation

**Purpose**  
This notebook prepares cleaned and structured datasets for text classification, named entity recognition (NER), and entity extraction tasks. The output will be model-ready datasets for downstream training and evaluation.

In [1]:
# Import libraries
import pandas as pd
import warnings
import ast
import json
import sys
from pathlib import Path
from sklearn.model_selection import train_test_split

# Suppress warnings in the notebook output
warnings.filterwarnings('ignore')

sys.path.append(str(Path('..').resolve() / 'src'))


from utils.text_cleaner import normalize_dashes
from utils import BibleDataLoader, BibleReferenceAnnotator

---
## 2. Text Classification Dataset Preparation

In [None]:
# Load the cleaned messages that feed the classification split.
# The path is written with forward slashes and wrapped in Path: a backslash
# path only resolves on Windows, and '\d', '\p', '\c' are not valid string
# escape sequences in Python.
df_cls = pd.read_csv(Path('../data/processed/cleaned_messages.csv'))

# Quick sanity summary of what was loaded.
print(f'messages parsed : {len(df_cls)}')
print(f'Shape           : {df_cls.shape}')
print(f'Columns         : {df_cls.columns.tolist()}')

In [None]:
# Column dtypes, non-null counts and memory footprint of the loaded frame.
df_cls.info()

<class 'pandas.DataFrame'>
RangeIndex: 19142 entries, 0 to 19141
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   timestamp             19142 non-null  str  
 1   sender                19142 non-null  str  
 2   message               19142 non-null  str  
 3   primary_label         19142 non-null  str  
 4   additional_labels     19142 non-null  str  
 5   has_progress_keyword  19142 non-null  bool 
 6   bible_references      19142 non-null  str  
 7   ner_spans             19142 non-null  str  
 8   bible_ref_count       19142 non-null  int64
 9   likely_progress       19142 non-null  bool 
dtypes: bool(2), int64(1), str(7)
memory usage: 9.6 MB


In [None]:
# Number of rows per distinct `primary_label` value.
df_cls['primary_label'].value_counts()

primary_label
USER      18679
SYSTEM      463
Name: count, dtype: int64

In [None]:
# Define target labels.
# `pop` removes `likely_progress` from the frame in place and hands the column
# back, so the boolean flag is replaced by its 0/1 integer form in `target`.
# Like any in-place column removal this cell cannot be run twice in a row.
df_cls['target'] = df_cls.pop('likely_progress').astype(int)

In [None]:
# Features are the message text; labels are the 0/1 target.
X, y = df_cls['message'], df_cls['target']

# First cut: hold out 20% of the rows, stratified on the label.
X_train, X_temp, y_train, y_temp = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Second cut: split the held-out rows in half into validation and test,
# again stratified on the label.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    test_size=0.5,
    random_state=42,
    stratify=y_temp,
)

print(f'Train      : {len(X_train)}')
print(f'Validation : {len(X_val)}')
print(f'Test       : {len(X_test)}')

Train      : 15313
Validation : 1914
Test       : 1915


In [None]:
# Save Classification Dataset

OUT_DIR = Path('../data/processed/text classification')
# Create the output folder (and any missing parents) before writing, so the
# cell also works when the folder does not exist yet.
OUT_DIR.mkdir(parents=True, exist_ok=True)

# One two-column frame (text, label) per split.
train_cls = pd.DataFrame({'text': X_train, 'label': y_train})
val_cls = pd.DataFrame({'text': X_val, 'label': y_val})
test_cls = pd.DataFrame({'text': X_test, 'label': y_test})

train_path = OUT_DIR / 'train_classification.csv'
val_path = OUT_DIR / 'val_classification.csv'
test_path = OUT_DIR / 'test_classification.csv'

# The 'utf-8-sig' codec prepends a UTF-8 byte-order mark to each file.
train_cls.to_csv(train_path, index=False, encoding='utf-8-sig')
val_cls.to_csv(val_path, index=False, encoding='utf-8-sig')
test_cls.to_csv(test_path, index=False, encoding='utf-8-sig')

print(f"Saved train: {len(train_cls):,} â†’ {train_path}")
print(f"Saved val:   {len(val_cls):,} â†’ {val_path}")
print(f"Saved test:  {len(test_cls):,} â†’ {test_path}")

Saved train: 15,313 â†’ ..\data\processed\text classification\train_classification.csv
Saved val:   1,914 â†’ ..\data\processed\text classification\val_classification.csv
Saved test:  1,915 â†’ ..\data\processed\text classification\test_classification.csv


---
## 3. NER Dataset Preparation

In [2]:
# Read the likely-progress messages and keep only the raw message text.
ner_df = pd.read_csv('../data/processed/likely_progress_messages.csv')[['message']]

# Quick sanity summary of what was loaded.
print(f'messages parsed : {len(ner_df)}')
print(f'Shape           : {ner_df.shape}')
print(f'Columns         : {ner_df.columns.tolist()}')

messages parsed : 15745
Shape           : (15745, 1)
Columns         : ['message']


In [3]:
# Run every message through `normalize_dashes` before annotating for NER.
ner_df['message'] = ner_df['message'].map(normalize_dashes)

In [4]:
# Build the annotator from the loader's default book data.
loader = BibleDataLoader()
bible_books = loader.load_default()
annotator = BibleReferenceAnnotator(bible_books)

# Annotate the messages; the returned frame is read below through its
# `bible_ref_count` column.
ner_df = annotator.annotate_dataframe(ner_df)

# Count the rows that carry at least one reference and report the share.
has_reference = ner_df["bible_ref_count"] > 0
total_refs = has_reference.sum()
print(f'Messages with Bible references : {total_refs:,} ({total_refs / len(ner_df) * 100:.1f}%)')

Messages with Bible references : 15,745 (100.0%)


In [5]:
# Rows for which no Bible reference was counted.
ner_df[ner_df["bible_ref_count"] == 0]

Unnamed: 0,message,bible_references,ner_spans,bible_ref_count


In [11]:
# Spot check: rows whose message matches "Ul 1" (`str.contains` treats the
# pattern as a regular expression by default).
ner_df[ner_df["message"].str.contains("Ul 1")]

Unnamed: 0,message,ner_spans
37,"Ul 14 - 15 done Anin,Ul 14 - 15 done","[{'start': 0, 'end': 2, 'label': 'BOOK', 'text..."
44,"Ul 18 - 19 done Anin,Ul 18 - 19 done","[{'start': 0, 'end': 2, 'label': 'BOOK', 'text..."
186,Bil 36 - Ul 1 done,"[{'start': 0, 'end': 3, 'label': 'BOOK', 'text..."


In [7]:
# Number of messages drawn from each group for the annotation sample.
N_PER_GROUP = 100

# Split into single-reference and multi-reference groups.
single_ref_df = ner_df[ner_df["bible_ref_count"] == 1]
multi_ref_df = ner_df[ner_df["bible_ref_count"] >= 2]

# Draw the same number of rows from each group (fixed seed for repeatability).
single_sample = single_ref_df.sample(n=N_PER_GROUP, random_state=42)
multi_sample = multi_ref_df.sample(n=N_PER_GROUP, random_state=42)

# Combine into one dataset, then shuffle so the two groups are interleaved.
balanced_200 = pd.concat([single_sample, multi_sample])
balanced_200 = balanced_200.sample(frac=1, random_state=42).reset_index(drop=True)

# Keep only the columns needed for NER. `.copy()` gives an independent frame,
# so assigning into one of its columns afterwards is not a write through a
# slice of `balanced_200`.
# NOTE: this rebinds `ner_df` to the sample, which no longer has
# `bible_ref_count`; re-running this cell requires re-running the annotation
# cell first.
ner_df = balanced_200[["message", "ner_spans"]].copy()

print(len(ner_df))

200


In [8]:
def _parse_spans(raw):
    """Evaluate a span list stored as a string literal; pass anything else through."""
    if isinstance(raw, str):
        return ast.literal_eval(raw)
    return raw


ner_df['ner_spans'] = ner_df['ner_spans'].apply(_parse_spans)

In [9]:
def to_label_studio_format(df):
    """Turn annotated messages into a list of Label Studio task dicts.

    Args:
        df: DataFrame with a "message" column (the task text) and a
            "ner_spans" column holding, per row, an iterable of dicts with
            "start", "end", "text" and "label" keys.

    Returns:
        list[dict]: one task per row. Each task stores the message under
        ``data.text`` and a single ``predictions`` entry whose ``result``
        contains one labelled region per span.
    """
    def as_region(span):
        # One labelled region, bound to the "label" / "text" names.
        return {
            "from_name": "label",
            "to_name": "text",
            "type": "labels",
            "value": {
                "start": span["start"],
                "end": span["end"],
                "text": span["text"],
                "labels": [span["label"]],
            },
        }

    tasks = []
    for _, record in df.iterrows():
        message = record["message"]
        regions = [as_region(span) for span in record["ner_spans"]]
        tasks.append({
            "data": {"text": message},
            "predictions": [{"result": regions}],
        })
    return tasks


# Convert ner_df into Label Studio JSON tasks.
label_studio_tasks = to_label_studio_format(ner_df)

# Create the target folder (and any missing parents) before writing, so the
# cell also works when the folder does not exist yet.
ner_tasks_path = Path("../data/processed/NER_tasks/label_studio_ner.json")
ner_tasks_path.parent.mkdir(parents=True, exist_ok=True)

# ensure_ascii=False keeps non-ASCII characters as-is instead of \u escapes.
with open(ner_tasks_path, "w", encoding="utf-8") as f:
    json.dump(label_studio_tasks, f, ensure_ascii=False, indent=2)

print("Saved: label_studio_ner.json")


Saved: label_studio_ner.json


## 4. Entity Extraction Dataset Preparation (LLM Fine-tuning)

In [None]:
# Read the likely-progress messages with all of their columns.
entity_csv = '../data/processed/likely_progress_messages.csv'
df_entity = pd.read_csv(entity_csv)

# Quick sanity summary of what was loaded.
print(f'messages parsed : {len(df_entity)}')
print(f'Shape           : {df_entity.shape}')
print(f'Columns         : {df_entity.columns.tolist()}')

In [None]:
# Build the fine-tuning frame from the loaded messages so this cell does not
# depend on a `finetuning_df` left over from an earlier kernel session.
# Only the message text, its extracted references and the reference count are
# kept; the first two are renamed to the input/response pair.
# NOTE(review): assumes `df_entity` carries `bible_references` and
# `bible_ref_count` columns — confirm against the CSV.
finetuning_df = (
    df_entity[["message", "bible_references", "bible_ref_count"]]
    .rename(columns={
        "message": "input",
        "bible_references": "response",
    })
)

In [None]:
# Confirm the column names after the rename.
print(f'finetuning_df Columns: {finetuning_df.columns.tolist()}')

finetuning_df Columns: ['input', 'response', 'bible_ref_count']


In [None]:
def make_strata(x):
    """Bucket a reference count into a stratum name.

    Args:
        x: the count to bucket.

    Returns:
        str: "zero" when ``x == 0``, "one" when ``x == 1``, and
        "more_than_one" for every other value.
    """
    if x == 0:
        return "zero"
    return "one" if x == 1 else "more_than_one"
    
# Label every row with the stratum derived from its reference count.
finetuning_df["strata"] = finetuning_df["bible_ref_count"].map(make_strata)

In [None]:
# Share of rows in each stratum (normalize=True returns proportions instead
# of raw counts); used to size the per-stratum samples.
strata_counts = finetuning_df["strata"].value_counts(normalize=True)
strata_counts

strata
one              0.803212
zero             0.126502
more_than_one    0.070285
Name: proportion, dtype: float64

In [None]:
# Target sizes of the two fine-tuning sets.
train_size = 1000
test_size = 250

train_samples, test_samples = [], []

for stratum, share in strata_counts.items():
    # Shuffle this stratum's rows with a fixed seed before slicing.
    in_stratum = finetuning_df["strata"] == stratum
    shuffled = finetuning_df[in_stratum].sample(frac=1, random_state=42)

    # Each stratum contributes rows in proportion to its share of the data.
    n_train = round(share * train_size)
    n_test = round(share * test_size)

    # Consecutive, non-overlapping slices: train rows first, test rows after.
    train_samples.append(shuffled.iloc[:n_train])
    test_samples.append(shuffled.iloc[n_train:n_train + n_test])


# Stitch the strata together and shuffle once more so they are interleaved.
train_df = pd.concat(train_samples).sample(frac=1, random_state=42).reset_index(drop=True)
test_df = pd.concat(test_samples).sample(frac=1, random_state=42).reset_index(drop=True)

In [None]:
# Inspect the sampled test set (still carries the count and strata columns).
test_df

Unnamed: 0,input,response,bible_ref_count,strata
0,Kej 17 - 20 done,"[{'book_start': 'Kej', 'start_chapter': 17, 'b...",1,one
1,Maz 27-28 done,"[{'book_start': 'Maz', 'start_chapter': 27, 'b...",1,one
2,Maz 96-97 done,"[{'book_start': 'Maz', 'start_chapter': 96, 'b...",1,one
3,Filipi 1-2 done,"[{'book_start': 'Filipi', 'start_chapter': 1, ...",1,one
4,2 sam 1-1raja-raja 4 done,"[{'book_start': '2 sam', 'start_chapter': 1, '...",1,one
...,...,...,...,...
246,Yes 23-24 done,"[{'book_start': 'Yes', 'start_chapter': 23, 'b...",1,one
247,Kis 11-22 done,"[{'book_start': 'Kis', 'start_chapter': 11, 'b...",1,one
248,2 Raj 8 - 11 done,"[{'book_start': '2 Raj', 'start_chapter': 8, '...",1,one
249,Bil 12-13 done,"[{'book_start': 'Bil', 'start_chapter': 12, 'b...",1,one


In [None]:
# Fixed prompt attached to every training example. In English: "Extract all
# Bible references from the following text. Output only a JSON array without
# explanation. If there are no references, output an empty array."
instruction_text = (
    "Ekstrak semua referensi Alkitab dari teks berikut. "
    "Keluarkan hanya JSON array tanpa penjelasan. "
    "Jika tidak ada referensi, keluarkan array kosong."
)

# Only the training frame receives the instruction column.
train_df["instruction"] = instruction_text

# Reduce both frames to the columns that are exported.
# NOTE(review): `response` appears to hold Python-literal text (single-quoted
# keys) rather than JSON, while the prompt asks for a JSON array — confirm
# whether it should be converted before export.
train_df = train_df.loc[:, ["instruction", "input", "response"]]
test_df = test_df.loc[:, ["input", "response"]]

In [None]:
# Inspect the test set after trimming it to the exported columns.
test_df

Unnamed: 0,input,response
0,Kej 17 - 20 done,"[{'book_start': 'Kej', 'start_chapter': 17, 'b..."
1,Maz 27-28 done,"[{'book_start': 'Maz', 'start_chapter': 27, 'b..."
2,Maz 96-97 done,"[{'book_start': 'Maz', 'start_chapter': 96, 'b..."
3,Filipi 1-2 done,"[{'book_start': 'Filipi', 'start_chapter': 1, ..."
4,2 sam 1-1raja-raja 4 done,"[{'book_start': '2 sam', 'start_chapter': 1, '..."
...,...,...
246,Yes 23-24 done,"[{'book_start': 'Yes', 'start_chapter': 23, 'b..."
247,Kis 11-22 done,"[{'book_start': 'Kis', 'start_chapter': 11, 'b..."
248,2 Raj 8 - 11 done,"[{'book_start': '2 Raj', 'start_chapter': 8, '..."
249,Bil 12-13 done,"[{'book_start': 'Bil', 'start_chapter': 12, 'b..."


In [None]:
# Destination files for the fine-tuning sets.
# NOTE(review): the train file goes under `finetuning/` while the test file
# goes directly under `processed/` — confirm whether both were meant to share
# one folder. Paths are kept unchanged here.
train_jsonl = Path("../data/processed/finetuning/bible_ref_train.jsonl")
test_jsonl = Path("../data/processed/bible_ref_test.jsonl")

# Create the target folders (and any missing parents) before writing, so the
# cell also works when they do not exist yet.
for target in (train_jsonl, test_jsonl):
    target.parent.mkdir(parents=True, exist_ok=True)

# One JSON object per line; force_ascii=False keeps non-ASCII characters as-is.
train_df.to_json(train_jsonl,
                 orient="records", lines=True, force_ascii=False)
test_df.to_json(test_jsonl,
                orient="records", lines=True, force_ascii=False)