### Data Preprocessing

In [7]:
import os
import json
from glob import glob

# ---------------- CONFIG ----------------
DATA_DIR = r"C:\Users\islam hitham\paper_extractor_dataset\json"
OUTPUT_FILE = "structured_input_output.jsonl"
BATCH_SIZE = 50  # optional, for processing in batches

# ---------------- LOAD & CONVERT ----------------
def convert_document(doc):
    # Get full text (abstract + body)
    full_text = ""
    # Abstract sentences
    abstract_texts = []
    for s in doc.get("abstract", []):
        if isinstance(s, dict):
            abstract_texts.append(s.get("text", ""))
        else:
            abstract_texts.append(str(s))
    # Body sentences
    body_texts = []
    for b in doc.get("body_text", []):
        if isinstance(b, dict):
            body_texts.append(b.get("text", ""))
        else:
            body_texts.append(str(b))
    full_text = " ".join(abstract_texts + body_texts)

    # Everything else is output
    output = {k: v for k, v in doc.items() if k != "abstract" and k != "body_text"}
    output["abstract"] = doc.get("abstract", [])
    output["body_text"] = doc.get("body_text", [])

    return {"input": full_text, "output": output}

def process_all_documents(data_dir, output_file, batch_size=50):
    json_files = glob(os.path.join(data_dir, "**", "*.json"), recursive=True)
    print(f"Found {len(json_files)} JSON files.")

    if len(json_files) == 0:
        print("No JSON files found. Check the path.")
        return

    with open(output_file, "w", encoding="utf-8") as out_f:
        for i in range(0, len(json_files), batch_size):
            batch_files = json_files[i:i + batch_size]
            print(f"Processing batch {i//batch_size + 1} ({len(batch_files)} docs)...")
            for file_path in batch_files:
                with open(file_path, "r", encoding="utf-8") as f:
                    try:
                        raw_doc = json.load(f)
                        item = convert_document(raw_doc)
                        out_f.write(json.dumps(item) + "\n")
                    except Exception as e:
                        print(f"Error processing {file_path}: {e}")

    print(f"Structured dataset saved to {output_file}")

# ---------------- RUN ----------------
process_all_documents(DATA_DIR, OUTPUT_FILE, BATCH_SIZE)


Found 40091 JSON files.
Processing batch 1 (50 docs)...
Processing batch 2 (50 docs)...
Processing batch 3 (50 docs)...
Processing batch 4 (50 docs)...
Processing batch 5 (50 docs)...
Processing batch 6 (50 docs)...
Processing batch 7 (50 docs)...
Processing batch 8 (50 docs)...
Processing batch 9 (50 docs)...
Processing batch 10 (50 docs)...
Processing batch 11 (50 docs)...
Processing batch 12 (50 docs)...
Processing batch 13 (50 docs)...
Processing batch 14 (50 docs)...
Processing batch 15 (50 docs)...
Processing batch 16 (50 docs)...
Processing batch 17 (50 docs)...
Processing batch 18 (50 docs)...
Processing batch 19 (50 docs)...
Processing batch 20 (50 docs)...
Processing batch 21 (50 docs)...
Processing batch 22 (50 docs)...
Processing batch 23 (50 docs)...
Processing batch 24 (50 docs)...
Processing batch 25 (50 docs)...
Processing batch 26 (50 docs)...
Processing batch 27 (50 docs)...
Processing batch 28 (50 docs)...
Processing batch 29 (50 docs)...
Processing batch 30 (50 docs

In [4]:
import json

FILE_PATH = "structured_input_output.jsonl"

# ---------------- SHOW FIRST 2 EXAMPLES ----------------
with open(FILE_PATH, "r", encoding="utf-8") as f:
    for i in range(2):
        line = f.readline()
        if not line:
            break
        doc = json.loads(line)
        print(f"=== Example {i+1} ===")
        print("\n--- Input (Full Text) ---")
        print(doc["input"][:1000] + "..." if len(doc["input"]) > 500 else doc["input"])  # first 500 chars
        print("\n--- Output Keys ---")
        print(list(doc["output"].keys()))
        print("\n--- Metadata Sample ---")
        for k, v in doc["output"].get("metadata", {}).items():
            print(f"{k}: {v}")
        print("\n" + "-"*80 + "\n")


=== Example 1 ===

--- Input (Full Text) ---
O b j e c t i v e   T h e   o v e r a l l   r e s e a r c h   o b j e c t i v e   w a s   t o   t h e o r e t i c a l l y   a n d   e m p i r i c a l l y   d e v e l o p   t h e   i d e a s   a r o u n d   a   s y s t e m   o f   s a f e t y   m a n a g e m e n t   p r a c t i c e s   ( t e n   p r a c t i c e s   w e r e   e l a b o r a t e d ) ,   t o   t e s t   t h e i r   r e l a t i o n s h i p   w i t h   o b j e c t i v e   s a f e t y   s t a t i s t i c s   ( s u c h   a s   a c c i d e n t   r a t e s ) ,   a n d   t o   e x p l o r e   h o w   t h e s e   p r a c t i c e s   w o r k   t o   a c h i e v e   p o s i t i v e   s a f e t y   r e s u l t s   ( a c c i d e n t   p r e v e n t i o n )   t h r o u g h   w o r k e r   e n g a g e m e n t .   M e t h o d   D a t a   w e r e   c o l l e c t e d   u s i n g   s a f e t y   m a n a g e r ,   s u p e r v i s o r   a n d   e m p l o y e e   s u r v e y s   d e s i g n e d   t o

In [9]:
import json
import re 

INPUT_FILE = "structured_input_output.jsonl"
OUTPUT_FILE = "structured_input_output_fixed_1.jsonl"

fixed_count = 0

with open(INPUT_FILE, "r", encoding="utf-8") as in_f, \
     open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:

    for line in in_f:
        try:
            doc = json.loads(line)
            
            # Rebuild full text from abstract + body_text
            abstract_texts = []
            for s in doc["output"].get("abstract", []):
                if isinstance(s, dict):
                    abstract_texts.append(s.get("text", ""))
                else:
                    abstract_texts.append(str(s))

            body_texts = []
            for b in doc["output"].get("body_text", []):
                if isinstance(b, dict):
                    body_texts.append(b.get("text", ""))
                else:
                    body_texts.append(str(b))

            # Join the text fragments; they still contain the bad spacing.
            full_text = " ".join(abstract_texts + body_texts)

            # --- START FIXING THE SPACING ---
            
            # 1. Replace non-breaking spaces ('\xa0') with standard spaces.
            cleaned_text = full_text.replace('\xa0', ' ')

            # 2. Use regex to find sequences of two or more spaces (which separate words) 
            # and replace them with a unique temporary separator ('|').
            # This isolates the words, which still have single internal spaces (e.g., 'O b j e c t i v e').
            text_with_separators = re.sub(r'\s{2,}', '|', cleaned_text).strip()

            # 3. Split the text by the separator.
            spaced_words = text_with_separators.split('|')
            
            # 4. For each resulting fragment, remove all remaining internal single spaces (' '), 
            # effectively collapsing 'O b j e c t i v e' into 'Objective'.
            clean_words = [word.replace(' ', '').strip() for word in spaced_words if word.strip()]
            
            # 5. Join the fully clean words with a single, correct space.
            final_full_text = ' '.join(clean_words)
            
            # --- END FIXING THE SPACING ---
            
            # Replace input with correctly fixed text
            doc["input"] = final_full_text

            out_f.write(json.dumps(doc, ensure_ascii=False) + "\n")
            fixed_count += 1

        except Exception as e:
            print(f"Error fixing line: {e}")

print(f"Fixed full text for {fixed_count} documents. Saved to {OUTPUT_FILE}")

Fixed full text for 40091 documents. Saved to structured_input_output_fixed_1.jsonl


In [10]:
import json

FILE_PATH = "structured_input_output_fixed_1.jsonl"

# ---------------- SHOW FIRST 2 EXAMPLES ----------------
with open(FILE_PATH, "r", encoding="utf-8") as f:
    for i in range(2):
        line = f.readline()
        if not line:
            break
        doc = json.loads(line)
        print(f"=== Example {i+1} ===")
        print("\n--- Input (Full Text) ---")
        print(doc["input"][:1000] + "..." if len(doc["input"]) > 500 else doc["input"])  # first 500 chars
        print("\n--- Output Keys ---")
        print(list(doc["output"].keys()))
        print("\n--- Metadata Sample ---")
        for k, v in doc["output"].get("metadata", {}).items():
            print(f"{k}: {v}")
        print("\n" + "-"*80 + "\n")


=== Example 1 ===

--- Input (Full Text) ---
Objective The overall research objective was to theoretically and empirically develop the ideas around a system of safety management practices (ten practices were elaborated), to test their relationship with objective safety statistics (such as accident rates), and to explore how these practices work to achieve positive safety results (accident prevention) through worker engagement. Method Data were collected using safety manager, supervisor and employee surveys designed to assess and link safety management system practices, employee perceptions resulting from existing practices, and safety performance outcomes. Results Results indicate the following: there is a significant negative relationship between the presence of ten individual safety management practices, as well as the composite of these practices, with accident rates; there is a significant negative relationship between the level of safety-focused worker emotional and cognitive enga

### formatter and train test valid split 

In [1]:
import json
import random
import os

# --- Configuration ---
INPUT_FILE = r"C:\Users\islam hitham\Paper-LLM-Extractor\notebooks\structured_input_output_fixed_1.jsonl"
TRAIN_FILE = "llama_train.jsonl"
VALID_FILE = "llama_valid.jsonl"
TEST_FILE = "llama_test.jsonl"

# Split percentages (80/10/10)
TRAIN_RATIO = 0.8
VALID_RATIO = 0.1
# Test ratio will be the remainder (0.1)

# --- Llama 3.2 Instruct Format Template ---
# The model is trained to predict the assistant's response given the user's prompt.
SYSTEM_INSTRUCTION = (
    "You are an expert academic document extractor. Your task is to process the full text of an "
    "academic paper and accurately extract the content. You MUST return the extracted content   "
    "as a single valid JSON object that adheres strictly to the provided schema. "
    "Do not add any conversational filler."
)

def format_to_llama_instruct(doc):
    """
    Converts a single structured document into the Llama 3.2 Instruct chat format.
    The output content (the assistant's response) is formatted as a JSON string.
    """
    try:
        # The prompt is the cleaned input text
        user_prompt = (
            f"Extract the content for the specified keys from the following paper text:\n\n"
            f"--- TEXT ---\n{doc['input']}"
        )

        # The expected output is the JSON object dumped as a string
        assistant_response = json.dumps(doc['output'], ensure_ascii=False)

        # Assemble the final Llama Instruct conversation structure
        return {
            "messages": [
                {"role": "system", "content": SYSTEM_INSTRUCTION},
                {"role": "user", "content": user_prompt},
                {"role": "assistant", "content": assistant_response}
            ]
        }
    except KeyError as e:
        print(f"Skipping document due to missing key: {e}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred during formatting: {e}")
        return None

def load_and_split_data():
    """
    Loads data, formats it, shuffles it, and splits it into train/valid/test sets.
    """
    if not os.path.exists(INPUT_FILE):
        print(f"Error: Input file not found at '{INPUT_FILE}'. Please check the filename.")
        return

    print(f"Loading data from {INPUT_FILE}...")
    formatted_data = []
    
    with open(INPUT_FILE, 'r', encoding='utf-8') as f:
        for line_num, line in enumerate(f):
            try:
                doc = json.loads(line)
                
                # IMPORTANT: Skip documents where the 'input' is empty (cleanup failure)
                if not doc.get('input'):
                    print(f"Skipping line {line_num + 1}: Empty 'input' field.")
                    continue
                
                # Format the document for Llama 3.2
                formatted = format_to_llama_instruct(doc)
                if formatted:
                    formatted_data.append(formatted)
                    
            except json.JSONDecodeError as e:
                print(f"Skipping line {line_num + 1}: JSON decode error: {e}")
            
    total_count = len(formatted_data)
    if total_count == 0:
        print("No valid data found after loading and formatting. Exiting.")
        return

    print(f"Total documents loaded and formatted: {total_count}")
    
    # Shuffle the entire dataset to ensure random distribution of samples
    random.seed(42) # Set seed for reproducibility
    random.shuffle(formatted_data)

    # Calculate split indices
    train_end = int(total_count * TRAIN_RATIO)
    valid_end = train_end + int(total_count * VALID_RATIO)

    # Split the data
    train_set = formatted_data[:train_end]
    valid_set = formatted_data[train_end:valid_end]
    test_set = formatted_data[valid_end:]

    print(f"--- Dataset Split Summary ---")
    print(f"Train set size: {len(train_set)}")
    print(f"Validation set size: {len(valid_set)}")
    print(f"Test set size: {len(test_set)}")

    # Save the split datasets
    save_data(train_set, TRAIN_FILE)
    save_data(valid_set, VALID_FILE)
    save_data(test_set, TEST_FILE)
    
    print("\nData splitting and formatting complete!")
    print(f"Files saved: {TRAIN_FILE}, {VALID_FILE}, {TEST_FILE}")


def save_data(data, filename):
    """Saves the list of dictionaries to a JSONL file."""
    with open(filename, 'w', encoding='utf-8') as f:
        for entry in data:
            f.write(json.dumps(entry, ensure_ascii=False) + '\n')
            
load_and_split_data()

Loading data from C:\Users\islam hitham\Paper-LLM-Extractor\notebooks\structured_input_output_fixed_1.jsonl...
Skipping line 102: Empty 'input' field.
Skipping line 1144: Empty 'input' field.
Skipping line 1257: Empty 'input' field.
Skipping line 1605: Empty 'input' field.
Skipping line 1747: Empty 'input' field.
Skipping line 2440: Empty 'input' field.
Skipping line 2445: Empty 'input' field.
Skipping line 2500: Empty 'input' field.
Skipping line 2597: Empty 'input' field.
Skipping line 2956: Empty 'input' field.
Skipping line 3258: Empty 'input' field.
Skipping line 3440: Empty 'input' field.
Skipping line 3724: Empty 'input' field.
Skipping line 4457: Empty 'input' field.
Skipping line 4726: Empty 'input' field.
Skipping line 4828: Empty 'input' field.
Skipping line 4846: Empty 'input' field.
Skipping line 4889: Empty 'input' field.
Skipping line 4896: Empty 'input' field.
Skipping line 5308: Empty 'input' field.
Skipping line 6124: Empty 'input' field.
Skipping line 6125: Empty 'in

In [2]:
import json
import os

def inspect_llama_file(filename, num_examples=2):
    """
    Loads a Llama-formatted JSONL file and prints the first few examples 
    in a readable structure for verification.
    """
    if not os.path.exists(filename):
        print(f"Error: Output file not found at '{filename}'. Run the data preprocessing cell first.")
        return

    print(f"--- Inspecting first {num_examples} documents from: {filename} ---")
    
    count = 0
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            if count >= num_examples:
                break
                
            try:
                example_doc = json.loads(line)
                messages = example_doc.get("messages", [])
                
                print("\n" + "="*80)
                print(f"DOCUMENT EXAMPLE #{count + 1}")
                print("="*80)
                
                for message in messages:
                    role = message.get("role", "N/A").upper()
                    content = message.get("content", "N/A")
                    
                    if role == "SYSTEM":
                        print(f"ROLE: SYSTEM\nCONTENT: {content[:100]}...\n")
                    elif role == "USER":
                        # Truncate user prompt (the full paper text) for clean display
                        print(f"ROLE: USER (Input Text)\nCONTENT: {content[:300]}... (Text truncated)\n")
                    elif role == "ASSISTANT":
                        # Pretty print the target JSON output
                        try:
                            # The content is a JSON string, parse it to display cleanly
                            parsed_json = json.loads(content)
                            pretty_json = json.dumps(parsed_json, indent=2, ensure_ascii=False)
                            print(f"ROLE: ASSISTANT (Target JSON)\nCONTENT:\n{pretty_json[:500]}... (JSON truncated)\n")
                        except json.JSONDecodeError:
                            print(f"ROLE: ASSISTANT (ERROR: Content is not valid JSON):\n{content[:200]}...\n")
                            
                count += 1
                
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON line: {e}")
                break

INSPECT_FILE = "llama_train.jsonl" 

inspect_llama_file(INSPECT_FILE, num_examples=2)

--- Inspecting first 2 documents from: llama_train.jsonl ---

DOCUMENT EXAMPLE #1
ROLE: SYSTEM
CONTENT: You are an expert academic document extractor. Your task is to process the full text of an academic ...

ROLE: USER (Input Text)
CONTENT: Extract the content for the specified keys from the following paper text:

--- TEXT ---
This paper presents the first detailed assessment of the invasive potential of Melaleuca hypericifolia Sm. in South Africa. This woody, fire-adapted shrub, native to Australia, is considered a high risk invader w... (Text truncated)

ROLE: ASSISTANT (Target JSON)
CONTENT:
{
  "author_highlights": [
    {
      "endOffset": 15253,
      "sentence": "The invasion risk of the Australian invader plant Melaleuca hypericifolia is assessed.",
      "startOffset": 15167
    },
    {
      "endOffset": 15356,
      "sentence": "It poses a high risk of invasion with the potential to occupy 4% of South Africa if allowed to spread.",
      "startOffset": 15254
    },
    {


In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import evaluate
import json
from rouge_score import rouge_scorer
from itertools import islice

In [2]:
file_path = r"C:\Users\islam hitham\Paper-LLM-Extractor\notebooks\structured_input_output_fixed_1.jsonl"

dataset = load_dataset("json", data_files=file_path, split="train", streaming=True)

In [3]:
# Iterate safely, skip empty/malformed lines
examples = []
for i, item in enumerate(islice(dataset, 3)):
    # Sometimes streaming returns string or dict
    if isinstance(item, str):
        item = item.strip()
        if not item:  # skip blank lines
            continue
        try:
            item = json.loads(item)
        except json.JSONDecodeError:
            print(f"Skipping malformed line {i+1}")
            continue
    elif not item:
        continue

    examples.append(item)
    print(f" Example {len(examples)} ")
    print("Input (first 300 chars):", item["input"][:300])
    print("Output keys:", list(item["output"].keys()))
    print()

if not examples:
    print("No valid examples found. Check your JSONL file for empty or malformed lines.")


 Example 1 
Input (first 300 chars): Objective The overall research objective was to theoretically and empirically develop the ideas around a system of safety management practices (ten practices were elaborated), to test their relationship with objective safety statistics (such as accident rates), and to explore how these practices wor
Output keys: ['author_highlights', 'bib_entries', 'docId', 'metadata', 'abstract', 'body_text']

 Example 2 
Input (first 300 chars): The objective of an accident-mapping algorithm is to snap traffic accidents onto the correct road segments. Assigning accidents onto the correct segments facilitate to robustly carry out some key analyses in accident research including the identification of accident hot-spots, network-level risk map
Output keys: ['author_highlights', 'bib_entries', 'docId', 'metadata', 'abstract', 'body_text']

 Example 3 
Input (first 300 chars): The Driver Behavior Questionnaire (DBQ) is a self-report measure of driving behavior that has

In [4]:
model_name = r"C:\Users\islam hitham\Paper-LLM-Extractor\models\llama-3.2-1b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


In [None]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    results = rouge.compute(predictions=pred_str, references=label_str)
    
    # Basic JSON structure accuracy
    correct_json = 0
    for p in pred_str:
        try:
            json.loads(p)
            correct_json += 1
        except:
            pass
    json_acc = correct_json / len(pred_str)
    
    return {"rougeL": results["rougeL"], "json_accuracy": json_acc}


In [None]:
training_args = TrainingArguments(
    output_dir="./qlora_llama3b_output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    warmup_steps=20,
    num_train_epochs=1,
    learning_rate=2e-4,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=False,
    bf16=True,
    report_to="none",
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
model.save_pretrained("./qlora_llama3b_adapter")
tokenizer.save_pretrained("./qlora_llama3b_adapter")

In [None]:
eval_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(eval_results)