### Data Preprocessing

In [7]:
import os
import json
from glob import glob

# ---------------- CONFIG ----------------
DATA_DIR = r"C:\Users\islam hitham\paper_extractor_dataset\json"
OUTPUT_FILE = "structured_input_output.jsonl"
BATCH_SIZE = 50  # optional, for processing in batches

# ---------------- LOAD & CONVERT ----------------
def convert_document(doc):
    # Get full text (abstract + body)
    full_text = ""
    # Abstract sentences
    abstract_texts = []
    for s in doc.get("abstract", []):
        if isinstance(s, dict):
            abstract_texts.append(s.get("text", ""))
        else:
            abstract_texts.append(str(s))
    # Body sentences
    body_texts = []
    for b in doc.get("body_text", []):
        if isinstance(b, dict):
            body_texts.append(b.get("text", ""))
        else:
            body_texts.append(str(b))
    full_text = " ".join(abstract_texts + body_texts)

    # Everything else is output
    output = {k: v for k, v in doc.items() if k != "abstract" and k != "body_text"}
    output["abstract"] = doc.get("abstract", [])
    output["body_text"] = doc.get("body_text", [])

    return {"input": full_text, "output": output}

def process_all_documents(data_dir, output_file, batch_size=50):
    json_files = glob(os.path.join(data_dir, "**", "*.json"), recursive=True)
    print(f"Found {len(json_files)} JSON files.")

    if len(json_files) == 0:
        print("No JSON files found. Check the path.")
        return

    with open(output_file, "w", encoding="utf-8") as out_f:
        for i in range(0, len(json_files), batch_size):
            batch_files = json_files[i:i + batch_size]
            print(f"Processing batch {i//batch_size + 1} ({len(batch_files)} docs)...")
            for file_path in batch_files:
                with open(file_path, "r", encoding="utf-8") as f:
                    try:
                        raw_doc = json.load(f)
                        item = convert_document(raw_doc)
                        out_f.write(json.dumps(item) + "\n")
                    except Exception as e:
                        print(f"Error processing {file_path}: {e}")

    print(f"Structured dataset saved to {output_file}")

# ---------------- RUN ----------------
process_all_documents(DATA_DIR, OUTPUT_FILE, BATCH_SIZE)


Found 40091 JSON files.
Processing batch 1 (50 docs)...
Processing batch 2 (50 docs)...
Processing batch 3 (50 docs)...
Processing batch 4 (50 docs)...
Processing batch 5 (50 docs)...
Processing batch 6 (50 docs)...
Processing batch 7 (50 docs)...
Processing batch 8 (50 docs)...
Processing batch 9 (50 docs)...
Processing batch 10 (50 docs)...
Processing batch 11 (50 docs)...
Processing batch 12 (50 docs)...
Processing batch 13 (50 docs)...
Processing batch 14 (50 docs)...
Processing batch 15 (50 docs)...
Processing batch 16 (50 docs)...
Processing batch 17 (50 docs)...
Processing batch 18 (50 docs)...
Processing batch 19 (50 docs)...
Processing batch 20 (50 docs)...
Processing batch 21 (50 docs)...
Processing batch 22 (50 docs)...
Processing batch 23 (50 docs)...
Processing batch 24 (50 docs)...
Processing batch 25 (50 docs)...
Processing batch 26 (50 docs)...
Processing batch 27 (50 docs)...
Processing batch 28 (50 docs)...
Processing batch 29 (50 docs)...
Processing batch 30 (50 docs

In [4]:
import json

FILE_PATH = "structured_input_output.jsonl"

# ---------------- SHOW FIRST 2 EXAMPLES ----------------
with open(FILE_PATH, "r", encoding="utf-8") as f:
    for i in range(2):
        line = f.readline()
        if not line:
            break
        doc = json.loads(line)
        print(f"=== Example {i+1} ===")
        print("\n--- Input (Full Text) ---")
        print(doc["input"][:1000] + "..." if len(doc["input"]) > 500 else doc["input"])  # first 500 chars
        print("\n--- Output Keys ---")
        print(list(doc["output"].keys()))
        print("\n--- Metadata Sample ---")
        for k, v in doc["output"].get("metadata", {}).items():
            print(f"{k}: {v}")
        print("\n" + "-"*80 + "\n")


=== Example 1 ===

--- Input (Full Text) ---
O b j e c t i v e   T h e   o v e r a l l   r e s e a r c h   o b j e c t i v e   w a s   t o   t h e o r e t i c a l l y   a n d   e m p i r i c a l l y   d e v e l o p   t h e   i d e a s   a r o u n d   a   s y s t e m   o f   s a f e t y   m a n a g e m e n t   p r a c t i c e s   ( t e n   p r a c t i c e s   w e r e   e l a b o r a t e d ) ,   t o   t e s t   t h e i r   r e l a t i o n s h i p   w i t h   o b j e c t i v e   s a f e t y   s t a t i s t i c s   ( s u c h   a s   a c c i d e n t   r a t e s ) ,   a n d   t o   e x p l o r e   h o w   t h e s e   p r a c t i c e s   w o r k   t o   a c h i e v e   p o s i t i v e   s a f e t y   r e s u l t s   ( a c c i d e n t   p r e v e n t i o n )   t h r o u g h   w o r k e r   e n g a g e m e n t .   M e t h o d   D a t a   w e r e   c o l l e c t e d   u s i n g   s a f e t y   m a n a g e r ,   s u p e r v i s o r   a n d   e m p l o y e e   s u r v e y s   d e s i g n e d   t o

In [9]:
import json
import re 

INPUT_FILE = "structured_input_output.jsonl"
OUTPUT_FILE = "structured_input_output_fixed_1.jsonl"

fixed_count = 0

with open(INPUT_FILE, "r", encoding="utf-8") as in_f, \
     open(OUTPUT_FILE, "w", encoding="utf-8") as out_f:

    for line in in_f:
        try:
            doc = json.loads(line)
            
            # Rebuild full text from abstract + body_text
            abstract_texts = []
            for s in doc["output"].get("abstract", []):
                if isinstance(s, dict):
                    abstract_texts.append(s.get("text", ""))
                else:
                    abstract_texts.append(str(s))

            body_texts = []
            for b in doc["output"].get("body_text", []):
                if isinstance(b, dict):
                    body_texts.append(b.get("text", ""))
                else:
                    body_texts.append(str(b))

            # Join the text fragments; they still contain the bad spacing.
            full_text = " ".join(abstract_texts + body_texts)

            # --- START FIXING THE SPACING ---
            
            # 1. Replace non-breaking spaces ('\xa0') with standard spaces.
            cleaned_text = full_text.replace('\xa0', ' ')

            # 2. Use regex to find sequences of two or more spaces (which separate words) 
            # and replace them with a unique temporary separator ('|').
            # This isolates the words, which still have single internal spaces (e.g., 'O b j e c t i v e').
            text_with_separators = re.sub(r'\s{2,}', '|', cleaned_text).strip()

            # 3. Split the text by the separator.
            spaced_words = text_with_separators.split('|')
            
            # 4. For each resulting fragment, remove all remaining internal single spaces (' '), 
            # effectively collapsing 'O b j e c t i v e' into 'Objective'.
            clean_words = [word.replace(' ', '').strip() for word in spaced_words if word.strip()]
            
            # 5. Join the fully clean words with a single, correct space.
            final_full_text = ' '.join(clean_words)
            
            # --- END FIXING THE SPACING ---
            
            # Replace input with correctly fixed text
            doc["input"] = final_full_text

            out_f.write(json.dumps(doc, ensure_ascii=False) + "\n")
            fixed_count += 1

        except Exception as e:
            print(f"Error fixing line: {e}")

print(f"Fixed full text for {fixed_count} documents. Saved to {OUTPUT_FILE}")

Fixed full text for 40091 documents. Saved to structured_input_output_fixed_1.jsonl


In [10]:
import json

FILE_PATH = "structured_input_output_fixed_1.jsonl"

# ---------------- SHOW FIRST 2 EXAMPLES ----------------
with open(FILE_PATH, "r", encoding="utf-8") as f:
    for i in range(2):
        line = f.readline()
        if not line:
            break
        doc = json.loads(line)
        print(f"=== Example {i+1} ===")
        print("\n--- Input (Full Text) ---")
        print(doc["input"][:1000] + "..." if len(doc["input"]) > 500 else doc["input"])  # first 500 chars
        print("\n--- Output Keys ---")
        print(list(doc["output"].keys()))
        print("\n--- Metadata Sample ---")
        for k, v in doc["output"].get("metadata", {}).items():
            print(f"{k}: {v}")
        print("\n" + "-"*80 + "\n")


=== Example 1 ===

--- Input (Full Text) ---
Objective The overall research objective was to theoretically and empirically develop the ideas around a system of safety management practices (ten practices were elaborated), to test their relationship with objective safety statistics (such as accident rates), and to explore how these practices work to achieve positive safety results (accident prevention) through worker engagement. Method Data were collected using safety manager, supervisor and employee surveys designed to assess and link safety management system practices, employee perceptions resulting from existing practices, and safety performance outcomes. Results Results indicate the following: there is a significant negative relationship between the presence of ten individual safety management practices, as well as the composite of these practices, with accident rates; there is a significant negative relationship between the level of safety-focused worker emotional and cognitive enga

In [6]:
# ---------------- EXPLORE FIRST EXAMPLE ----------------
import json

# Load just the first few documents for quick testing
examples = []
with open("processed_elsevier.jsonl", "r", encoding="utf-8") as f:
    for i, line in enumerate(f):
        examples.append(json.loads(line))
        if i >= 4:  # only load 5 docs for speed
            break

# Take first example
doc = examples[0]

# --- Metadata ---
print("DOC ID:", doc.get("docId"))
print("\n--- Metadata ---")
for k, v in doc["metadata"].items():
    print(f"{k}: {v}")

# --- Abstract ---
print("\n--- Abstract ---")
for sent in doc["abstract"][:3]:  # first 3 sentences
    print(sent["sentence"])

# --- Body Text ---
print("\n--- Body Text ---")
for sent in doc["body_text"][:3]:  # first 3 body sentences
    print(f"[{sent['title']}] {sent['sentence']}")

# --- Bibliography ---
print("\n--- Bibliography ---")
for ref_id, ref in list(doc["bib_entries"].items())[:2]:  # first 2 references
    print(f"{ref_id}: {ref['title']} by {[a['last'] for a in ref['authors']]}")

# --- First X & Y ---
full_text = " ".join([s["sentence"] for s in doc["abstract"]] + 
                     [s["sentence"] for s in doc["body_text"]])
print("\n--- First X (full text) ---")
print(full_text[:500], "...")  # print first 500 chars only

print("\n--- First Y (metadata) ---")
print(json.dumps(doc, indent=2))


DOC ID: 

--- Metadata ---
title: A system of safety management practices and worker engagement for reducing and preventing accidents: An empirical and theoretical investigation
authors: [{'first': 'Jan K.', 'initial': 'J.K.', 'last': '', 'email': 'jan.wachter@iup.edu'}, {'first': 'Patrick L.', 'initial': 'P.L.', 'last': '', 'email': None}]
issn: 00014575
volume: 
firstpage: 
lastpage: 
pub_year: None
doi: 10.1016/j.aap.2013.07.029
pmid: 
openaccess: Full
subjareas: []
keywords: ['Accident prevention', 'Accident rates', 'Human performance', 'Safety management systems', 'Worker engagement']
asjc: ['2213', '2739', '3307']

--- Abstract ---
O
b
j

--- Body Text ---
[] 
[] 
[] 

--- Bibliography ---
BIBREF0: bib0005 by []
BIBREF1: bib0010 by []

--- First X (full text) ---
O b j e c t i v e   T h e   o v e r a l l   r e s e a r c h   o b j e c t i v e   w a s   t o   t h e o r e t i c a l l y   a n d   e m p i r i c a l l y   d e v e l o p   t h e   i d e a s   a r o u n d   a   s y s t e 

In [1]:
from datasets import load_dataset
from transformers import AutoTokenizer
from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
import torch
import evaluate
import json
from rouge_score import rouge_scorer

In [3]:
# Load all splits simultaneously
dataset = load_dataset("orieg/elsevier-oa-cc-by", trust_remote_code=True)

# Access each split
train_dataset = dataset["train"]
test_dataset = dataset["test"]
validation_dataset = dataset["validation"]

print(f"Train size: {len(train_dataset)}")
print(f"Test size: {len(test_dataset)}")
print(f"Validation size: {len(validation_dataset)}")

Downloading data:   0%|          | 0.00/1.01G [00:00<?, ?B/s]

FSTimeoutError: 

In [None]:
model_name = "models/llama-3.2-1b-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def format_text(example):
    return {"text": example["text"]}

def tokenize_fn(examples):
    texts = [t for t in examples["text"]]
    return tokenizer(texts, truncation=True, padding="max_length", max_length=2048)

dataset = dataset.map(format_text)
tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=dataset["train"].column_names)
tokenized_dataset.set_format("torch")

print(tokenized_dataset)


In [None]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    load_in_4bit=True,
    device_map="auto",
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = prepare_model_for_kbit_training(model)

peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

model = get_peft_model(model, peft_config)
model.print_trainable_parameters()


In [None]:
rouge = evaluate.load("rouge")

def compute_metrics(eval_preds):
    preds, labels = eval_preds
    pred_str = tokenizer.batch_decode(preds, skip_special_tokens=True)
    label_str = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    results = rouge.compute(predictions=pred_str, references=label_str)
    
    # Basic JSON structure accuracy
    correct_json = 0
    for p in pred_str:
        try:
            json.loads(p)
            correct_json += 1
        except:
            pass
    json_acc = correct_json / len(pred_str)
    
    return {"rougeL": results["rougeL"], "json_accuracy": json_acc}


In [None]:
training_args = TrainingArguments(
    output_dir="./qlora_llama3b_output",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=16,
    warmup_steps=20,
    num_train_epochs=1,
    learning_rate=2e-4,
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=False,
    bf16=True,
    report_to="none",
    save_total_limit=2,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["validation"],
    compute_metrics=compute_metrics,
    tokenizer=tokenizer,
)

In [None]:
trainer.train()

In [None]:
model.save_pretrained("./qlora_llama3b_adapter")
tokenizer.save_pretrained("./qlora_llama3b_adapter")

In [None]:
eval_results = trainer.evaluate(eval_dataset=tokenized_dataset["test"])
print(eval_results)