# Open Access Abstract Research Paper Scraper & NER Fine-Tuning

## Installation of all dependencies and libraries

In [None]:
!pip install pdfminer.six
##!pip install -U pip setuptools wheel
##!pip install 'spacy[transformers]'
##!pip install --upgrade transformers

In [1]:
import os
import requests
import re
import json
from collections import Counter
from tqdm.auto import tqdm

import spacy
from spacy.tokens import DocBin
from spacy.matcher import Matcher

##gpu = spacy.require_gpu()
##print('GPU:', gpu)

In [8]:
# Mount Google Drive at /content/drive/ so the later cells can read and write
# the project files they address under "/content/drive/My Drive/..."
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Querying the Springer API

In [None]:
# Load the Springer API key from Colab's secret store.
# The value is assigned to a variable instead of being left as the cell's last
# expression: a bare `userdata.get(...)` would be echoed as the cell output and
# saved (in clear text) with the notebook.
from google.colab import userdata
springer_api_key = userdata.get('springer_api_key')

In [None]:
# Define the API query URL
url = "https://api.springernature.com/meta/v2/json"

# The key is read from Colab's secret store; sending the literal text
# "springer_api_key" would not authenticate the request.
from google.colab import userdata

# Define the query parameters
params = {
    "q": "language:en openaccess:true journalonlinefirst:true (keyword:\"entrepreneurship\" OR keyword:\"accelerator\" OR keyword:\"startup\" OR keyword:\"incubator\" OR keyword:\"university entrepreneurship\")",
    "p": 100,
    "api_key": userdata.get('springer_api_key'),
}

# Start from an empty result so the cells that consume `papers` still run
# (with nothing to do) when the request below fails.
papers = []

# Send the request to the API; the timeout keeps the cell from hanging forever
# on a stalled connection.
response = requests.get(url, params=params, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the JSON response
    data = response.json()

    # Extract relevant information from the response
    papers = data.get('records', [])

    # Print the abstract of each paper; a record without an abstract prints
    # an empty line instead of raising TypeError on `"\n" + None`.
    for paper in papers:
        print("\n" + (paper.get("abstract") or ""))
else:
    print("Error:", response.status_code)


## Research Paper PDF Downloader

In [None]:
def download_papers(papers, folder_path):
    """Download the PDF of every paper record into ``folder_path``.

    Args:
        papers: Sized sequence of record dicts. Each record may provide a
            "title" and a "url" list whose items are dicts with "format"
            and "value" keys; the first item with format "pdf" is used.
        folder_path: Destination directory. Created if it does not exist.

    Returns:
        None. Progress and per-paper errors are reported with ``print``;
        a failed download does not stop the remaining papers.
    """
    # Longest title-derived stem kept; leaves room for ".pdf" within the
    # usual 255-byte filename limit.
    max_stem_length = 150

    # Create the folder if it doesn't exist
    os.makedirs(folder_path, exist_ok=True)

    # Iterate over the papers and download them
    for i, paper in enumerate(papers, 1):
        fallback_name = "paper_" + str(i)
        title = paper.get("title", fallback_name)

        # Titles are free text: replace path separators and other characters
        # that are invalid in file names so the PDF is always written inside
        # folder_path, and fall back to the positional name if nothing is left.
        safe_stem = re.sub(r'[\\/:*?"<>|\x00-\x1f]', "_", str(title)).strip()
        safe_stem = safe_stem[:max_stem_length] or fallback_name
        file_name = f"{safe_stem}.pdf"
        file_path = os.path.join(folder_path, file_name)

        # Find the PDF URL
        pdf_url = None
        for url_info in paper.get("url", []):
            if url_info.get("format") == "pdf":
                pdf_url = url_info.get("value")
                break

        # Check if PDF URL is available
        if not pdf_url:
            print(f"No PDF URL found for {title}")
            continue

        try:
            # Download the paper; the timeout prevents one stalled server
            # from blocking the whole batch.
            response = requests.get(pdf_url, timeout=60)
            response.raise_for_status()  # Raise an error for 4xx or 5xx status codes

            # Save the paper to Google Drive
            with open(file_path, 'wb') as file:
                file.write(response.content)

            print(f"Downloaded {file_name} ({i}/{len(papers)})")
        except Exception as e:
            # Best effort: report the failure and move on to the next paper
            print(f"Error downloading {file_name}: {e}")

# Example usage: save the PDFs of the records collected in `papers` into the
# project's research_papers folder on the mounted Drive
folder_path = '/content/drive/My Drive/llm_fine-tuning/research_scraper/research_papers'
download_papers(papers, folder_path)

## Text Extraction from PDF Script

In [None]:
from pdfminer.high_level import extract_text
import os

directory = "/content/drive/My Drive/llm_fine-tuning/research_scraper/research_papers/"

# New directory for storing the extracted text files
txt_directory = os.path.join(directory, "txt")

# Ensure the txt directory exists (create if it doesn't)
os.makedirs(txt_directory, exist_ok=True)

# List all PDF files in the base directory
pdf_files = [file for file in os.listdir(directory) if file.endswith(".pdf")]

for pdf_file in pdf_files:
    try:
        # Construct the full path to the PDF file
        filename_pdf = os.path.join(directory, pdf_file)
        # Construct the path to the matching .txt file inside txt_directory
        filename_txt = os.path.join(txt_directory, os.path.splitext(pdf_file)[0] + ".txt")

        # Extract before opening the output file: if the PDF cannot be parsed,
        # no empty .txt file is left behind to be mistaken for a real result.
        text = extract_text(filename_pdf)

        with open(filename_txt, "w", encoding="utf-8") as f:
            f.write(text)

        print(f"Processed {pdf_file}")

    except Exception as ex:
        # Best effort: report the file that failed and continue with the rest
        print(pdf_file, str(ex))

## Import JSON label data

In [None]:
data_file = "/content/drive/My Drive/llm_fine-tuning/research_scraper/research_papers/data/admin.jsonl"

# Read the JSON Lines export with an explicit encoding so the result does not
# depend on the platform's default locale.
with open(data_file, 'r', encoding='utf-8') as f:
    lines = list(f)

# Each item is [text, {"entities": <value of the row's "label" field>}]
training_data: list = []

for line in lines:
    # A blank line (e.g. a trailing newline at the end of the file) is not a
    # JSON document and would make json.loads raise.
    if not line.strip():
        continue
    row = json.loads(line)
    training_data.append([row["text"], {"entities": row["label"]}])

print(len(training_data))

## NER Model training

In [None]:
# Index of the first dev example: the leading 80% of the examples are used
# for training, the remaining 20% form the dev set.
train_split = int(len(training_data) * 0.8)

train_data, dev_data = training_data[:train_split], training_data[train_split:]

In [None]:
def convert(path, dataset):
    """Serialize annotated examples to a spaCy ``DocBin`` file.

    Args:
        path: Output file path passed to ``DocBin.to_disk``.
        dataset: Iterable of ``(text, annot)`` pairs where
            ``annot["entities"]`` is an iterable of ``(start, end, label)``
            character-offset triples.

    Entities whose offsets cannot be mapped to a token span, or whose span
    text has leading/trailing whitespace, are reported and left out.
    """
    nlp = spacy.blank("en")
    ##spacy.require_gpu()
    db = DocBin()
    for text, annot in tqdm(dataset):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label, alignment_mode="contract")
            if span is None:
                # Must skip here: falling through would dereference None
                # on `span.text` and abort the whole conversion.
                print("Skipping nil entity")
                continue
            if span.text != span.text.strip():
                print("Skipping entity spans with whitespace")
                continue
            ents.append(span)
        doc.ents = ents

        db.add(doc)
    db.to_disk(path)

# Write both splits to the .spacy files that the training commands below
# receive as --paths.train and --paths.dev
convert("train.spacy", train_data)
convert("dev.spacy", dev_data)

  0%|          | 0/63 [00:00<?, ?it/s]

Skipping entity spans with whitespace
Skipping entity spans with whitespace


  0%|          | 0/16 [00:00<?, ?it/s]

In [None]:
!python -m spacy info

[1m

spaCy version    3.7.4                         
Location         /usr/local/lib/python3.10/dist-packages/spacy
Platform         Linux-6.1.58+-x86_64-with-glibc2.35
Python version   3.10.12                       
Pipelines        en_core_web_sm (3.7.1)        



In [None]:
!python -m spacy init config config.cfg --lang en --pipeline ner --optimize efficiency --force
##--gpu

[38;5;4mℹ Generated config template specific for your use case[0m
- Language: en
- Pipeline: ner
- Optimize for: efficiency
- Hardware: CPU
- Transformer: None
[38;5;2m✔ Auto-filled config with all values[0m
[38;5;2m✔ Saved config[0m
config.cfg
You can now add your data and train your pipeline:
python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy


In [None]:
!spacy debug data config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy

[1m
[38;5;2m✔ Pipeline can be initialized with data[0m
[38;5;2m✔ Corpus is loadable[0m
[1m
Language: en
Training pipeline: tok2vec, ner
63 training docs
16 evaluation docs
[38;5;2m✔ No overlap between training and evaluation data[0m
[38;5;1m✘ Low number of examples to train a new pipeline (63)[0m
[1m
[38;5;4mℹ 1799957 total word(s) in the data (52889 unique)[0m
[38;5;4mℹ No word vectors present in the package[0m
[1m
[38;5;4mℹ 1 label(s)[0m
0 missing value(s) (tokens with '-' label)
[38;5;2m✔ Good amount of examples for all labels[0m
[38;5;2m✔ Examples without occurrences available for all labels[0m
[38;5;2m✔ No entities consisting of or starting/ending with whitespace[0m
[38;5;2m✔ No entities crossing sentence boundaries[0m
[1m
[38;5;2m✔ 7 checks passed[0m
[38;5;1m✘ 1 error[0m


In [None]:
!python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --output "/content/drive/My Drive/llm_fine-tuning/research_scraper/research_papers/data/"
##--gpu-id 0

[38;5;4mℹ Saving to output directory: /content/drive/My
Drive/llm_fine-tuning/research_scraper/research_papers/data[0m
[38;5;4mℹ Using CPU[0m
[1m
[38;5;2m✔ Initialized pipeline[0m
[1m
[38;5;4mℹ Pipeline: ['tok2vec', 'ner'][0m
[38;5;4mℹ Initial learn rate: 0.001[0m
E    #       LOSS TOK2VEC  LOSS NER  ENTS_F  ENTS_P  ENTS_R  SCORE 
---  ------  ------------  --------  ------  ------  ------  ------
  0       0          0.00  13724.67    0.00    0.00    0.00    0.00
  3     200       9596.79  190897.00   78.57  100.00   64.71    0.79
  6     400      33851.70    341.71   66.67   76.92   58.82    0.67
  9     600        647.70     35.32   81.25   86.67   76.47    0.81
 12     800          0.28      0.13   83.87   92.86   76.47    0.84
 15    1000     144521.55    237.29   87.50   93.33   82.35    0.87
 19    1200      65490.01    132.57   81.25   86.67   76.47    0.81
 22    1400     119869.62    176.24   82.35   82.35   82.35    0.82
 25    1600          0.00      0.00   82.3

## Test extraction from Research Papers

In [9]:
from pdfminer.high_level import extract_text

# Pull the raw text out of the test paper
text = extract_text(
    "/content/drive/My Drive/llm_fine-tuning/research_scraper/research_papers/data/test3.pdf"
)

# Load the "model-best" pipeline from the training output folder and run it
# over the extracted text
nlp = spacy.load(
    "/content/drive/My Drive/llm_fine-tuning/research_scraper/research_papers/data/model-best"
)
doc = nlp(text)

# One line per predicted entity: label followed by the matched text
for entity in doc.ents:
    print(entity.label_, entity.text)

Abstract Reinforcement learning from human feedback (RLHF) is a technique for training AI systems
to align with human goals. RLHF has emerged as the central method used to finetune state-
of-the-art large language models (LLMs). Despite this popularity, there has been relatively
little public work systematizing its flaws.
In this paper, we (1) survey open problems
and fundamental limitations of RLHF and related methods; (2) overview techniques to
understand, improve, and complement RLHF in practice; and (3) propose auditing and
disclosure standards to improve societal oversight of RLHF systems. Our work emphasizes
the limitations of RLHF and highlights the importance of a multi-layered approach to the
development of safer AI systems.
