## RAG Prototype - Data Extraction and Processing
First, multiple PDF documents are loaded, their metadata is extracted, and invalid papers (those without a title in their PDF metadata) are skipped. Afterwards, the remaining papers are cleaned, split into smaller chunks, and written to a TSV file that serves as the knowledge base for the retrieval model.

In [None]:
# Python standard libraries
import csv
import glob
import json
import os
import re

# Python libraries for PDF (meta)data extraction and chunk generation
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
def extract_metadata(file):
    """Extract selected metadata fields from a PDF and return them as a dict.

    The PDF's document-information dictionary is read via ``PdfReader`` and
    mapped onto the keys ``title``, ``author``, ``publish_date``, ``subject``,
    ``doi`` and ``keywords`` (in that order). Missing fields fall back to
    placeholder values instead of raising.

    Args:
        file: Path (or file object) accepted by ``PdfReader``.

    Returns:
        dict: Metadata with the six keys listed above. ``title`` is passed
        through unchanged and may be ``None``.
    """
    # Defaults used whenever the corresponding field is absent from the PDF.
    dict_metadata = {
        'title': None,
        'author': "Unknown author",
        'publish_date': "Unknown date",
        'subject': '',
        'doi': "No DOI available",
        'keywords': '',
    }

    info = PdfReader(file).metadata
    # A PDF without an information dictionary has no metadata object at all.
    if info is None:
        return dict_metadata

    dict_metadata['title'] = info.title

    if info.author is not None:
        dict_metadata['author'] = info.author.strip()

    created = info.creation_date
    if created is not None:
        dict_metadata['publish_date'] = str(created.date())

    # NOTE(review): the subject field is assumed to look like
    # "<subject>, <doi>[, ...]" -- confirm against the actual PDF sources.
    subject = info.subject
    if subject:
        parts = subject.split(',')
        dict_metadata['subject'] = parts[0]
        # Only the second comma-separated piece is treated as the DOI.
        if len(parts) > 1 and parts[1].strip():
            dict_metadata['doi'] = parts[1].strip()

    # Look the keywords up by their PDF key instead of by position in the
    # dictionary, which depends on how the producing tool ordered the entries.
    if '/Keywords' in info:
        dict_metadata['keywords'] = str(info['/Keywords'])

    return dict_metadata

In [None]:
def clean_document(file):
    """Return the plain text of a PDF with layout noise stripped out.

    The text of all pages is joined with single spaces, then cleaned in this
    order: newlines become spaces, every "- " sequence is removed (re-joining
    words hyphenated across line breaks), numeric citations such as ``[3]`` or
    ``[1, 2]`` are dropped, and ``http(s)://`` links are removed.

    Args:
        file: Path (or file object) accepted by ``PdfReader``.

    Returns:
        str: The cleaned document text.
    """
    reader = PdfReader(file)
    page_texts = [page.extract_text() for page in reader.pages]
    raw_text = " ".join(page_texts)

    # Flatten line breaks first so that a hyphen at a line end becomes "- ",
    # which the second replace then removes.
    flattened = raw_text.replace('\n', ' ').replace('- ', '')

    # Drop bracketed numeric references, e.g. "[12]" or "[1, 2, 3]".
    without_references = re.sub(r'\[\d+(?:,\s*\d+)*\]', '', flattened)

    # Drop hyperlinks up to the next whitespace character.
    return re.sub(r'https?://\S+', '', without_references)

In [None]:
def chunk_documents(text, metadata):
    """Split a text into overlapping chunks and tag each with the metadata.

    Chunks are produced by a ``RecursiveCharacterTextSplitter`` with a chunk
    size of 1024 and an overlap of 30, both measured with ``len`` (i.e. in
    characters). Each chunk's metadata is reset and then filled from
    ``metadata``.

    Args:
        text: The document text to split.
        metadata: Mapping whose entries are copied into every chunk's metadata.

    Returns:
        The list of chunk documents created by the splitter.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=1024,
        chunk_overlap=30,
        length_function=len,
    )

    chunks = splitter.create_documents([text])

    # Replace whatever metadata the splitter attached with the paper metadata.
    for chunk in chunks:
        chunk.metadata.clear()
        chunk.metadata.update(metadata)

    return chunks

In [None]:
def main():
    """Build the knowledge base from all PDFs in the ``research/`` folder.

    For every PDF that has a title in its metadata, the metadata is extracted,
    the text is cleaned and chunked, and the results are collected. Afterwards
    two files are written:

    * ``metadata.json`` -- list with one metadata dict per processed paper.
    * ``kb/collection1024token.tsv`` -- one row per chunk: a running id and
      the chunk, separated by a tab.

    PDFs without a title (or without any metadata) are skipped.
    """
    PATH = 'research/'
    OUT_PATH = 'kb/collection1024token.tsv'
    libs = []
    metadata_list = []

    # Sorted so that chunk ids and the metadata order are reproducible;
    # glob itself returns files in arbitrary order.
    for file in sorted(glob.glob('{}*.pdf'.format(PATH))):
        info = PdfReader(file).metadata
        title = info.title if info is not None else None
        # Skip invalid publications: no metadata at all, or no title.
        if title is None:
            continue

        print("Start processing: " + str(title))
        extracted_metadata = extract_metadata(file)
        metadata_list.append(extracted_metadata)
        cleaned_text = clean_document(file)
        libs.append(chunk_documents(cleaned_text, extracted_metadata))

    print('Chunks assembled!')

    # Create JSON file for all publication metadata
    with open('metadata.json', "w", encoding='utf-8') as json_file:
        json.dump(metadata_list, json_file, indent=4)

    print('Metadata saved as JSON!')

    # Make sure the output folder exists so the run does not fail only after
    # all PDFs have already been processed.
    os.makedirs(os.path.dirname(OUT_PATH), exist_ok=True)

    # Create TSV-file for the retrieval model.
    # newline='' lets the csv module control line endings itself, as its
    # documentation requires (avoids doubled carriage returns on Windows).
    with open(OUT_PATH, 'wt', encoding='utf-8', newline='') as out_file:
        tsv_writer = csv.writer(out_file, delimiter='\t')
        all_chunks = (chunk for lib in libs for chunk in lib)
        # NOTE(review): each chunk object is written via its string form, so
        # the row contains whatever str(chunk) yields -- confirm this is the
        # format the retrieval model expects.
        for i, chunk in enumerate(all_chunks):
            tsv_writer.writerow([i, chunk])

    print('Collection created!')
    
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()