#### Extracting data from the RSIS PDFs (highly structured)

In [6]:
from typing import List, Dict

def find_keyword_index(keyword:str, text_list:List[str]) -> int:
    """
    Locate the first entry that contains the keyword, ignoring case

    Args:
        keyword (str): substring to look for
        text_list (List[str]): entries to scan, in order

    Returns:
        int: position of the first entry containing the keyword,
            or -1 when no entry contains it
    """
    needle = keyword.lower()
    # Lazy scan: stops at the first hit, falls back to -1 when exhausted
    hits = (
        position
        for position, entry in enumerate(text_list)
        if needle in entry.lower()
    )
    return next(hits, -1)

def find_nth_match_index(keyword:str, n:int, text_list:List[str]) -> int:
    """
    Find the position reached once n exact matches have been skipped

    Note that the returned index is NOT the index of the nth match itself:
    it is the index of the entry immediately following the nth exact match
    (and 0 when n is 0 and the list is not empty).

    Args:
        keyword (str): text an entry must equal to count as a match
        n (int): number of matches to skip
        text_list (List[str]): entries to scan, in order

    Returns:
        int: index of the first entry after n matches were seen, or -1
            when the list ends before such an entry exists
    """
    remaining = n
    for position, entry in enumerate(text_list):
        # The check happens before the current entry is compared, so the
        # entry that brought the count down to zero is the previous one
        if remaining == 0:
            return position
        if entry == keyword:
            remaining -= 1
    return -1

def find_keyword_list_index(keyword_list:List[str], text_list:List[str]) -> int:
    """
    Return the index of the first entry that loosely matches any keyword

    A loose match means the lowercased keyword is a substring of the
    lowercased entry. The caller's keyword list is left untouched.

    Args:
        keyword_list (List[str]): list of keywords
        text_list (List[str]): list of text to search

    Returns:
        int: index of the first entry containing any keyword, -1 if none does
    """
    # Work on a lowercased copy so the caller's list is not modified
    lowered_keywords = [keyword.lower() for keyword in keyword_list]

    for index, text in enumerate(text_list):
        lowered_text = text.lower()
        if any(keyword in lowered_text for keyword in lowered_keywords):
            return index
    return -1

def contain_blanks(keyword_list:List[str], data:Dict[str, str]) -> bool:
    """
    Tell whether any of the listed keys holds an empty string in the dict

    Keys from the list that are absent from the dict are ignored.

    Args:
        keyword_list (List[str]): keys to inspect
        data (Dict[str, str]): dict whose values are checked

    Returns:
        bool: True if at least one listed key is present with the value ""
    """
    # A missing key yields None from .get, which never equals ""
    return any(data.get(field) == "" for field in keyword_list)

def get_authors(authors:str) -> List[str]:
    """
    Pick two name keywords out of the RSIS author line

    The line is split on whitespace and the second and the last tokens are
    returned. Presumably the first token is the leading "By", so these are
    the first author's first name and the last author's surname.

    Args:
        authors (str): string representation of the entire author portion

    Returns:
        List[str]: two-element list ``[second token, last token]``

    Raises:
        IndexError: if the string holds fewer than two whitespace-separated
            tokens
    """
    # split() without an argument collapses runs of whitespace, so repeated
    # spaces cannot yield an empty-string keyword (an empty keyword would
    # match every line in a substring search)
    tokens = authors.split()
    return [tokens[1], tokens[-1]]

In [None]:
import glob
import json
import tqdm
from pdfminer.high_level import extract_text
from QaGeneration import ensure_List_string

data_rsis_dir = "../data/context/rsis/pdf"

# Getting text from all pdf
print("extracting text from pdf")
file_list = glob.glob(data_rsis_dir + '/*')
# Iterating through the bar advances it for every file, skipped ones included
progress_bar = tqdm.tqdm(file_list)
article_list = []

# Keys of ``result`` that must not be blank for an article to be kept
keyword_list = [
    "title",
    "date",
    "author(s)",
    "content",
]

for file in progress_bar:
    progress_bar.set_postfix({'Info': file})
    text_list = [item.strip() for item in extract_text(file).split("\n")]

    # Getting index (pdf is extremely structured): the date is read from the
    # line after the first blank line, the title block starts after the third
    date_index = find_nth_match_index("", 1, text_list)
    title_index = find_nth_match_index("", 3, text_list)
    synopsis_index = find_keyword_index("synopsis", text_list)

    # Without both anchors the title/author slice below would be meaningless
    if title_index == -1 or synopsis_index == -1:
        continue

    title_list = [item for item in text_list[title_index: synopsis_index] if item != ""]

    # The last non-blank line before the synopsis is expected to be "By ..."
    if not title_list or "By" not in title_list[-1]:
        continue

    author = title_list[-1]
    title = " ".join(title_list[:-1])

    # get_authors needs at least two tokens ("By" plus a name)
    if len(author.split()) < 2:
        continue
    author_list = get_authors(author)

    # Both searches are relative to the synopsis line; -1 means "not found"
    # and must not be turned into an absolute index
    commentary_offset = find_keyword_index("commentary", text_list[synopsis_index:])
    end_offset = find_keyword_list_index(author_list, text_list[synopsis_index:])
    if commentary_offset == -1 or end_offset == -1:
        continue

    commentary_index = commentary_offset + synopsis_index
    end_index = end_offset + synopsis_index

    # Getting text
    date = text_list[date_index] if date_index != -1 else ""
    commentary = " ".join(
        item.replace("  ", " ")
        for item in text_list[commentary_index : end_index - 1]
        if item != ""
    )

    result = {
        "file_name": file,
        "title": title,
        "author(s)": author,
        "date": date,
        "content": commentary
    }

    if contain_blanks(keyword_list=keyword_list, data=result):
        continue

    article_list.append(result)

# Written once after the loop instead of once per kept article
with open("../data/context/rsis/data_2021.json", "w", encoding="utf-8") as f:
    json.dump(article_list, f, indent = 2)
