# Text extraction

In [None]:
# import packages
import pandas as pd
import time
import numpy as np
import numpy as np
import json
import shutil
import os
import pypdfium2 as pdfium

In [None]:
# import internal modules
import file_path_management as fpath
import public_library as plib
import parameters as params
import dataframe_columns as df_col

## Predefined functions:

In [None]:
def pdf2text(pdf_path, text_path):
    """Extract the text of a PDF and write it to a text file.

    The text of every page is concatenated (each page followed by a single
    space), passed through ``plib.process_text(text, lower=False)`` and
    written to ``text_path`` with ASCII encoding.

    Conversion is best-effort: a failure is reported on stdout and the
    function returns normally, so one bad file does not abort a batch run.
    Only ``Exception`` subclasses are swallowed; ``KeyboardInterrupt`` and
    ``SystemExit`` propagate so a long-running loop can still be stopped.

    Args:
        pdf_path: Path of the PDF file to read.
        text_path: Path of the text file to write (overwritten if present).
    """
    # Local import keeps this cell self-contained without touching the import cell.
    from contextlib import closing

    try:
        page_texts = []

        # closing() guarantees the handles are released even when extraction
        # fails part-way. Within a page, the text page is closed before the
        # page itself, and the document is closed last.
        with closing(pdfium.PdfDocument(pdf_path)) as pdf_content:
            for i in range(len(pdf_content)):
                with closing(pdf_content[i]) as page_content, \
                        closing(page_content.get_textpage()) as page_text:
                    page_texts.append(page_text.get_text_range())

        # Every page, including the last one, is followed by a single space.
        text = "".join(page + " " for page in page_texts)

        # preprocess the text
        text = plib.process_text(text, lower=False)

        with open(text_path, "w", encoding='ascii') as f:
            f.write(text)
    except Exception as err:
        print("ERROR!")
        print("When converting pdf to text for: ", pdf_path)
        # Report the cause so failed conversions can be diagnosed afterwards.
        print("Reason: ", repr(err))
# --------------------start of test code--------------------
# index = 99
# # index = 0
# pdf_folder = fpath.pdf_folder
# text_folder = fpath.text_folder

# pdf_file_name = str(index) + ".pdf"
# pdf_path = os.path.join(pdf_folder, pdf_file_name)
# text_path = os.path.join(text_folder, pdf_file_name.split(".pdf")[0] + ".txt")
# pdf2text(pdf_path, text_path)

# # with open(text_path, "r", encoding='ascii') as f:
# #     text = f.read()
# #     print(text)
# ---------------------end of test code---------------------

In [None]:
def json2text(json_path, text_path):
    """Extract the article text from a JSON full-text response and save it.

    Reads ``originalText`` and ``coredata/dc:title`` from the
    ``full-text-retrieval-response`` object of the JSON file. Everything up
    to and including the first occurrence of the title is dropped from the
    text, the title is put back in front as ``"<title>. "``, and the result
    is passed through ``plib.process_text(text, lower=False)`` before being
    written to ``text_path`` with ASCII encoding.

    If the title is empty or does not occur verbatim in the text, the whole
    text is kept instead of failing.

    Args:
        json_path: Path of the JSON file to read.
        text_path: Path of the text file to write (overwritten if present).

    Raises:
        KeyError: If the expected keys are missing from the JSON document.
    """
    with open(json_path, "r") as f:
        json_file = json.load(f)

    response = json_file["full-text-retrieval-response"]
    text = response["originalText"]
    title = response["coredata"]["dc:title"]

    # preprocess the text
    if title and title in text:
        # Keep only what follows the first occurrence of the title.
        body = text.split(title, 1)[1].strip()
    else:
        # Title absent or not found verbatim: fall back to the full text
        # rather than raising and aborting the caller's batch loop.
        body = text.strip()

    text = title + ". " + body if title else body
    text = plib.process_text(text, lower=False)

    with open(text_path, "w", encoding='ascii') as f:
        f.write(text)
# --------------------start of test code--------------------
# index = 99
# pdf_folder = fpath.pdf_folder
# text_folder = fpath.text_folder

# json_file_name = str(index) + ".json"
# json_path = os.path.join(pdf_folder, json_file_name)
# text_path = os.path.join(text_folder, str(index) + ".txt")
# # print(pdf_path)
# # print(text_path)
# json2text(json_path, text_path)
# ---------------------end of test code---------------------

## Main program:

In [None]:
# Convert every article listed in the literature database into a plain-text file.
# Articles that have neither a PDF nor a JSON file are appended to a separate CSV.
input_path = fpath.poten_litera_db
df = pd.read_csv(input_path, header=None, sep=',')
df.columns = df_col.db_columns

# Start from an empty "not available" file; rows are appended to it below.
poten_litera_pdf_not_available = fpath.poten_litera_pdf_not_available
plib.clear_file(poten_litera_pdf_not_available)

pdf_folder = fpath.pdf_folder
text_folder = fpath.text_folder

for ind in df.index:
    time.sleep(1)

    index = int(df.at[ind, "INDEX"])

    # PDF and JSON sources live in the same folder and are named after the article index.
    pdf_path = os.path.join(pdf_folder, f"{index}.pdf")
    json_path = os.path.join(pdf_folder, f"{index}.json")
    text_path = os.path.join(text_folder, f"{index}.txt")

    # Try the sources in order of preference: PDF first, then JSON.
    for source_path, convert in ((pdf_path, pdf2text), (json_path, json2text)):
        if os.path.exists(source_path):
            convert(source_path, text_path)
            break
    else:
        # Neither source exists: record the article's row in the "not available" CSV.
        df.iloc[[ind]].to_csv(
            poten_litera_pdf_not_available, mode='a', header=False, index=False
        )

    print(ind, index)

In [None]:
# # Test if the text files are correctly generated, in case the texts of some pdfs are not extractable or not extracted correctly
# input_path = fpath.poten_litera_db
# df = pd.read_csv(input_path, header=None, sep=',')
# df.columns = df_col.db_columns

# pdf_folder = fpath.pdf_folder
# text_folder = fpath.text_folder
# processed_text_folder = fpath.processed_texts_of_length_500_folder
# not_recog_articles_folder = fpath.not_recog_articles_folder

# for ind in df.index:
#     time.sleep(1)
    
#     index = int(df.at[ind, "INDEX"])
    
#     pdf_path = os.path.join(pdf_folder, str(index) + ".pdf")
#     json_path = os.path.join(pdf_folder, str(index) + ".json")
#     text_path = os.path.join(text_folder, str(index) + ".txt")
#     text_processed_path = os.path.join(processed_text_folder, str(index) + ".txt")

#     if os.path.exists(text_path):
#         # first we test if the length of the text is greater than the length to extract
#         with open(text_path, 'r', encoding="ascii") as f:
#             text = f.read()
        
#         text_split = text.split()
    
#         if os.path.exists(json_path):
#             json2text(json_path, text_path)
            
#             # read from the text file again and check if the length is less than the length to extract
#             with open(text_path, 'r', encoding="ascii") as f:
#                 text_json = f.read()
            
#             text_json_split = text_json.split()
            
#             if len(text_json_split) >= len(text_split):
#                 pass
#             else:
#                 with open(text_path, 'w', encoding="ascii") as f:
#                     f.write(text)

#     print(ind, index)

In [None]:
# # Iterate the folder of text_path and if the length of the text is less than the length to extract, then delete the file
# input_path = fpath.poten_litera_db
# df = pd.read_csv(input_path, header=None, sep=',')
# df.columns = df_col.db_columns

# text_folder = fpath.text_folder

# for ind in df.index:
#     index = int(df.at[ind, "INDEX"])
#     text_path = os.path.join(text_folder, str(index) + ".txt")
        
#     if os.path.exists(text_path):
#         # first we test if the length of the text is greater than the length to extract
#         with open(text_path, 'r', encoding="ascii") as f:
#             text = f.read()
        
#         text_split = text.split()
        
#         if len(text_split) == 0:
#             os.remove(text_path)
#             print("The text file is deleted: ", index)

In [None]:
# # Extract text of length params.text_length_to_extract from the text files and store to a text file
# input_path = fpath.poten_litera_db
# df = pd.read_csv(input_path, header=None, sep=',')
# df.columns = df_col.db_columns

# pdf_folder = fpath.pdf_folder
# text_folder = fpath.text_folder
# processed_text_folder = fpath.processed_texts_of_length_500_folder
# not_recog_articles_folder = fpath.not_recog_articles_folder

# for ind in df.index:
#     time.sleep(1)
    
#     index = int(df.at[ind, "INDEX"])
    
#     pdf_path = os.path.join(pdf_folder, str(index) + ".pdf")
#     json_path = os.path.join(pdf_folder, str(index) + ".json")
#     text_path = os.path.join(text_folder, str(index) + ".txt")
#     text_processed_path = os.path.join(processed_text_folder, str(index) + ".txt")

#     if os.path.exists(text_path):
#         # write the text to the processed text file
#         with open(text_path, 'r', encoding="ascii") as f:
#             text = f.read()
        
#         text_split = text.split()
        
#         if len(text_split) > params.text_length_to_extract:
#             pass
#         else:            
#             # Iterate the text_500 until it reaches the length to extract
#             while len(text.split()) < params.text_length_to_extract:
#                 text += " " + text

#             # copy this file to a folder for manual check
#             if os.path.exists(pdf_path):
#                 shutil.copy(pdf_path, not_recog_articles_folder)

#             if os.path.exists(json_path):
#                 shutil.copy(json_path, not_recog_articles_folder)

#             # print(df.at[ind, "INDEX"], df.at[ind, "DOI"], df.at[ind, "PMID"], df.at[ind, "PMCID"])
#             # print(df.at[ind, "TITLE"])
#             # print(df.at[ind, "FULL_TEXT_URL"], df.at[ind, "FULL_TEXT_SOURCE"])
#             # print(df.at[ind, "PDF_URL"], df.at[ind, "PDF_SOURCE"])
#             # print("\n")
        
#         text_500 = ' '.join(word for word in text.split()[:params.text_length_to_extract])
        
#         with open(text_processed_path, 'w', encoding="ascii") as f:
#             f.write(text_500)
#     else:
#         pass

#     # print(ind, index)

In [None]:
# # Count the number of literatures whose pdfs or jsons are not available
# input_path = fpath.poten_litera_pdf_not_available
# df = pd.read_csv(input_path, header=None, sep=',')

# # print number of rows
# print(df.shape[0])