#### The goal of this notebook is to build a .pkl file containing a DataFrame with each article's abstract, introduction, body, conclusion, main text and DOI, for use by DataLocationProcess.ipynb

In [None]:
import pandas as pd
import pdfplumber
import os
import fitz
import re
import requests

# Directory that is scanned for downloaded article PDFs (files ending in ".pdf").
pdf_location = './data/balasz'
# CSV loaded with pandas; its 'reported' column is compared against the downloaded DOIs.
csv_location = './data/files/20231030_BAN_shared_fixed.csv'
# Accumulates one dict per processed PDF; later turned into the articles DataFrame.
data = []

In [None]:
# Translation table that deletes the C0 control range (0x00-0x1F) and the
# DEL/C1 control range (0x7F-0x9F) from a string.
_CONTROL_CHAR_TABLE = dict.fromkeys([*range(0x00, 0x20), *range(0x7F, 0xA0)])


def clean_illegal_characters(value):
    """Return *value* with control characters removed.

    Non-string values are returned untouched, which lets this function be
    applied element-wise to a DataFrame holding mixed types.
    """
    if not isinstance(value, str):
        return value
    return value.translate(_CONTROL_CHAR_TABLE)

def get_doi_metadata(doi, timeout=30):
    """Fetch the CrossRef metadata record for *doi*.

    Args:
        doi: DOI to look up, e.g. ``"10.1000/xyz123"``.
        timeout: Seconds to wait for the CrossRef server before giving up.
            Without a timeout a stalled connection would block the whole
            PDF-processing loop indefinitely.

    Returns:
        The ``message`` object of the CrossRef JSON response, or ``None``
        when the request fails, the status is not 200, or the payload is not
        the expected JSON. Failures are printed, never raised, so that one
        bad DOI does not abort a batch run.
    """
    from urllib.parse import quote

    # DOIs may legally contain characters such as '#', '?' or spaces that
    # would otherwise be interpreted as URL syntax; keep '/' readable.
    url = f"https://api.crossref.org/works/{quote(str(doi), safe='/')}"

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print(f"Error fetching DOI {doi}: {e}")
        return None

    if response.status_code != 200:
        print(f"Error fetching DOI {doi}: {response.status_code}")
        return None

    try:
        return response.json()['message']
    except (ValueError, KeyError, TypeError) as e:
        # Body was not JSON, or did not have the expected top-level shape.
        print(f"Error fetching DOI {doi}: {e}")
        return None



class cleanArticleText:
    """Split raw article text into an abstract and a trimmed main body.

    Section boundaries are located with case-insensitive heading regexes
    ("Abstract", "Introduction", "References", "Acknowledgement(s)").
    """

    # Compiled once for the class instead of on every cleanText() call.
    _INTRO_PATTERN = re.compile(r'\bIntroduction\b', re.IGNORECASE)
    # Tolerates letter-spaced headings such as "A B S T R A C T".
    _ABS_PATTERN = re.compile(r'\bA\s*B\s*S\s*T\s*R\s*A\s*C\s*T\b', re.IGNORECASE)
    _REFS_PATTERN = re.compile(r'\bReferences\b', re.IGNORECASE)
    _ACK_PATTERN = re.compile(r'\bAcknowledgements?\b', re.IGNORECASE)

    def __init__(self, text: str):
        self.text = text
        self.abstract = None  # Filled in by cleanText() when an abstract heading is found

    def cleanText(self):
        """Return ``(main_text, abstract)`` for ``self.text``.

        ``abstract`` is the text between the first abstract heading and the
        next "Introduction" that follows it (or the end of the text when no
        such heading follows); it is ``None`` when no abstract heading exists.

        ``main_text`` starts right after whichever of the first "Abstract" /
        "Introduction" headings ends first and stops before the first
        "References" / "Acknowledgements" heading. Both are whitespace-stripped.
        """
        text = self.text
        self.abstract = None  # do not leak a result from an earlier call

        intro_match = self._INTRO_PATTERN.search(text)
        abs_match = self._ABS_PATTERN.search(text)

        if abs_match:
            abs_start = abs_match.end()
            # Look for the Introduction heading *after* the abstract heading.
            # Using the first "Introduction" in the document would yield an
            # empty abstract whenever that word appears earlier (e.g. in the
            # article title), because the slice end would precede its start.
            closing_intro = self._INTRO_PATTERN.search(text, abs_start)
            abs_end = closing_intro.start() if closing_intro else None
            self.abstract = text[abs_start:abs_end].strip()

        # Body starts after the earliest-ending of the two opening headings;
        # with neither present, keep the text from the very beginning.
        heading_ends = [m.end() for m in (abs_match, intro_match) if m]
        start_position = min(heading_ends) if heading_ends else 0

        # Body stops at the earliest of the two closing headings, if any.
        closing_starts = [
            m.start()
            for m in (self._REFS_PATTERN.search(text), self._ACK_PATTERN.search(text))
            if m
        ]
        end_position = min(closing_starts) if closing_starts else None

        # A closing heading at offset 0 has no text before it, so it is
        # treated the same as "no closing heading" (truthiness test is
        # intentional here).
        if not end_position:
            return text[start_position:].strip(), self.abstract

        # Closing heading precedes the opening one: fall back to everything
        # before the closing heading.
        if start_position > end_position:
            start_position = 0

        return text[start_position:end_position].strip(), self.abstract

    def extractSection(self, keywords: str):
        """Return the text following the first whole-word match of *keywords*.

        *keywords* is interpreted as a regular expression and matched
        case-insensitively. It is wrapped in a non-capturing group so that
        alternations such as ``"Results|Discussion"`` keep the word-boundary
        anchors on both sides of every alternative.

        Returns ``None`` when nothing matches.
        """
        pattern = re.compile(r'\b(?:{})\b'.format(keywords), re.IGNORECASE)
        match = pattern.search(self.text)
        return self.text[match.end():] if match else None
        

def extract_section_between(text, start_pattern, end_pattern):
    """Extract the text between two regex patterns (case-insensitive).

    Args:
        text: Text to search.
        start_pattern: Regex marking the start; extraction begins right
            after its first match.
        end_pattern: Regex marking the end; extraction stops right before
            its first match *located after the start match*. When it does
            not occur there, extraction runs to the end of *text*.

    Returns:
        The stripped text between the two matches, or the literal string
        ``"Section not found"`` when *start_pattern* does not match.
    """
    start_match = re.search(start_pattern, text, re.IGNORECASE)
    start_index = start_match.end() if start_match else 0

    # Search for the end marker only after the start marker. Searching the
    # whole text would pick up earlier occurrences (e.g. "References" in the
    # running text) and produce an empty slice because end < start.
    end_match = re.compile(end_pattern, re.IGNORECASE).search(text, start_index)

    if start_match:
        end_index = end_match.start() if end_match else len(text)
        return text[start_index:end_index].strip()
    return "Section not found"

In [None]:
# Load the CSV and report how many distinct values its 'reported' column holds
df = pd.read_csv(csv_location)
unique_reported = df['reported'].unique()
print('number of unique doi:', len(unique_reported))
df.head()

In [None]:
# Process PDFs: for every ".pdf" in pdf_location, derive the DOI from the
# filename, look up its CrossRef metadata, extract the text, and collect one
# record per article.

# Start from an empty list so that re-running this cell does not append a
# second copy of every article.
data.clear()

for filename in os.listdir(pdf_location):
    if not filename.endswith(".pdf"):
        continue

    # Extract DOI from filename: drop ".pdf", drop the last two characters,
    # and turn "_" back into "/".
    # NOTE(review): presumably the two dropped characters are a download
    # suffix added to the filename -- confirm against the downloader.
    downloaded_doi = filename.replace(".pdf", "")[:-2].replace("_", "/")
    doi_metadata = get_doi_metadata(downloaded_doi)

    # Read PDF content; the context manager closes the document (and its
    # file handle) even if text extraction fails part-way through.
    pdf_path = os.path.join(pdf_location, filename)
    with fitz.open(pdf_path) as doc:
        full_text = "".join(page.get_text("text") for page in doc)

    # Use the cleanArticleText class to process the text
    cleaner = cleanArticleText(full_text)
    cleaned_text, abstract = cleaner.cleanText()

    # Fallback if abstract is not found (None or empty)
    if not abstract:
        abstract = "Abstract not found. Extracted from start of article."

    # Extract title from DOI metadata. The 'title' field is indexed as a
    # list; guard against it being missing *or* empty so that an empty list
    # does not raise IndexError.
    titles = doi_metadata.get('title') if doi_metadata else None
    title = titles[0] if titles else 'N/A'

    # Append to the data list
    data.append({
        "Filename": filename,
        "Title": title.strip(),
        "DOI": downloaded_doi,
        "Abstract": abstract.strip(),
        "Main Content": cleaned_text.strip(),
    })

# Convert the data to a DataFrame
articles_df = pd.DataFrame(data)


In [None]:
# Reconcile the DOIs derived from the downloaded PDF filenames with the DOIs
# listed in the CSV's 'reported' column, and report duplicates.
from collections import Counter

# Same filename -> DOI rule as the PDF-processing cell: drop ".pdf", drop the
# last two characters, and turn "_" back into "/".
downloaded_doi = [
    filename.replace(".pdf", "")[:-2].replace("_", "/")
    for filename in os.listdir(pdf_location)
    if filename.endswith(".pdf")
]

downloaded_set = set(downloaded_doi)
# dropna(): a missing value in the CSV is not a DOI and would otherwise be
# listed as "undownloaded".
reported_set = set(df['reported'].dropna().unique())
undownloaded_dois = reported_set - downloaded_set
print('Undownloaded DOIs:', undownloaded_dois)

# Find any DOIs in downloaded_set that are not in reported_set
missing_in_reported = downloaded_set - reported_set

# Check if there are any missing DOIs
if missing_in_reported:
    print('DOIs in downloaded set but not in reported set:', missing_in_reported)
else:
    print('All downloaded DOIs are in the reported set.')

# Count every DOI once instead of calling list.count() per element, which
# was quadratic in the number of PDFs.
doi_counts = Counter(downloaded_doi)
duplicates = [doi for doi in downloaded_doi if doi_counts[doi] > 1]
# Print the duplicates
if duplicates:
    print('Duplicates in downloaded DOIs:', set(duplicates))
else:
    print('No duplicates in downloaded DOIs.')


In [None]:
# Strip control characters from every cell, then export the table to Excel
# and CSV.
# DataFrame.applymap is deprecated since pandas 2.1 in favour of
# DataFrame.map; use the new name when this pandas version provides it and
# fall back to applymap on older versions.
if hasattr(pd.DataFrame, "map"):
    articles_df = articles_df.map(clean_illegal_characters)
else:
    articles_df = articles_df.applymap(clean_illegal_characters)
articles_df.to_excel("processed_articles.xlsx", index=False, engine="openpyxl")
articles_df.to_csv("processed_articles.csv", index=False, encoding="utf-8", sep=",")

In [None]:
# Display the articles DataFrame as the cell output for a visual check.
articles_df

In [None]:
# Manual section extraction setup: choose one article (row label i), the
# target column (columnHeader) and the regex patterns that delimit the section
# inside that article's main content. Edit these values per article.
# NOTE: pd.read_pickle can execute arbitrary code while unpickling; only load
# pickle files produced by this notebook or another trusted source.
articles_df = pd.read_pickle("processed_articles.pkl")
i = 69
# columnHeader = 'Abstract'
# columnHeader = 'Introduction'
columnHeader = 'Conclusion'
start_pattern = r'Conclusions'
# Accept both the "ﬂ" ligature (U+FB02) found in the extracted text and a
# plain "fl".
end_pattern = r'Con(?:ﬂ|fl)icts of interest'
# .at does an explicit label lookup (same accessor used when writing the
# result back), instead of chained df[col][i] indexing.
text_to_extract = articles_df.at[i, 'Main Content']
# print('main content: ', articles_df.at[i, 'Main Content'])
print('abstract: ', articles_df.at[i, 'Abstract'])
print(articles_df.at[i, 'Filename'])

In [None]:
# Manual extraction of abstract, introduction, and conclusion sections
def extract_section_between(text, start_pattern, end_pattern):
    """Extract the text between two regex patterns (case-insensitive).

    Prints both match objects so the patterns can be tuned interactively.

    Args:
        text: Text to search.
        start_pattern: Regex marking the start; extraction begins right
            after its first match.
        end_pattern: Regex marking the end; extraction stops right before
            its first match *located after the start match*. When it does
            not occur there, extraction runs to the end of *text*.

    Returns:
        The stripped text between the two matches, or the literal string
        ``"Section not found"`` when *start_pattern* does not match.
    """
    start_match = re.search(start_pattern, text, re.IGNORECASE)
    start_index = start_match.end() if start_match else 0

    # Search for the end marker only after the start marker. Searching the
    # whole text would pick up earlier occurrences of the end pattern and
    # produce an empty slice because end < start.
    end_match = re.compile(end_pattern, re.IGNORECASE).search(text, start_index)
    print('start_match:', start_match)
    print('end_match:', end_match)

    if start_match:
        end_index = end_match.start() if end_match else len(text)
        return text[start_index:end_index].strip()
    return "Section not found"

# Run the extraction on the selected article text and show the result
temp = extract_section_between(
    text=text_to_extract,
    start_pattern=start_pattern,
    end_pattern=end_pattern,
)
print('Extracted Section:', temp)


In [None]:
# If pattern matching is not working, copy and paste the text manually into temp
# temp = ''''''
# articles_df.at[i, columnHeader] = temp

In [None]:
# Save the DataFrame as a pickle file (the same path the manual-extraction
# cell loads from, so edits made to articles_df persist between sessions)
articles_df.to_pickle('processed_articles.pkl')