In [3]:
import os
import pymupdf
import csv

def extract_text_from_pdf(pdf_path):
    """Extract text from a given PDF file."""
    doc = pymupdf.open(pdf_path)
    text = ""
    for page_num in range(len(doc)):
        page = doc.load_page(page_num)
        text += page.get_text()
    doc.close()  # Close the PDF document
    return text

def find_contexts(text, keywords, context_size=200):
    """Find and extract the context around each keyword in the text (case insensitive)."""
    contexts = []
    for keyword in keywords:
        keyword_lower = keyword.strip().lower()  # Strip whitespace and convert to lowercase
        start = 0
        text_lower = text.lower()
        while (index := text_lower.find(keyword_lower, start)) != -1:
            start_context = max(index - context_size, 0)
            end_context = min(index + len(keyword_lower) + context_size, len(text))
            context = text[start_context:end_context].strip()
            contexts.append((keyword.strip(), context))
            start = index + len(keyword_lower)
    return contexts

def save_contexts_to_csv(contexts, output_path):
    """Save extracted contexts to a CSV file."""
    with open(output_path, mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["Filename", "Keyword", "Context"])
        writer.writerows(contexts)

def main(folder_path, output_csv, keywords):
    all_contexts = []

    for filename in os.listdir(folder_path):
        if filename.endswith(".pdf"):
            print(f"Processing file: {filename}")
            pdf_path = os.path.join(folder_path, filename)
            text = extract_text_from_pdf(pdf_path)
            contexts = find_contexts(text, keywords)
            contexts_with_filename = [(filename, keyword, context) for keyword, context in contexts]
            all_contexts.extend(contexts_with_filename)

    save_contexts_to_csv(all_contexts, output_csv)

if __name__ == "__main__":
    folder_path = "D:/Fact_fiction_corpus/royal society/pdf"
    output_csv = "extracted_contexts.csv"
    keywords = [" fact ", " fiction "]  # Add more keywords here as needed
    main(folder_path, output_csv, keywords)


Processing file: rstl_1665_0051.pdf
Processing file: rstl_1665_0052.pdf
Processing file: rstl_1665_0053.pdf
Processing file: rstl_1665_0054.pdf
Processing file: rstl_1665_0055.pdf
Processing file: rstl_1665_0056.pdf
Processing file: rstl_1665_0057.pdf
Processing file: rstl_1665_0042.pdf
Processing file: rstl_1665_0043.pdf
Processing file: rstl_1665_0044.pdf
Processing file: rstl_1665_0050.pdf
Processing file: rstl_1665_0041.pdf
Processing file: rstl_1665_0034.pdf
Processing file: rstl_1665_0025.pdf
Processing file: rstl_1665_0018.pdf
Processing file: rstl_1665_0012.pdf
Processing file: rstl_1665_0068.pdf
Processing file: rstl_1665_0067.pdf
Processing file: rstl_1665_0066.pdf
Processing file: rstl_1665_0065.pdf
Processing file: rstl_1665_0064.pdf
Processing file: rstl_1665_0155.pdf
Processing file: rstl_1665_0063.pdf
Processing file: rstl_1665_0062.pdf
Processing file: rstl_1665_0061.pdf
Processing file: rstl_1665_0060.pdf
Processing file: rstl_1665_0059.pdf
Processing file: rstl_1665_0

In [2]:
import pandas as pd

df = pd.read_csv('extracted_contexts.csv')

for index, row in df.head(10).iterrows():
    print(f"Keyword: {row['Keyword']}")
    print(f"Context: {row['Context']}")
    print()

Keyword:  fact 
Context: kk ~ 
the 
 Downloaded from https://royalsocietypublishing.org/ on 04 June 2024 
( 6062) 
the Guts, that is,thewithinLaclcousVeios,but ever white and 
uaifo1m.Whence we judge it not very feafable to tioge the 
Ve• 
nal chy]e in a well and found animal. 
And He that would 
dc:mut1firate the matter of fact to the Eye, mufi probably do 
it by giving him fome fuch thing io the food , as £hall caufe a 
Diabetes or fome difiemper equivalent to it. 
Thouih we have obferv' 
d many odd things in the feveral · 
Excrcifes of this nature; yet we (hall not trouble you at prc-
fent with any other particulars, tha

Keyword:  fact 
Context: raxin 
attinentia 
profert>
 ut 
opt Torni & tam
hujus quam reliquarum quinque Teftudinum fiant exempla- 
ria: Atque in hanerrem 
fa h  
ilia
•v^
 
' 
confhuit
 Downloaded from https://royalsocietypublishing.org/ on 04 June 2024 
i  * 7 )
unfirait 
quorum 
omnium 
deW
ionftra 
ah 
cow
fulto omwiffee fact 
dime 
ex 
nmc 
proferendu 
Quod qua