Gets contexts for each keyword of interest

In [None]:
import os
import re
import csv
import pandas as pd

class TextContextExtractor:
    def __init__(self, keywords, sentences_before, sentences_after):
        self.keywords = [kw.strip() for kw in keywords]
        self.sentences_before = sentences_before #how many sentences before are counted
        self.sentences_after = sentences_after #how many sentences after are counted

    def extract_text_from_txt(self, txt_path):
        """Extract text from a given TXT file."""
        with open(txt_path, 'r', encoding='utf-8') as file:
            return file.read()

    def find_contexts(self, text):
        """
        Extracts contextual snippets around specified keywords within a text. 

        How it works:
        1. Splits the input text into sentences by looking for punctuation (., !, ?) 
        followed by a space and a capital letter (indicating the start of a new sentence).
        2. Iterates over each keyword defined in keywords.
        3. For each keyword, searches all sentences for matches (case-insensitive, 
        and matching whole words only to avoid partial matches).
        4. When a keyword is found, selects a range of sentences around it:
        - `sentences_before` sentences before the keyword sentence
        - `sentences_after` sentences after the keyword sentence
        5. Joins these sentences together into a single context snippet and stores 
        it along with the keyword.
        
        Returns:
            A list of tuples: [(keyword1, context1), (keyword2, context2), ...]
            Each tuple contains the keyword and the extracted context surrounding it.
        """

        contexts = []
        sentence_pattern = r'(?<=[.!?])\s+(?=[A-Z])' #pattern of sentences, question marks and capitalised letters
        sentences = re.split(sentence_pattern, text)
        sentences = [sent.strip() for sent in sentences if sent.strip()]
        
        for keyword in self.keywords:
            keyword_pattern = r'\b' + re.escape(keyword) + r'\b' #word boundary + special char won't break regex
            for i, sentence in enumerate(sentences):
                if re.search(keyword_pattern, sentence, flags=re.IGNORECASE):
                    start_idx = max(0, i - self.sentences_before)
                    end_idx = min(len(sentences), i + 1 + self.sentences_after)
                    context_sentences = sentences[start_idx:end_idx]
                    context = ' '.join(context_sentences)
                    contexts.append((keyword, context))
        return contexts

    def save_contexts_to_csv(self, contexts, output_path):
        """Save extracted contexts to a CSV file."""
        os.makedirs(os.path.dirname(output_path), exist_ok=True)
        
        with open(output_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Filename", "Keyword", "Context", "Author", "Title", "Date"])
            writer.writerows(contexts)

    def convert_csv_to_excel(self, input_csv, output_excel):
        """
        Convert a CSV file to an Excel file, ensuring that context text is formatted as plain text in Excel.
       
        Parameters:
        - input_csv (str): Path to the input CSV file.
        - output_excel (str): Path to the output Excel file.
        """
        df = pd.read_csv(input_csv)
       
        def format_as_text(value):
            """Ensure that the text is formatted as plain text in Excel."""
            if isinstance(value, str) and value.startswith('='):
                return f"'{value}"
            return value
        
        df['Context'] = df['Context'].apply(format_as_text)
        
        # Ensure the output directory exists
        os.makedirs(os.path.dirname(output_excel), exist_ok=True)
       
        df.to_excel(output_excel, index=False, engine='xlsxwriter')
       

    def load_metadata_csv(self, metadata_path):
        """Load metadata from a CSV file into a dictionary."""
        metadata = {}
        with open(metadata_path, mode='r', newline='', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                if 'filename' in row:
                    file_location = row['filename']
                else:
                    file_location = row.get('pdf_link', '').split('/')[-1]
                metadata[file_location] = row
        return metadata

    def process_texts(self, folder_path, output_csv, metadata_csv, create_excel=True):
        """
        Extract keyword contexts from all text files in a folder, enrich them with 
        metadata (extracted from data before in R), and save to a CSV. 
        Optionally converts the CSV to Excel.

        Steps:
        1. Loads metadata from a CSV file into a dictionary.
        2. Iterates through each TXT file in `folder_path`.
        3. Extracts text, finds contexts around keywords, and combines with metadata 
        (author, title, date).
        4. Saves all contexts to `output_csv`.
        5. Optionally creates an Excel version in a subfolder 'excel_ver'.

        Args:
            folder_path (str): Folder containing text files to process.
            output_csv (str): Path to save the combined contexts CSV.
            metadata_csv (str): Path to CSV containing metadata for the files.
            create_excel (bool, default=True): If True, also creates an Excel file.
        """
        metadata = self.load_metadata_csv(metadata_csv)
        all_contexts = []

        for filename in os.listdir(folder_path):
            if filename.endswith(".txt"):
                txt_path = os.path.join(folder_path, filename)
                text = self.extract_text_from_txt(txt_path)
                contexts = self.find_contexts(text)

                for keyword, context in contexts:
                    pdf_filename = os.path.splitext(filename)[0] + ".pdf"
                    txt_filename = filename
                
                    #messy way of getting the pdf_filename between diff types of metadata
                    #could be improved but works for now
                    if pdf_filename in metadata:
                        file_metadata = metadata[pdf_filename]
                    elif txt_filename in metadata:
                        file_metadata = metadata[txt_filename]
                    elif 'filename' in metadata and txt_filename in metadata['filename']:
                        file_metadata = metadata['filename'][txt_filename]
                    else:
                        print(f"No metadata found for {filename}")
                        continue

                    author = file_metadata.get('author', '')
                    title = file_metadata.get('title', '')
                    date = file_metadata.get('date', '')

                    all_contexts.append((filename, keyword, context, author, title, date))

        self.save_contexts_to_csv(all_contexts, output_csv) #might not need both csv and excel
        
        # Optionally create Excel file in excel_ver subfolder (easier to work with/possibly read)
        # should be a separate function
        if create_excel:
            csv_dir = os.path.dirname(output_csv)
            csv_basename = os.path.splitext(os.path.basename(output_csv))[0]
            excel_dir = os.path.join(csv_dir, 'excel_ver')
            output_excel = os.path.join(excel_dir, csv_basename + '.xlsx') #one folder down from the csv
            #output_excel = os.path.join(csv_basename + '.xlsx') #if in same folder

            self.convert_csv_to_excel(output_csv, output_excel)

In [None]:
keywords = ["fact", "fiction", "facts", "facſ", "fictions", "ficſions", "factual", "fictional", "fictionally", "factually", "fictionality", "factuality", "fictionalized", "factualized", "fictive", "factive", "fictitious", "factious"]

extractor = TextContextExtractor(
    keywords=keywords,
    sentences_before=2,
    sentences_after=2,
)

#RSTA (#royal society)
extractor.process_texts(
    folder_path="D:/Fact_fiction_corpus/texts/royal society/txt_rsta",
    output_csv="../data_to_view/contexts_all_together/contexts_RSTA.csv",
    metadata_csv="D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rsta.csv",
    create_excel=True
)

#RSTB (#royal society)
extractor.process_texts(
    folder_path="D:/Fact_fiction_corpus/texts/royal society/txt_rstb",
    output_csv="../data_to_view/contexts_all_together/contexts_RSTB.csv",
    metadata_csv="D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstb.csv",
    create_excel=True
)

#RSTL (#royal society)
extractor.process_texts(
    folder_path="D:/Fact_fiction_corpus/texts/royal society/txt_rstl",
    output_csv="../data_to_view/contexts_all_together/contexts_RSTL.csv",
    metadata_csv="D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstl.csv",
    create_excel=True
)

In [20]:
extractor = TextContextExtractor(
    keywords=keywords,
    sentences_before=2,
    sentences_after=2,
)

#General Magazine of Arts and Sciences (albeit empty)
extractor.process_texts(
    folder_path="D:/Fact_fiction_corpus/texts/General Magazine of Arts and Sciences/txt",
    output_csv="../data_to_view/contexts_all_together/contexts_general_magazine.csv",
    metadata_csv="D:/Fact_fiction_corpus/texts/General Magazine of Arts and Sciences/general_magazine_metadata.csv",
    create_excel=True
)


In [18]:
extractor = TextContextExtractor(
    keywords=keywords,
    sentences_before=2,
    sentences_after=2,
)

#The Spectator
extractor.process_texts(
    folder_path="D:/Fact_fiction_corpus/texts/spectator/txt",
    output_csv="../data_to_view/contexts_all_together/contexts_spectator.csv",
    metadata_csv="D:/Fact_fiction_corpus/texts/spectator/spectator_metadata.csv",
    create_excel=True
)

In [15]:
#testing the output txt
import pandas as pd

df = pd.read_csv("../data_to_view/contexts_all_together/extracted_contexts_from_spectator.csv")

for index, row in df.head(10).iterrows():
    print(f"Filename: {row['Filename']}")
    print(f"Keyword: {row['Keyword']}")
    print(f"Author: {row['Author']}")
    print(f"Title: {row['Title']}")
    print(f"Context: {row['Context']}")

    print()

FileNotFoundError: [Errno 2] No such file or directory: '../data_to_view/contexts_all_together/extracted_contexts_from_spectator.csv'