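Extracts the surrounding context (a window of neighbouring sentences) for every occurrence of each keyword of interest.
(TODO: this could be sped up, e.g. by having it write the output files incrementally as it runs.)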

In [9]:
import csv
import os
import re
import subprocess
import sys

import pandas as pd
from tqdm import tqdm

import nltk
# Fetch the NLTK tokenizer data quietly; these resources are presumably what
# nltk.tokenize.sent_tokenize needs for the NLTK sentence-splitting path below.
for nltk_resource in ('punkt', 'punkt_tab'):
    nltk.download(nltk_resource, quiet=True)

import spacy

#might be a bit redundant to have both spacy and nltk, but can easily switch between them when testing (and remove one later)
#spacy is better at splitting into sentences, but much slower performance
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # The model could not be loaded (treated here as "not installed yet"): fetch it once, then retry.
    print("Downloading spaCy model 'en_core_web_sm'...")
    # Use the kernel's own interpreter so the model lands in the environment this
    # notebook actually runs in (a bare "python" on PATH may be a different install).
    # check=True makes a failed download raise here instead of surfacing later as a
    # second, less informative OSError from spacy.load.
    subprocess.run([sys.executable, "-m", "spacy", "download", "en_core_web_sm"], check=True)
    nlp = spacy.load("en_core_web_sm")

#set a different max to accom. longer docs
#(applied after either load path, so a freshly downloaded model gets the same limit)
nlp.max_length = 3_000_000

class TextContextExtractor:
    """
    Extract sentence-level context windows around keywords in plain-text files.

    For every sentence that contains a keyword (case-insensitive, whole word),
    the sentence plus a configurable number of neighbouring sentences is
    collected. Results can be enriched with per-file metadata and written to
    CSV (and optionally Excel).

    Sentence splitting uses either the module-level spaCy pipeline `nlp`
    (when `use_spacy` is True) or NLTK's `sent_tokenize`.
    """

    def __init__(self, keywords, sentences_before, sentences_after, newest_year_included=1950, skip_after_specific_year=True, use_spacy=False, 
                  create_excel=True):
        """
        Args:
            keywords: Iterable of keyword strings; surrounding whitespace is stripped.
            sentences_before (int): How many sentences before the matching one are included.
            sentences_after (int): How many sentences after the matching one are included.
            newest_year_included (int): Files dated later than this year are skipped
                (only when `skip_after_specific_year` is True).
            skip_after_specific_year (bool): Enable/disable the year filter.
            use_spacy (bool): Use spaCy for sentence splitting; NLTK if False.
            create_excel (bool): Also write an Excel copy of the CSV output.
        """
        # Strip, then drop blanks and exact duplicates while preserving order.
        # A blank keyword would compile to r'\b\b', which matches every sentence
        # containing a word; a duplicate would emit every context twice.
        stripped_keywords = (kw.strip() for kw in keywords)
        self.keywords = list(dict.fromkeys(kw for kw in stripped_keywords if kw))
        self.sentences_before = sentences_before #how many sentences before are counted
        self.sentences_after = sentences_after #how many sentences after are counted
        self.use_spacy = use_spacy #if not, uses nltk
        self.newest_year_included = newest_year_included
        self.create_excel = create_excel
        self.skip_after_specific_year = skip_after_specific_year
        
        # One whole-word, case-insensitive pattern per keyword.
        self.keyword_patterns = [
            (kw, re.compile(r'\b' + re.escape(kw) + r'\b', flags=re.IGNORECASE))
            for kw in self.keywords
        ]

        # Cheap pre-filter: matches if ANY keyword occurs anywhere in a text, even
        # inside a longer word. It is deliberately looser than the per-keyword
        # patterns, so "no hit here" guarantees "no hit in any sentence".
        self._any_keyword_pattern = (
            re.compile('|'.join(re.escape(kw) for kw in self.keywords), flags=re.IGNORECASE)
            if self.keywords else None
        )

    def extract_text_from_txt(self, txt_path):
        """Return the full contents of the UTF-8 text file at `txt_path`."""
        with open(txt_path, 'r', encoding='utf-8') as file:
            return file.read()

    def _tokenize_sentences(self, text):
        """Split `text` into a list of stripped, non-empty sentences (spaCy or NLTK)."""
        if self.use_spacy:
            doc = nlp(text)
            sentences = [sent.text.strip() for sent in doc.sents if sent.text.strip()]
        else: #nltk
            sentences = nltk.tokenize.sent_tokenize(text, language='english')
            sentences = [sent.strip() for sent in sentences if sent.strip()]
        return sentences

    def _get_context_window(self, sentences, match_idx):
        """
        Join the sentences around `sentences[match_idx]` into one string.

        The window covers `sentences_before` sentences before and
        `sentences_after` sentences after the match, clipped to the list bounds.
        """
        start_idx = max(0, match_idx - self.sentences_before)
        end_idx = min(len(sentences), match_idx + 1 + self.sentences_after)
        context_sentences = sentences[start_idx:end_idx]
        return ' '.join(context_sentences)

    def find_contexts(self, text):
        """
        Extracts contextual snippets around specified keywords within a text. 

        How it works:
        1. Returns early (without tokenizing) if no keyword occurs anywhere in
        the text; sentence splitting is by far the most expensive step.
        2. Splits the input text into sentences using either spaCy or NLTK tokenization.
        3. Iterates over each keyword defined in keywords.
        4. For each keyword, searches all sentences for matches (case-insensitive, 
        and matching whole words only to avoid partial matches).
        5. When a keyword is found, selects a range of sentences around it:
        - `sentences_before` sentences before the keyword sentence
        - `sentences_after` sentences after the keyword sentence
        6. Joins these sentences together into a single context snippet and stores 
        it along with the keyword.
        
        Returns:
            A list of tuples: [(keyword1, context1), (keyword2, context2), ...]
            Each tuple contains the keyword and the extracted context surrounding it.
            Results are grouped by keyword, then ordered by sentence position.
        """
        # Every sentence is a substring of `text`, so if the loose pre-filter finds
        # nothing in the whole text, no sentence can match any keyword either.
        if self._any_keyword_pattern is None or not self._any_keyword_pattern.search(text):
            return []

        contexts = []
        sentences = self._tokenize_sentences(text)
        
        for keyword, pattern in self.keyword_patterns:
            for i, sentence in enumerate(sentences):
                if pattern.search(sentence):
                    context = self._get_context_window(sentences, i)
                    contexts.append((keyword, context))
        return contexts

    def save_contexts_to_csv(self, contexts, output_path):
        """
        Write `contexts` to `output_path` as CSV, preceded by a header row.

        Each item of `contexts` is one row:
        (filename, keyword, context, author, title, date).
        The parent directory is created if it does not exist.
        """
        # dirname is '' for a bare file name; os.makedirs('') would raise.
        output_dir = os.path.dirname(output_path)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        
        with open(output_path, mode='w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Filename", "Keyword", "Context", "Author", "Title", "Date"])
            writer.writerows(contexts)

    def _format_as_text(self, value):
        """
        Ensure that the text is formatted as plain text in Excel.

        Strings starting with '=' get a leading apostrophe so they are not
        interpreted as formulas; every other value is returned unchanged.
        """
        if isinstance(value, str) and value.startswith('='):
            return f"'{value}"
        return value

    def convert_csv_to_excel(self, input_csv, output_excel):
        """
        Convert a CSV file to an Excel file, ensuring that context text is formatted as plain text in Excel.
       
        Parameters:
        - input_csv (str): Path to the input CSV file (must contain a 'Context' column).
        - output_excel (str): Path to the output Excel file.
        """
        df = pd.read_csv(input_csv)
        df['Context'] = df['Context'].apply(self._format_as_text)
        
        # Ensure the output directory exists (dirname is '' for a bare file name,
        # and os.makedirs('') would raise).
        excel_dir = os.path.dirname(output_excel)
        if excel_dir:
            os.makedirs(excel_dir, exist_ok=True)
       
        df.to_excel(output_excel, index=False, engine='xlsxwriter')

    def load_metadata_csv(self, metadata_path):
        """
        Load metadata from a CSV file into a dictionary.

        The file name of each row comes from its 'filename' column or, if that
        column is absent, from the last path segment of 'pdf_link'. Each row is
        registered under that name and under its '.txt' and '.pdf' variants.
        Rows without a usable file name are skipped.

        Returns:
            dict mapping file name -> row dict (as produced by csv.DictReader).
        """
        metadata = {}
        with open(metadata_path, mode='r', newline='', encoding='utf-8') as file:
            reader = csv.DictReader(file)
            for row in reader:
                if 'filename' in row:
                    file_location = row['filename']
                else:
                    # `or ''` guards against None, which DictReader uses for
                    # fields missing from a short row.
                    file_location = (row.get('pdf_link') or '').split('/')[-1]

                # Without a name the row cannot be matched to any file; registering
                # it would only create bogus '', '.txt' and '.pdf' keys.
                if not file_location:
                    continue
                
                base_name = os.path.splitext(file_location)[0]
                metadata[file_location] = row
                metadata[base_name + '.txt'] = row
                metadata[base_name + '.pdf'] = row
        return metadata

    def _get_file_metadata(self, metadata, filename):
        """
        Get metadata for a file, trying multiple filename formats.

        Tries `filename` itself, then the same base name with '.pdf' and with
        '.txt'. Returns the metadata row, or None if nothing matches.
        """
        #messy way of getting the pdf_filename between diff types of metadata
        #could be improved but works for now
        
        if filename in metadata:
            return metadata[filename]
        
        base_name = os.path.splitext(filename)[0]
        pdf_filename = base_name + ".pdf"
        txt_filename = base_name + ".txt"
        
        if pdf_filename in metadata:
            return metadata[pdf_filename]
        elif txt_filename in metadata:
            return metadata[txt_filename]
        
        return None

    def _should_skip_by_year(self, date):
        """
        Check if a file should be skipped based on its year.

        The year is the first run of four digits found in `str(date)`. Returns
        True only when such a year exists and is greater than
        `newest_year_included`; dates without a four-digit run are never skipped.
        """
        year_match = re.search(r'\d{4}', str(date))
        if year_match:
            year = int(year_match.group())
            if year > self.newest_year_included:
                return True
        return False

    def _create_excel_path(self, output_csv):
        """Generate the Excel output path in the excel_ver subfolder next to the CSV."""
        csv_dir = os.path.dirname(output_csv)
        csv_basename = os.path.splitext(os.path.basename(output_csv))[0]
        excel_dir = os.path.join(csv_dir, 'excel_ver')
        output_excel = os.path.join(excel_dir, csv_basename + '.xlsx') #one folder down from the csv
        #output_excel = os.path.join(csv_basename + '.xlsx') #if in same folder
        return output_excel

    def process_texts(self, folder_path, output_csv, metadata_csv):
        """
        Extract keyword contexts from all text files in a folder, enrich them with 
        metadata (extracted from data before in R), and save to a CSV. 
        Optionally converts the CSV to Excel.

        Steps:
        1. Loads metadata from a CSV file into a dictionary.
        2. Iterates through each TXT file in `folder_path` (sorted by name, so
        the output row order is reproducible).
        3. Skips files without metadata and, if enabled, files dated after
        `newest_year_included` -- before reading them from disk.
        4. Extracts text, finds contexts around keywords, and combines with metadata 
        (author, title, date). Files that cannot be read are reported and skipped.
        5. Saves all contexts to `output_csv`.
        6. Optionally creates an Excel version in a subfolder 'excel_ver'.

        Args:
            folder_path (str): Folder containing text files to process.
            output_csv (str): Path to save the combined contexts CSV.
            metadata_csv (str): Path to CSV containing metadata for the files.
        """
        metadata = self.load_metadata_csv(metadata_csv)
        all_contexts = []

        # os.listdir returns entries in arbitrary order; sort for stable output.
        txt_files = sorted(f for f in os.listdir(folder_path) if f.endswith(".txt"))
        print(f"Beginning processing {len(txt_files)} text files in folder: {folder_path}")

        for filename in tqdm(txt_files, desc="Processing files", unit="file"):
            file_metadata = self._get_file_metadata(metadata, filename)
            
            if not file_metadata:
                # tqdm.write prints without breaking the progress bar
                tqdm.write(f"No metadata found for {filename}")
                continue

            author = file_metadata.get('author', '')
            title = file_metadata.get('title', '')
            date = file_metadata.get('date', '')

            #skip after specific year (too new stuff shouldn't be included); the filename and year are actually somewhat different, might have been scraped metadata Date
            if self.skip_after_specific_year and self._should_skip_by_year(date):
                continue

            # Only read the file once we know it will actually be analysed.
            txt_path = os.path.join(folder_path, filename)
            try:
                text = self.extract_text_from_txt(txt_path)
            except (OSError, UnicodeDecodeError) as read_error:
                # One unreadable or badly encoded file should not throw away the
                # results of a run that can take hours; report it and move on.
                tqdm.write(f"Could not read {filename}: {read_error}")
                continue
            
            contexts = self.find_contexts(text)

            for keyword, context in contexts:
                all_contexts.append((filename, keyword, context, author, title, date))

        self.save_contexts_to_csv(all_contexts, output_csv) #might not need both csv and excel
        
        # Optionally create Excel file in excel_ver subfolder (easier to work with/possibly read)
        # should be a separate function
        if self.create_excel:
            output_excel = self._create_excel_path(output_csv)
            self.convert_csv_to_excel(output_csv, output_excel)

In [2]:
# Word forms to search for (matched case-insensitively as whole words by the extractor).
keywords = [
    "fact",
    "fiction",
    "facts",
    "facſ",
    "fictions",
    "ficſions",
    "factual",
    "fictional",
    "fictionally",
    "factually",
    "fictionality",
    "factuality",
    "fictionalized",
    "factualized",
    "fictive",
    "factive",
    "fictitious",
    "factious",
]
#could add stemming instead of hardcoding them (so fact/facts automatically)

# All tunable settings in one place: two sentences of context on each side,
# nothing newer than 1950, spaCy for sentence splitting, Excel copy enabled.
extractor_settings = dict(
    keywords=keywords,
    sentences_before=2,
    sentences_after=2,
    newest_year_included=1950,
    skip_after_specific_year=True,
    use_spacy=True, #nltk if False
    create_excel=True,
)
extractor = TextContextExtractor(**extractor_settings)

In [None]:
# Royal Society corpora handled in this cell, processed in the listed order.
royal_society_jobs = [
    #RSTA (#royal society)
    {
        "folder_path": "D:/Fact_fiction_corpus/texts/royal society/txt_rsta",
        "output_csv": "../data_to_view/contexts_all_together/contexts_RSTA.csv",
        "metadata_csv": "D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rsta.csv",
    },
    #RSTB (#royal society)
    {
        "folder_path": "D:/Fact_fiction_corpus/texts/royal society/txt_rstb",
        "output_csv": "../data_to_view/contexts_all_together/contexts_RSTB.csv",
        "metadata_csv": "D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstb.csv",
    },
]

for job in royal_society_jobs:
    extractor.process_texts(**job)

Beginning processing 4390 text files in folder: D:/Fact_fiction_corpus/texts/royal society/txt_rsta


Processing files:  69%|██████▉   | 3049/4390 [00:00<00:00, 6051.38file/s]Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x0000025399AD8F10>>
Traceback (most recent call last):
  File "C:\Users\Igiba\AppData\Roaming\Python\Python310\site-packages\ipykernel\ipkernel.py", line 788, in _clean_thread_parent_frames
    if phase != "start":
KeyboardInterrupt: 


In [3]:
#RSTL (#royal society)
# Input folder, output CSV and metadata CSV for this corpus.
rstl_job = {
    "folder_path": "D:/Fact_fiction_corpus/texts/royal society/txt_rstl",
    "output_csv": "../data_to_view/contexts_all_together/contexts_RSTL.csv",
    "metadata_csv": "D:/Fact_fiction_corpus/texts/royal society/royalsociety_metadata_rstl.csv",
}
extractor.process_texts(**rstl_job)

Beginning processing 8520 text files in folder: D:/Fact_fiction_corpus/texts/royal society/txt_rstl


Processing files: 100%|██████████| 8520/8520 [1:45:38<00:00,  1.34file/s]  


In [10]:
#General Magazine of Arts and Sciences (albeit empty)
extractor.process_texts(
    folder_path="D:/Fact_fiction_corpus/texts/General Magazine of Arts and Sciences/txt",
    output_csv="../data_to_view/contexts_all_together/contexts_general_magazine.csv",
    metadata_csv="D:/Fact_fiction_corpus/texts/General Magazine of Arts and Sciences/general_magazine_metadata.csv"
)


Beginning processing 1 text files in folder: D:/Fact_fiction_corpus/texts/General Magazine of Arts and Sciences/txt


Processing files: 100%|██████████| 1/1 [00:36<00:00, 36.19s/file]


In [11]:
#The Spectator
extractor.process_texts(
    folder_path="D:/Fact_fiction_corpus/texts/spectator/txt",
    output_csv="../data_to_view/contexts_all_together/contexts_spectator.csv",
    metadata_csv="D:/Fact_fiction_corpus/texts/spectator/spectator_metadata.csv")

Beginning processing 632 text files in folder: D:/Fact_fiction_corpus/texts/spectator/txt


Processing files: 100%|██████████| 632/632 [02:19<00:00,  4.53file/s]


In [12]:
#testing the output txt
import pandas as pd

df = pd.read_csv("../data_to_view/contexts_all_together/contexts_RSTA.csv")

for index, row in df.head(10).iterrows():
    print(f"Filename: {row['Filename']}")
    print(f"Keyword: {row['Keyword']}")
    print(f"Author: {row['Author']}")
    print(f"Title: {row['Title']}")
    print(f"Context: {row['Context']}")

    print()

Filename: rsta_1887_0008.txt
Keyword: fact
Author: George Howard Darwin
Title: VIII. Note on Mr. Davison’s paper on the straining of the Earth's crust in cooling
Context: In that letter it is pointed out that the stratum of
the Earth where the.rate of cooling is most rapid lies some miles below the Earth's
surface. Commenting on this, I wrote :— ;

“The Rev. O. FisHer very justly remarks that the more rapid contraction of the
internal than the external strata would cause a wrinkling of the surface, although he
does not admit that this can be the sole cause of geological distortion. The fact that
the region of maximum rate of cooling is so near to the surface recalls the interesting
series of experiments recently made by M. Favre (‘ Nature,’ vol. 19, p. 108), where
all the phenomena of geological contortion were reproduced in a layer of clay placed
on a stretched india-rubber membrane, which was afterwards allowed to contract. Does it not seem possible that Mr. Fister may have under-est

In [None]:
import os

# WARNING: running this cell powers the machine off (Windows `shutdown`:
# /s = shut down, /t 60 = after a 60-second delay). It is meant to be run
# deliberately after the long extraction cells; a "Run All" will trigger it too.
# A pending shutdown can be cancelled from a terminal with `shutdown /a`.
shutdown_command = "shutdown /s /t 60"
os.system(shutdown_command)
