In [1]:
import pandas as pd

In [2]:
# Root directory of the corpus; the docid_category.csv index is read from here.
CORPUS_LOCATION = "."
# Directory holding one "<DocID>.txt" file per document.
DOCUMENTS_LOCATION = CORPUS_LOCATION + "/Documents"
# Number of documents drawn at random when building a training set.
NUM_CASES = 100
# Length, in characters, of the text chunk extracted from each document.
CHUNK_SIZE = 500

# Translation from the single-letter codes in the CSV "Category" column
# to human-readable category names.
CATEGORY_MAP = {
    "B": "Philosophy",
    "G": "Geography",
    "H": "Social Sciences",
    "J": "Politics",
}

In [3]:
import re

def get_text(docid):
    """Load one document and return its text on a single, whitespace-normalised line.

    Args:
        docid: Document identifier; the file read is
            ``DOCUMENTS_LOCATION/<docid>.txt`` (UTF-8).

    Returns:
        The file content with every run of whitespace (spaces, tabs,
        newlines) collapsed to one space and no leading/trailing whitespace.

    Raises:
        FileNotFoundError: if no file exists for ``docid``.
    """
    with open(f"{DOCUMENTS_LOCATION}/{docid}.txt", 'r', encoding='utf-8') as file:
        raw_text = file.read()

    # Newlines are replaced by a space rather than deleted: deleting them
    # glues the last word of a line to the first word of the next one
    # ("Hello\nworld" -> "Helloworld") and hides sentence boundaries from
    # any later whitespace-based sentence splitting.
    return re.sub(r'\s+', ' ', raw_text).strip()

In [11]:
def build_training_set(corpus_location, num_cases, chunk_size=100, use_title=False, repect_sentence_boundaries=False):
    """Sample documents from the corpus index and attach one text chunk to each.

    Args:
        corpus_location: Directory containing ``docid_category.csv``.
            Note that the document texts themselves are loaded through
            ``get_text``, which reads from the module-level
            ``DOCUMENTS_LOCATION``.
        num_cases: Number of documents to sample at random.
        chunk_size: Chunk length passed to ``extract_chunk``.
        use_title: Currently unused; the "Title" column is always dropped.
        repect_sentence_boundaries: Forwarded to ``extract_chunk`` as its
            sentence-boundary flag (parameter name kept as-is for
            backward compatibility with keyword callers).

    Returns:
        A DataFrame with the sampled rows: the CSV columns minus "Authors"
        and "Title", with "Category" mapped through ``CATEGORY_MAP``,
        "DocID" as a zero-padded 6-character string, plus a "Chunk" column.

    Raises:
        ValueError: from ``DataFrame.sample`` if ``num_cases`` exceeds the
            number of rows in the index.
    """
    # Use the arguments the caller passed instead of the notebook-level
    # CORPUS_LOCATION / NUM_CASES / CHUNK_SIZE globals.
    catalog = pd.read_csv(f"{corpus_location}/docid_category.csv")
    catalog = catalog.drop(columns=["Authors", "Title"])

    # Translate single-letter category codes to readable names.
    catalog["Category"] = catalog["Category"].map(CATEGORY_MAP)
    # Document files are named with a 6-digit, zero-padded id.
    catalog["DocID"] = catalog["DocID"].apply(lambda docid: str(int(docid)).zfill(6))

    sampled = catalog.sample(num_cases)

    # The full text is only an intermediate; it never becomes a column.
    texts = sampled["DocID"].apply(get_text)
    chunks = texts.apply(lambda t: extract_chunk(t, chunk_size, repect_sentence_boundaries))
    return sampled.assign(Chunk=chunks)

In [7]:
import random

def extract_chunk(text, chunk_length, respect_sentence_boundaries=False):
    """Return a chunk of at most ``chunk_length`` characters from ``text``.

    Args:
        text: Source text. Falsy values and texts no longer than
            ``chunk_length`` are returned unchanged.
        chunk_length: Maximum chunk size in characters.
        respect_sentence_boundaries: If True, the chunk is built from whole
            leading sentences of ``text``; otherwise it is a random
            substring of exactly ``chunk_length`` characters.

    Returns:
        The extracted chunk. In sentence mode this is the longest run of
        sentences from the start of the text whose space-joined length
        fits in ``chunk_length``; if even the first sentence is too long,
        that sentence is cut at ``chunk_length`` characters so the chunk is
        not empty.
    """
    if not text or len(text) <= chunk_length:
        return text  # Nothing to cut: the whole text already fits.

    if respect_sentence_boundaries:
        # A sentence ends at '.', '!' or '?' followed by whitespace.
        sentences = re.split(r'(?<=[.!?])\s+', text)

        # Take sentences in order from the start of the text (not at
        # random) for as long as the joined result stays within budget.
        selected = []
        used = 0
        for sentence in sentences:
            # A separating space is needed only between sentences, not
            # before the first one, so an exact fit is accepted.
            cost = len(sentence) + (1 if selected else 0)
            if used + cost > chunk_length:
                break
            selected.append(sentence)
            used += cost

        if not selected:
            # The first sentence alone exceeds the budget; fall back to a
            # hard cut instead of returning an empty chunk.
            return sentences[0][:chunk_length].strip()

        return " ".join(selected).strip()

    # Random substring. The "+ 1" makes the last valid start offset
    # (len(text) - chunk_length) selectable, so the tail of the text can
    # be part of a chunk too.
    docpos = random.randrange(len(text) - chunk_length + 1)
    return text[docpos:docpos + chunk_length]