# Phase 1, Step 1: Data Preparation

This notebook handles the complete data preparation pipeline for Phase 1 of the HGC Thesis project. It performs the following steps:

1.  **Task A Data (Broad Knowledge):** Fetches, cleans, and processes a corpus of articles on finance and economics from Wikipedia.
2.  **Task B Data (Specialized Knowledge):** Loads, cleans, and processes a dataset of corporate earnings call transcripts.
3.  **Structuring & Splitting:** Formats both datasets into a consistent structure and splits each one into training, validation, and test sets.
4.  **Saving:** Saves the final, analysis-ready datasets to the `../data/` directory.

---

### 1. Setup and Dependencies

In [1]:
import wikipediaapi
import pandas as pd
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import re
import os
from tqdm import tqdm

# --- Configuration ---
RANDOM_STATE = 42               # seed passed to every train/validation/test split
TEST_SPLIT_SIZE = 0.15          # fraction of each corpus held out as the test set
VALIDATION_SPLIT_SIZE = 0.15    # fraction of each corpus held out as the validation set
WIKI_LANG = 'en'                # Wikipedia language edition queried for Task A
DATA_DIR = '../data'            # directory that receives the prepared Parquet files

# Create the output directory up front; this is a no-op when it is already there
os.makedirs(name=DATA_DIR, exist_ok=True)

print("Setup complete. Dependencies loaded and configuration set.")

Setup complete. Dependencies loaded and configuration set.


### 2. Task A: Fetch and Process Wikipedia Data (Finance & Economics)

In [2]:
# Wikipedia API client. The user agent carries contact details so the
# requests made by this notebook are identifiable.
wiki_wiki = wikipediaapi.Wikipedia(
    language=WIKI_LANG,
    user_agent="HGC_Thesis_Research/1.0 (justin.arndt@email.com)", # Replace with your actual contact info
    extract_format=wikipediaapi.ExtractFormat.WIKI
)

SEED_TOPICS = [
    # Macroeconomics
    'Macroeconomics', 'Fiscal policy', 'Monetary policy', 'Inflation', 'Gross domestic product', 'Unemployment',
    # Microeconomics
    'Microeconomics', 'Supply and demand', 'Market structure', 'Game theory', 'Opportunity cost',
    # Financial Markets
    'Stock market', 'Bond (finance)', 'Foreign exchange market', 'Derivative (finance)', 'Financial regulation',
    # Corporate Finance
    'Corporate finance', 'Capital budgeting', 'Valuation (finance)', 'Financial statement', 'Dividend policy',
    'Mergers and acquisitions'
]

# Pages whose raw text is this many characters or fewer are dropped as stubs.
MIN_WIKI_TEXT_LEN = 500


def clean_wiki_text(text):
    """Strip trailing boilerplate sections from a Wikipedia plain-text extract.

    Everything from the first "See also", "References" or "External links"
    level-2 heading onward is discarded (whichever appears earliest), runs of
    newlines are collapsed to a single newline, and surrounding whitespace is
    trimmed.

    Args:
        text: Page text in the wiki extract format (``== Heading ==`` markers).

    Returns:
        The cleaned text.
    """
    # One alternation cuts at the earliest of the three headings, which is
    # what splitting on each heading in turn also produces.
    text = re.split(
        r'\n==\s?(?:See also|References|External links)\s?==',
        text,
        flags=re.IGNORECASE,
    )[0]
    return re.sub(r'\n+', '\n', text).strip()


all_pages = []
processed_titles = set()

# Retained for any later cell that refers to it; it is no longer used for the
# cache check below.
task_a_train_path = os.path.join(DATA_DIR, 'task_a_wikipedia_train.parquet')

# The cache guard must test the same file it reads. Testing the *train* split
# while reading the *unsplit* corpus either crashes (train present, cache
# missing) or repeats the whole crawl (cache present, split step not yet run).
wiki_cache_path = os.path.join(DATA_DIR, 'task_a_wikipedia_full_unsplit.parquet')

if os.path.exists(wiki_cache_path):
    print("Wikipedia data already processed. Skipping download.")
    wiki_df = pd.read_parquet(wiki_cache_path)
else:
    print(f"Fetching content for {len(SEED_TOPICS)} seed topics from Wikipedia...")

    for topic in tqdm(SEED_TOPICS, desc="Processing Seed Topics"):
        # Cheap set lookup first: page.exists() needs a network round trip.
        # NOTE(review): a seed that was already collected as a link of an
        # earlier seed is skipped entirely, so its own links are never
        # expanded. This matches the original behaviour; confirm it is intended.
        if topic in processed_titles:
            continue
        page = wiki_wiki.page(topic)
        if not page.exists():
            continue

        all_pages.append({'title': page.title, 'text': page.text, 'source': 'wikipedia'})
        processed_titles.update((topic, page.title))

        # Get linked pages to broaden the corpus
        for link in page.links:
            if link in processed_titles:
                continue
            # Record the link title before fetching so that missing or empty
            # pages are not requested again for every seed that links to them.
            processed_titles.add(link)

            linked_page = wiki_wiki.page(link)
            if not (linked_page.exists() and linked_page.text):
                continue

            # The title reported by the fetched page may differ from the link
            # title (presumably after redirect resolution; verify against
            # wikipediaapi). Skip it if that article is already in the corpus,
            # otherwise the same document would be stored twice.
            resolved_title = linked_page.title
            if resolved_title != link and resolved_title in processed_titles:
                continue

            all_pages.append({'title': resolved_title, 'text': linked_page.text, 'source': 'wikipedia'})
            processed_titles.add(resolved_title)

    # Fail with a clear message instead of a KeyError on the missing 'text'
    # column when nothing could be downloaded.
    if not all_pages:
        raise RuntimeError(
            "No Wikipedia pages were fetched; check network access and SEED_TOPICS."
        )

    wiki_df = pd.DataFrame(all_pages)
    print(f"Successfully fetched {len(wiki_df)} pages from Wikipedia.")
    print("Performing initial cleaning...")

    # --- Basic Cleaning ---
    # The length filter is applied to the raw text, before sections are stripped.
    wiki_df = wiki_df[wiki_df['text'].str.len() > MIN_WIKI_TEXT_LEN].copy()
    wiki_df['text'] = wiki_df['text'].apply(clean_wiki_text)
    wiki_df = wiki_df[['text', 'source']].copy()

    # Save the full unsplit data for caching
    wiki_df.to_parquet(wiki_cache_path, index=False)

    print(f"Wikipedia data cleaned. Final corpus size: {len(wiki_df)} documents.")

Fetching content for 22 seed topics from Wikipedia...


Processing Seed Topics: 100%|██████████████████████████████████████████████████████████| 22/22 [07:38<00:00, 20.83s/it]


Successfully fetched 1687 pages from Wikipedia.
Performing initial cleaning...
Wikipedia data cleaned. Final corpus size: 1672 documents.


### 3. Task B: Load and Process Earnings Call Transcripts

In [4]:
print("Loading earnings call transcript dataset from Hugging Face...")

# Pull the 'train' split of the S&P 500 earnings-call transcript dataset and
# convert it to a pandas DataFrame.
earnings_dataset = load_dataset("kurry/sp500_earnings_transcripts", split='train')
earnings_df = earnings_dataset.to_pandas()

print(f"Loaded {len(earnings_df)} transcripts.")
print("Performing cleaning and structuring...")

# Rename 'content' to 'text', tag every row with its source, keep only the
# two columns used downstream, and drop rows containing nulls.
earnings_df = (
    earnings_df
    .rename(columns={'content': 'text'})
    .assign(source='earnings_call')
    .loc[:, ['text', 'source']]
    .dropna()
)

# Keep only transcripts longer than 500 characters
is_long_enough = earnings_df['text'].str.len() > 500
earnings_df = earnings_df.loc[is_long_enough]

print(f"Earnings call data cleaned. Final corpus size: {len(earnings_df)} documents.")

Loading earnings call transcript dataset from Hugging Face...


README.md: 0.00B [00:00, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


part-0.parquet:   0%|          | 0.00/1.82G [00:00<?, ?B/s]

Generating train split:   0%|          | 0/33362 [00:00<?, ? examples/s]

Loaded 33362 transcripts.
Performing cleaning and structuring...
Earnings call data cleaned. Final corpus size: 33248 documents.


### 4. Split Datasets and Save to Disk

In [None]:
def split_and_save(df, task_name):
    """Split ``df`` into train/validation/test partitions and write each to Parquet.

    The test partition is carved off first using ``TEST_SPLIT_SIZE``; the
    remainder is then divided into training and validation partitions so that
    the validation share equals ``VALIDATION_SPLIT_SIZE`` of the full input.
    Both splits use ``RANDOM_STATE``. Files are written to ``DATA_DIR`` as
    ``{task_name}_train.parquet``, ``{task_name}_val.parquet`` and
    ``{task_name}_test.parquet`` without the index.

    Args:
        df: DataFrame to split.
        task_name: Prefix used for the output file names and log messages.

    Returns:
        None. The partition sizes are printed.
    """
    print(f"\nSplitting and saving data for {task_name}...")

    # Hold out the test partition first
    remainder, test_part = train_test_split(
        df, test_size=TEST_SPLIT_SIZE, random_state=RANDOM_STATE
    )

    # Re-express the validation fraction relative to what is left after the
    # test partition has been removed
    val_fraction = VALIDATION_SPLIT_SIZE / (1 - TEST_SPLIT_SIZE)
    train_part, val_part = train_test_split(
        remainder, test_size=val_fraction, random_state=RANDOM_STATE
    )

    # Write the partitions in train, val, test order (Parquet is more efficient than CSV)
    partitions = {'train': train_part, 'val': val_part, 'test': test_part}
    for suffix, part in partitions.items():
        part.to_parquet(os.path.join(DATA_DIR, f"{task_name}_{suffix}.parquet"), index=False)

    print(f"Data for {task_name} saved successfully:")
    print(f"  Training set size:   {len(train_part)}")
    print(f"  Validation set size: {len(val_part)}")
    print(f"  Test set size:       {len(test_part)}")

# Split and save each corpus in turn: Task A (Wikipedia) first, then Task B (earnings calls)
for corpus_df, corpus_name in (
    (wiki_df, 'task_a_wikipedia'),
    (earnings_df, 'task_b_earnings'),
):
    split_and_save(corpus_df, corpus_name)

print("\n--- Data Preparation Complete ---")