In [1]:
# Standard-library imports. Third-party imports come after the dependency
# check below, so a missing package is installed before anything imports it.
import importlib
import os
import subprocess
import sys
import warnings
from pathlib import Path

# Silence all warnings for the rest of the session (this also covers any
# warnings raised while the packages below are being imported).
warnings.filterwarnings('ignore')

# Third-party packages this notebook needs; for both, the import name and the
# pip name are the same. `pathlib` is intentionally not listed: it ships with
# Python 3, and the PyPI project of that name is an old backport that should
# not be installed on top of the standard library.
packages = ['PyPDF2', 'pandas']

for package in packages:
    try:
        importlib.import_module(package)
    except ImportError:
        print(f"Installing {package}...")
        # Install with the kernel's own interpreter so the package lands in the
        # environment this notebook runs in. check_call raises
        # CalledProcessError if pip fails, which stops the cell.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', package])
        # Let the import system notice the newly installed files.
        importlib.invalidate_caches()

# Safe to import now: pandas is either already present or was just installed.
import pandas as pd

print("✅ All required packages are available")

✅ All required packages are available


In [2]:
# Make the local "scripts" folder importable (presumably where
# hsbc_earnings_qa_parser.py lives -- confirm against the repo layout).
# The entry is relative, so it is resolved against the kernel's working
# directory. The membership check keeps re-runs of this cell from piling up
# duplicate sys.path entries.
if "scripts" not in sys.path:
    sys.path.append("scripts")

# Import the parser
from hsbc_earnings_qa_parser import HSBCEarningsParser

# Initialize the parser
parser = HSBCEarningsParser()
print("✅ Parser initialized successfully")

✅ Parser initialized successfully


In [3]:
# Locate the HSBC input/output folders by walking up from the notebook's
# working directory until the project folder is found.
import os
from pathlib import Path

# Climb towards the filesystem root; stop at the project folder, or at the
# root itself (where a path equals its own parent) if it is never found.
project_root = Path.cwd()
while project_root.name != "cam_ds_ep_FinSight" and project_root != project_root.parent:
    project_root = project_root.parent

if project_root.name == "cam_ds_ep_FinSight":
    input_dir = project_root / "data" / "raw" / "hsbc"
    output_dir = project_root / "data" / "processed" / "hsbc"
    print(f"✅ Found project root: {project_root}")
else:
    # Fallback to manual paths. These are relative, so they resolve against the
    # current working directory. Wrapped in Path so both branches hand the
    # same type to the cells below.
    input_dir = Path("Documents/2. learn_data-science/cam_ds_course_4_ep/cam_ds_ep_FinSight/data/raw/hsbc")  # Replace with your actual path
    output_dir = Path("Documents/2. learn_data-science/cam_ds_course_4_ep/cam_ds_ep_FinSight/data/processed/hsbc")  # Replace with your actual path

print(f"📁 Input directory: {input_dir}")
print(f"📁 Output directory: {output_dir}")

# Test the path. `pdf_files` is bound in both branches so that later cells can
# safely check it even when the input directory is missing.
if input_dir.is_dir():
    # File names only (not full paths), matched case-insensitively on ".pdf".
    pdf_files = sorted(
        entry.name for entry in input_dir.iterdir()
        if entry.name.lower().endswith('.pdf')
    )
    print(f"✅ Found {len(pdf_files)} PDF files!")
else:
    pdf_files = []
    print("❌ Still can't find the directory")

✅ Found project root: /Users/jerome.ahye/Documents/2. learn_data-science/cam_ds_course_4_ep/cam_ds_ep_FinSight
📁 Input directory: /Users/jerome.ahye/Documents/2. learn_data-science/cam_ds_course_4_ep/cam_ds_ep_FinSight/data/raw/hsbc
📁 Output directory: /Users/jerome.ahye/Documents/2. learn_data-science/cam_ds_course_4_ep/cam_ds_ep_FinSight/data/processed/hsbc
✅ Found 10 PDF files!


In [4]:
# Smoke-check speaker extraction on two hand-written transcript lines,
# using a fresh parser instance.
parser = HSBCEarningsParser()

# Sample inputs:
#   test1 - an HSBC executive; expected role is the job title, company is HSBC
#   test2 - an external analyst; expected role is "Analyst", company is the firm
test1 = "NOEL QUINN, GROUP CHIEF EXECUTIVE: Thank you for joining us today."
test2 = "MANUS COSTELLO, AUTONOMOUS: Hi, thanks for taking my question."

# Parse and show each line in turn so the output order matches the inputs.
result1 = parser.extract_speaker_info(test1)
print("HSBC Executive test:", result1)

result2 = parser.extract_speaker_info(test2)
print("Analyst test:", result2)

HSBC Executive test: {'speaker_name': 'Noel Quinn', 'role': 'Group Chief Executive', 'company': 'HSBC', 'content_start': 'Thank you for joining us today.'}
Analyst test: {'speaker_name': 'Manus Costello', 'role': 'Analyst', 'company': 'Autonomous', 'content_start': 'Hi, thanks for taking my question.'}


In [5]:
# Process all PDF files in the input directory and print a short summary.
# Relies on `pdf_files`, `input_dir`, `output_dir` and `parser` from earlier cells.
if pdf_files:
    print("🚀 Starting processing...\n")

    # Run the parser
    df_combined = parser.process_directory(input_dir, output_dir)

    # NOTE(review): what process_directory returns on failure is not visible
    # here; a None result is treated the same as an empty frame.
    if df_combined is not None and not df_combined.empty:
        # .tolist() converts NumPy scalars to native Python values, so the
        # lists print as [2023, 2024, ...] rather than [np.int64(2023), ...].
        years_covered = sorted(df_combined['year'].unique().tolist())
        quarters_covered = sorted(df_combined['quarter'].unique().tolist())

        print("\n✅ Processing completed successfully!")
        print("\n📊 Summary Statistics:")
        print(f"   Total records: {len(df_combined):,}")
        print(f"   Data shape: {df_combined.shape}")
        print(f"   Years covered: {years_covered}")
        print(f"   Quarters covered: {quarters_covered}")
        print(f"   Sections: {df_combined['section'].value_counts().to_dict()}")
        print(f"   Speakers: {df_combined['speaker_name'].nunique()} unique speakers")
        print(f"   Companies: {df_combined['company'].nunique()} unique companies")
    else:
        print("\n❌ No data was successfully processed. Please check the PDF files and try again.")
else:
    print("❌ No PDF files to process. Please add files to the input directory first.")

🚀 Starting processing...


✅ Processing completed successfully!

📊 Summary Statistics:
   Total records: 376
   Data shape: (376, 11)
   Years covered: [np.int64(2023), np.int64(2024), np.int64(2025)]
   Quarters covered: ['Q1', 'Q2', 'Q3', 'Q4']
   Sections: {'qa': 354, 'presentation': 22}
   Speakers: 41 unique speakers
   Companies: 23 unique companies
