In [27]:
import json
import os
import sys
from collections import defaultdict
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor, as_completed
import pandas as pd

# Paths and inputs
repo = Path('..').resolve()
coverage_path = repo / 'coverage_table_updated.csv'
validation_path = repo / 'validation_results.csv'
candidates_path = repo / 'report_candidates.csv'

coverage = pd.read_csv(coverage_path, sep='\t')
candidates = pd.read_csv(candidates_path)

# Load validation results to skip already-validated reports
validated_df = pd.read_csv(validation_path)
validated_set = set(
    (row['CID'], row['Year']) 
    for _, row in validated_df[validated_df['Valid'] == True].iterrows()
)
print(f"âœ“ Already validated: {len(validated_set)} company-year combinations")

# Filter to incomplete rows that are NOT already validated
incomplete = coverage[coverage['Priority'] != 'Complete âœ“'].copy()

def needs_download(row):
    return (row['Company_Identifier'], row['FiscalYear']) not in validated_set

incomplete = incomplete[incomplete.apply(needs_download, axis=1)]
incomplete = incomplete[['CompanyName', 'Company_Identifier', 'FiscalYear', 'IR_URL']]

print(f"â†’ Reports to download: {len(incomplete)}")
print(f"â†’ Candidates available: {len(candidates)}")

# Show summary by company
by_company = incomplete.groupby('CompanyName').size().sort_values(ascending=False)
print(f"\nCompanies needing downloads ({len(by_company)}):")
for company, count in by_company.head(10).items():
    print(f"  {company}: {count} years")

âœ“ Already validated: 98 company-year combinations
â†’ Reports to download: 49
â†’ Candidates available: 22

Companies needing downloads (10):
  IndustrivÃ¤rden C: 12 years
  Nordea Bank Abp: 10 years
  Hennes & Mauritz B: 8 years
  Sv. Handelsbanken A: 6 years
  NIBE Industrier B: 5 years
  SEB A: 4 years
  Boliden: 1 years
  Lifco B: 1 years
  Skanska B: 1 years
  Telia Company: 1 years


In [28]:
# Download PDFs using precomputed candidates from step 3
from aspiratio.utils.report_downloader import download_pdf

def download_company_year(cid, company, year, ir_url, company_candidates):
    """Download a single company-year report, trying all candidates."""
    result = {
        'cid': cid,
        'company': company,
        'year': year,
        'status': 'failed',
        'url': '',
        'source_page': ir_url,
        'error': '',
        'pages': 0,
        'size_mb': 0.0
    }
    
    if company_candidates.empty:
        result['error'] = 'No candidate URL for year'
        return result
    
    # Try each candidate until one succeeds
    for _, cand in company_candidates.iterrows():
        url = cand['URL']
        title = cand.get('Title', '')
        source_page = cand.get('Source_Page', ir_url) or ir_url
        output_path = repo / 'companies' / cid / f"{cid}_{year}_Annual_Report.pdf"
        
        # Create directory if needed
        output_path.parent.mkdir(parents=True, exist_ok=True)
        
        dl_result = download_pdf(url, str(output_path), year_hint=year)
        
        if dl_result.get('success'):
            result['status'] = 'success'
            result['url'] = url
            result['source_page'] = source_page
            result['title'] = title
            result['pages'] = dl_result.get('pages', 0)
            result['size_mb'] = dl_result.get('size_mb', 0.0)
            result['output_path'] = str(output_path)
            return result
        else:
            result['error'] = dl_result.get('error', 'download failed')
            result['url'] = url
    
    return result

# Build download tasks
download_tasks = []
for _, row in incomplete.iterrows():
    cid = row['Company_Identifier']
    company = row['CompanyName']
    year = int(row['FiscalYear'])
    ir_url = row['IR_URL']
    company_candidates = candidates[(candidates['Company_Identifier'] == cid) & (candidates['Year'] == year)]
    download_tasks.append((cid, company, year, ir_url, company_candidates))

print(f"ðŸ“¥ Starting downloads for {len(download_tasks)} reports...")

# Download in parallel (but limit to 3 concurrent to be respectful to servers)
download_results = []
max_workers = min(3, len(download_tasks)) if download_tasks else 1

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    futures = {
        executor.submit(download_company_year, cid, company, year, ir_url, cands): (cid, company, year)
        for cid, company, year, ir_url, cands in download_tasks
    }
    
    for future in as_completed(futures):
        cid, company, year = futures[future]
        try:
            result = future.result()
            download_results.append(result)
            if result['status'] == 'success':
                print(f"  âœ“ {company} ({cid}) {year}: {result['pages']} pages")
            else:
                print(f"  âœ— {company} ({cid}) {year}: {result['error']}")
        except Exception as e:
            print(f"  âœ— {company} ({cid}) {year}: Exception - {e}")
            download_results.append({
                'cid': cid, 'company': company, 'year': year,
                'status': 'failed', 'error': str(e)
            })

# Convert to DataFrame for easier analysis
results_df = pd.DataFrame(download_results)

# Persist summary
download_summary_path = repo / 'download_summary_from_candidates.json'
with open(download_summary_path, 'w') as f:
    json.dump(download_results, f, indent=2, default=str)

print(f"\nðŸ“Š Download summary saved to: {download_summary_path}")
results_df.head(10)

ðŸ“¥ Starting downloads for 49 reports...
  â†’ Downloading from: https://mb.cision.com/Public/997/3056319/925822ca04929dc0.pdf
  âœ— NIBE Industrier B (S18) 2019: No candidate URL for year
  â†’ Downloading from: https://network.s-z.se/app/uploads/sites/16/Lifco-annual-report-2017.pdf
  âœ— NIBE Industrier B (S18) 2020: No candidate URL for year
  âœ— NIBE Industrier B (S18) 2021: No candidate URL for year
  âœ— NIBE Industrier B (S18) 2022: No candidate URL for year
  âœ— NIBE Industrier B (S18) 2023: No candidate URL for year
  â†’ Downloading from: https://mb.cision.com/Main/4324/3222287/1323639.pdf
  â†’ Saving to: /Users/jakobjohannesson/Documents/github_repos/aspiratio/companies/S23/S23_2019_Annual_Report.pdf
  â†’ Downloaded: 0.0 MB
  â†’ Validating PDF...
  â†’ PDF has 1 pages
  âœ— Only 1 pages (min 50 required)
  âœ— SEB A (S23) 2019: Only 1 pages (min 50 required)
  â†’ Downloading from: https://mb.cision.com/Main/4324/3298906/1381441.pdf
  â†’ Saving to: /Users/jakobjohann

ignore '/Perms' verify failed


  â†’ Downloading from: https://www.nordea.com/en/doc/annual-report-nordea-bank-abp-2024-0.pdf#page=5
  â†’ Downloaded: 8.3 MB
  â†’ Validating PDF...
  â†’ PDF has 87 pages
  âœ“ Success: 87 pages, 8.3 MB
  â†’ Downloading from: https://mb.cision.com/Main/769/3743344/1953371.pdf
  âœ“ Hennes & Mauritz B (S13) 2020: 87 pages
  â†’ PDF has 386 pages
  âœ“ Success: 386 pages, 9.2 MB
  âœ“ Nordea Bank Abp (S19) 2023: 386 pages
  â†’ Downloading from: https://mb.cision.com/Main/769/3952727/2698038.pdf
  â†’ Saving to: /Users/jakobjohannesson/Documents/github_repos/aspiratio/companies/S13/S13_2023_Annual_Report.pdf
  â†’ Saving to: /Users/jakobjohannesson/Documents/github_repos/aspiratio/companies/S13/S13_2022_Annual_Report.pdf
  â†’ Saving to: /Users/jakobjohannesson/Documents/github_repos/aspiratio/companies/S19/S19_2024_Annual_Report.pdf
  â†’ Downloaded: 13.5 MB
  â†’ Validating PDF...
  â†’ Downloaded: 4.3 MB
  â†’ Validating PDF...
  â†’ PDF has 384 pages
  â†’ PDF has 152 pages
  âœ“

ignore '/Perms' verify failed


  â†’ PDF has 386 pages
  âœ“ Success: 386 pages, 9.2 MB
  âœ“ Nordea Bank Abp (S19) 2023: 386 pages
  â†’ Downloading from: https://www.nordea.com/en/doc/annual-report-nordea-bank-abp-2024-0.pdf#page=5
  â†’ Saving to: /Users/jakobjohannesson/Documents/github_repos/aspiratio/companies/S19/S19_2024_Annual_Report.pdf
  â†’ Downloaded: 8.7 MB
  â†’ Validating PDF...
  â†’ PDF has 364 pages
  âœ“ Success: 364 pages, 8.7 MB
  âœ“ Nordea Bank Abp (S19) 2022: 364 pages
  â†’ Downloaded: 13.5 MB
  â†’ Validating PDF...
  â†’ PDF has 384 pages
  âœ“ Success: 384 pages, 13.5 MB
  âœ“ Nordea Bank Abp (S19) 2024: 384 pages

ðŸ“Š Download summary saved to: /Users/jakobjohannesson/Documents/github_repos/aspiratio/download_summary_from_candidates.json


Unnamed: 0,cid,company,year,status,url,source_page,error,pages,size_mb,title,output_path
0,S18,NIBE Industrier B,2019,failed,,https://www.nibe.com/investors,No candidate URL for year,0,0.0,,
1,S18,NIBE Industrier B,2020,failed,,https://www.nibe.com/investors,No candidate URL for year,0,0.0,,
2,S18,NIBE Industrier B,2021,failed,,https://www.nibe.com/investors,No candidate URL for year,0,0.0,,
3,S18,NIBE Industrier B,2022,failed,,https://www.nibe.com/investors,No candidate URL for year,0,0.0,,
4,S18,NIBE Industrier B,2023,failed,,https://www.nibe.com/investors,No candidate URL for year,0,0.0,,
5,S23,SEB A,2019,failed,https://mb.cision.com/Main/4324/3222287/132363...,https://sebgroup.com/investor-relations,Only 1 pages (min 50 required),0,0.0,,
6,S23,SEB A,2020,failed,https://mb.cision.com/Main/4324/3298906/138144...,https://sebgroup.com/investor-relations,Only 1 pages (min 50 required),0,0.0,,
7,S23,SEB A,2021,failed,https://mb.cision.com/Main/4324/3516765/154275...,https://sebgroup.com/investor-relations,Only 2 pages (min 50 required),0,0.0,,
8,S23,SEB A,2022,failed,https://mb.cision.com/Main/4324/3724878/188109...,https://sebgroup.com/investor-relations,Only 2 pages (min 50 required),0,0.0,,
9,S24,Skanska B,2019,failed,,https://group.skanska.com/investors/,No candidate URL for year,0,0.0,,


In [29]:
# Summarize download outcomes
successes = results_df[results_df['status'] == 'success']
failures = results_df[results_df['status'] == 'failed']

print('Download Summary')
print('=' * 50)
print(f"âœ“ Successful downloads: {len(successes)}")
print(f"âœ— Failed downloads: {len(failures)}")
print(f"Total attempted: {len(results_df)}")

if len(successes) > 0:
    print(f"\nSuccessful downloads:")
    for _, row in successes.iterrows():
        pages = row.get('pages', '?')
        print(f"  âœ“ {row['company']} ({row['cid']}) {row['year']}: {pages} pages")

if len(failures) > 0:
    print(f"\nFailed downloads (by error type):")
    error_counts = failures.groupby('error').size().sort_values(ascending=False)
    for error, count in error_counts.items():
        print(f"  â€¢ {error}: {count}")
    
    print(f"\nFailed details:")
    for _, row in failures.iterrows():
        print(f"  âœ— {row['company']} ({row['cid']}) {row['year']}: {row['error']}")

Download Summary
âœ“ Successful downloads: 22
âœ— Failed downloads: 27
Total attempted: 49

Successful downloads:
  âœ“ Lifco B (S17) 2020: 92 pages
  âœ“ Sv. Handelsbanken A (S26) 2019: 260 pages
  âœ“ Sv. Handelsbanken A (S26) 2021: 284 pages
  âœ“ Sv. Handelsbanken A (S26) 2022: 272 pages
  âœ“ Sv. Handelsbanken A (S26) 2023: 312 pages
  âœ“ Sv. Handelsbanken A (S26) 2024: 388 pages
  âœ“ Hennes & Mauritz B (S13) 2022: 152 pages
  âœ“ Hennes & Mauritz B (S13) 2020: 87 pages
  âœ“ Hennes & Mauritz B (S13) 2023: 148 pages
  âœ“ Hennes & Mauritz B (S13) 2024: 87 pages
  âœ“ Nordea Bank Abp (S19) 2019: 296 pages
  âœ“ Nordea Bank Abp (S19) 2022: 364 pages
  âœ“ Hennes & Mauritz B (S13) 2020: 87 pages
  âœ“ Nordea Bank Abp (S19) 2023: 386 pages
  âœ“ Nordea Bank Abp (S19) 2024: 384 pages
  âœ“ Hennes & Mauritz B (S13) 2022: 152 pages
  âœ“ Hennes & Mauritz B (S13) 2024: 87 pages
  âœ“ Nordea Bank Abp (S19) 2019: 296 pages
  âœ“ Hennes & Mauritz B (S13) 2023: 148 pages
  âœ“ Nordea Bank A

In [30]:
# Validate successful downloads, copy to companies_validated/{CID}/, and update coverage table
import importlib.util
from datetime import datetime

# Load validation function from scripts
validate_script_path = repo / 'scripts' / 'validate_reports.py'
spec = importlib.util.spec_from_file_location("validate_reports", validate_script_path)
validate_module = importlib.util.module_from_spec(spec)
spec.loader.exec_module(validate_module)
validate_pdf = validate_module.validate_pdf

# Setup directories
validated_dir = repo / 'companies_validated'
validated_dir.mkdir(exist_ok=True)

# Reload coverage table (in case it was modified)
coverage_df = pd.read_csv(coverage_path, sep='\t')

validation_records = []

for _, dl in successes.iterrows():
    cid = dl['cid']
    company = dl['company']
    year = dl['year']
    output_path = dl.get('output_path', '')
    
    if not output_path:
        output_path = str(repo / 'companies' / cid / f"{cid}_{year}_Annual_Report.pdf")
    
    pdf_path = Path(output_path)
    
    if not pdf_path.exists():
        validation_records.append({
            'company': company, 'cid': cid, 'year': year,
            'valid': False, 'issues': 'file missing after download'
        })
        print(f"âš  Missing file: {pdf_path}")
        continue

    # Run validation
    validation = validate_pdf(str(pdf_path), company_name=company, expected_year=year)
    is_valid = validation.get('valid', False)
    issues = '; '.join(validation.get('issues', [])) if validation.get('issues') else ''
    size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
    
    validation_records.append({
        'company': company, 'cid': cid, 'year': year,
        'valid': is_valid, 'issues': issues,
        'confidence': validation.get('confidence'),
        'pages': validation.get('pages'),
        'size_mb': size_mb
    })

    if is_valid:
        # Copy to validated directory
        dest_dir = validated_dir / cid
        dest_dir.mkdir(exist_ok=True)
        dest_path = dest_dir / pdf_path.name
        dest_path.write_bytes(pdf_path.read_bytes())
        print(f"âœ“ Validated: {company} {year} ({validation.get('pages')} pages, {validation.get('confidence'):.1f}% confidence)")

        # Update coverage table
        mask = (coverage_df['Company_Identifier'] == cid) & (coverage_df['FiscalYear'] == year)
        coverage_df.loc[mask, 'Report_URL'] = dl.get('url', '')
        coverage_df.loc[mask, 'Source_Page'] = dl.get('source_page', '')
        coverage_df.loc[mask, 'Pages'] = validation.get('pages', '')
        coverage_df.loc[mask, 'Size_MB'] = f"{size_mb:.2f}"
        coverage_df.loc[mask, 'Validation_Status'] = 'Valid'
        coverage_df.loc[mask, 'Validation_Confidence'] = f"{validation.get('confidence', 0.0):.1f}%"
        coverage_df.loc[mask, 'Validation_Issues'] = issues
        coverage_df.loc[mask, 'Validation_Date'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
        coverage_df.loc[mask, 'Priority'] = 'Complete âœ“'
        coverage_df.loc[mask, 'CaptureStatus'] = 'Validated'
    else:
        print(f"âœ— Failed validation: {company} {year} ({issues})")

# Summary
print('\n' + '=' * 50)
print('Validation Summary')
print('=' * 50)
valid_count = sum(1 for r in validation_records if r['valid'])
print(f"âœ“ Valid: {valid_count} / {len(validation_records)}")
print(f"âœ— Invalid: {len(validation_records) - valid_count}")

if any(not r['valid'] for r in validation_records):
    print('\nInvalid reports:')
    for r in validation_records:
        if not r['valid']:
            print(f"  âœ— {r['company']} ({r['cid']}) {r['year']}: {r.get('issues','')}")

# Save updated coverage table
coverage_df.to_csv(coverage_path, sep='\t', index=False)
print(f"\nâœ“ Coverage table updated: {coverage_path}")

âœ— Failed validation: Lifco B 2020 (Year 2020 not found in PDF)
âœ— Failed validation: Sv. Handelsbanken A 2019 (Company name "Sv. Handelsbanken A" not found in PDF)
âœ— Failed validation: Sv. Handelsbanken A 2021 (Company name "Sv. Handelsbanken A" not found in PDF)
âœ— Failed validation: Sv. Handelsbanken A 2022 (Company name "Sv. Handelsbanken A" not found in PDF)
âœ— Failed validation: Sv. Handelsbanken A 2023 (Company name "Sv. Handelsbanken A" not found in PDF)
âœ— Failed validation: Sv. Handelsbanken A 2024 (Company name "Sv. Handelsbanken A" not found in PDF)
âœ— Failed validation: Hennes & Mauritz B 2022 (Company name "Hennes & Mauritz B" not found in PDF)
âœ— Failed validation: Hennes & Mauritz B 2020 (Company name "Hennes & Mauritz B" not found in PDF)
âœ— Failed validation: Hennes & Mauritz B 2023 (Company name "Hennes & Mauritz B" not found in PDF)
âœ— Failed validation: Hennes & Mauritz B 2024 (Company name "Hennes & Mauritz B" not found in PDF)
âœ— Failed validation: No

ignore '/Perms' verify failed
ignore '/Perms' verify failed


âœ— Failed validation: Nordea Bank Abp 2023 (Company name "Nordea Bank Abp" not found in PDF)
âœ— Failed validation: Nordea Bank Abp 2024 (Company name "Nordea Bank Abp" not found in PDF)
âœ— Failed validation: Hennes & Mauritz B 2022 (Company name "Hennes & Mauritz B" not found in PDF)
âœ— Failed validation: Hennes & Mauritz B 2024 (Company name "Hennes & Mauritz B" not found in PDF)
âœ— Failed validation: Nordea Bank Abp 2019 (Company name "Nordea Bank Abp" not found in PDF)
âœ— Failed validation: Hennes & Mauritz B 2023 (Company name "Hennes & Mauritz B" not found in PDF)


ignore '/Perms' verify failed
ignore '/Perms' verify failed


âœ— Failed validation: Nordea Bank Abp 2023 (Company name "Nordea Bank Abp" not found in PDF)
âœ— Failed validation: Nordea Bank Abp 2022 (Company name "Nordea Bank Abp" not found in PDF)
âœ— Failed validation: Nordea Bank Abp 2024 (Company name "Nordea Bank Abp" not found in PDF)

Validation Summary
âœ“ Valid: 0 / 22
âœ— Invalid: 22

Invalid reports:
  âœ— Lifco B (S17) 2020: Year 2020 not found in PDF
  âœ— Sv. Handelsbanken A (S26) 2019: Company name "Sv. Handelsbanken A" not found in PDF
  âœ— Sv. Handelsbanken A (S26) 2021: Company name "Sv. Handelsbanken A" not found in PDF
  âœ— Sv. Handelsbanken A (S26) 2022: Company name "Sv. Handelsbanken A" not found in PDF
  âœ— Sv. Handelsbanken A (S26) 2023: Company name "Sv. Handelsbanken A" not found in PDF
  âœ— Sv. Handelsbanken A (S26) 2024: Company name "Sv. Handelsbanken A" not found in PDF
  âœ— Hennes & Mauritz B (S13) 2022: Company name "Hennes & Mauritz B" not found in PDF
  âœ— Hennes & Mauritz B (S13) 2020: Company name "Henn

In [31]:
# Append new validation records to validation_results.csv
if validation_records:
    new_records = []
    for r in validation_records:
        if r['valid']:
            cid = r['cid']
            year = r['year']
            company = r['company']
            pdf_name = f"{cid}_{year}_Annual_Report.pdf"
            
            # Find the download result for this record
            dl_info = successes[(successes['cid'] == cid) & (successes['year'] == year)].iloc[0] if len(successes[(successes['cid'] == cid) & (successes['year'] == year)]) > 0 else {}
            
            new_records.append({
                'CID': cid,
                'Company_Name': company,
                'Year': year,
                'Filename': pdf_name,
                'Valid': True,
                'Confidence': r.get('confidence', 0),
                'Pages': r.get('pages', 0),
                'Company_Found': True,
                'Year_Found': True,
                'Issues': r.get('issues', ''),
                'Source_Path': f"companies/{cid}/{pdf_name}",
                'Validated_Date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'IR_URL': dl_info.get('source_page', '') if isinstance(dl_info, dict) else (dl_info['source_page'] if 'source_page' in dl_info.index else ''),
                'Pattern_Type': 'Downloaded via notebook'
            })
    
    if new_records:
        # Load existing validation results
        existing_validation = pd.read_csv(validation_path)
        new_df = pd.DataFrame(new_records)
        
        # Remove duplicates (in case we're re-running)
        existing_keys = set(zip(existing_validation['CID'], existing_validation['Year']))
        new_df = new_df[~new_df.apply(lambda r: (r['CID'], r['Year']) in existing_keys, axis=1)]
        
        if len(new_df) > 0:
            updated_validation = pd.concat([existing_validation, new_df], ignore_index=True)
            updated_validation.to_csv(validation_path, index=False)
            print(f"âœ“ Added {len(new_df)} new records to validation_results.csv")
        else:
            print("â„¹ All validated records already exist in validation_results.csv")
else:
    print("â„¹ No validation records to add")