In [29]:
import pandas as pd
from pathlib import Path
import importlib
import aspiratio.utils.ir_search as ir_search_module

# Reload to pick up latest changes
importlib.reload(ir_search_module)
from aspiratio.utils.ir_search import search_ir_url

repo = Path('..').resolve()
master_path = repo / 'instrument_master.csv'
coverage_path = repo / 'coverage_table_updated.csv'
master = pd.read_csv(master_path, sep='\t')
coverage = pd.read_csv(coverage_path, sep='\t')

needs_ir = master[master['investor_relations_url'].isna() | (master['investor_relations_url'] == '')]
print(f"Companies needing IR URLs: {len(needs_ir)}")
needs_ir[['CID', 'CompanyName']].head()

Companies needing IR URLs: 355


Unnamed: 0,CID,CompanyName
36,S37,Acrinova B
37,S38,Actic Group
38,S39,Active Biotech
41,S42,Alimak Group
43,S44,Alligator Bioscience


In [30]:
# Search IR URLs for missing companies
# Score > 200 = high confidence, auto-validate
VALIDATION_THRESHOLD = 200

updated = []
for _, row in needs_ir.iterrows():
    cid = row['CID']
    name = row['CompanyName']
    print(f'Finding IR URL for {name} ({cid})')
    ir_url, score = search_ir_url(name)
    updated.append((cid, ir_url, score))

for cid, ir_url, score in updated:
    if ir_url:
        master.loc[master['CID'] == cid, 'investor_relations_url'] = ir_url
        # Auto-validate if score is above threshold
        if score > VALIDATION_THRESHOLD:
            master.loc[master['CID'] == cid, 'validated'] = True
            print(f"✓ {cid}: Auto-validated (score {score})")

master.to_csv(master_path, sep='\t', index=False)
print(f"\nUpdated {len([u for u in updated if u[1]])} companies with IR URLs")
print(f"Auto-validated {len([u for u in updated if u[2] > VALIDATION_THRESHOLD])} companies (score > {VALIDATION_THRESHOLD})")
master[master['CID'].isin([cid for cid, _, _ in updated])][['CID', 'CompanyName', 'investor_relations_url', 'validated']].head(10)

Finding IR URL for Acrinova B (S37)
Searching for IR URL with queries: ['Acrinova investor relations', 'Acrinova investors', 'Acrinova investor relations site', 'Acrinova financial reports', 'Acrinova investerare', 'Acrinova investors official site', '"Acrinova" investor relations', 'Acrinova årsredovisning']
Searching with DuckDuckGo...
Scored https://www.acrinova.se/hem/investor-relations/: 280 (Preferred TLD boost (.se) (+50), Swedish .se TLD extra boost (+30), Domain segment exact match (+45), IR path/subdomain boost (+40), Company in title (+20), Company in desc (+10), IR keyword in title (+15), IR keyword in desc (+10), Strong IR content validation (+60, 11 terms))
Found high-confidence match (score 280), stopping search.
Best IR URL: https://www.acrinova.se/hem/investor-relations/ (score 280)
Finding IR URL for Actic Group (S38)
Searching for IR URL with queries: ['Actic Group investor relations', 'Actic investors', 'Actic investor relations site', 'Actic financial reports', 'Ac

  ir_url, score = search_ir_url(name)


Scored https://www.acticgroup.se/en/financial-information: 245 (Preferred TLD boost (.se) (+50), Swedish .se TLD extra boost (+30), Group boost (+30), Domain segment substring match (+40), Company in title (+20), IR keyword in title (+15), Strong IR content validation (+60, 11 terms))
Found high-confidence match (score 245), stopping search.
Best IR URL: https://www.acticgroup.se/en/financial-information (score 245)
Finding IR URL for Active Biotech (S39)
Searching for IR URL with queries: ['Active Biotech investor relations', 'Active Biotech investors', 'Active Biotech investor relations site', 'Active Biotech financial reports', 'Active Biotech investerare', 'Active Biotech investors official site', '"Active Biotech" investor relations', 'Active Biotech årsredovisning']
Searching with DuckDuckGo...


  ir_url, score = search_ir_url(name)


Scored http://investors.aduro.com.cutestat.com/: 115 (Preferred TLD boost (.com) (+50), Subdomain boost (investors) (+40), IR keyword in title (+15), Weak IR content validation (+10, 2 terms))
Found high-confidence match (score 115), stopping search.
Best IR URL: http://investors.aduro.com.cutestat.com/ (score 115)
Finding IR URL for Alimak Group (S42)
Searching for IR URL with queries: ['Alimak Group investor relations', 'Alimak investors', 'Alimak investor relations site', 'Alimak financial reports', 'Alimak investerare', 'Alimak Group investors official site', '"Alimak" investor relations', 'Alimak årsredovisning']
Searching with DuckDuckGo...


  ir_url, score = search_ir_url(name)


Scored https://corporate.alimakgroup.com/en/about-us/organisation/: 235 (Preferred TLD boost (.com) (+50), Group boost (+30), Domain segment substring match (+40), Company in title (+20), Company in desc (+10), IR keyword in title (+15), IR keyword in desc (+10), Strong IR content validation (+60, 9 terms))
Found high-confidence match (score 235), stopping search.
Best IR URL: https://corporate.alimakgroup.com/en/about-us/organisation/ (score 235)
Finding IR URL for Alligator Bioscience (S44)
Searching for IR URL with queries: ['Alligator Bioscience investor relations', 'Alligator Bioscience investors', 'Alligator Bioscience investor relations site', 'Alligator Bioscience financial reports', 'Alligator Bioscience investerare', 'Alligator Bioscience investors official site', '"Alligator Bioscience" investor relations', 'Alligator Bioscience årsredovisning']
Searching with DuckDuckGo...


  ir_url, score = search_ir_url(name)


Scored https://news.cision.com/alligator-bioscience/r/alligator-bioscience-to-participate-in-upcoming-investor-conferences,c3482570: -25 (Aggregator penalty (news.cision.com) (-100), Preferred TLD boost (.com) (+50), Very long path penalty (-30), Company in title (+20), Company in desc (+10), IR keyword in title (+15), Weak IR content validation (+10, 2 terms))
Scored https://news.cision.com/alligator-bioscience: 50 (Aggregator penalty (news.cision.com) (-100), Preferred TLD boost (.com) (+50), Company in title (+20), Company in desc (+10), IR keyword in desc (+10), Strong IR content validation (+60, 5 terms))
Scored https://news.cision.com/alligator-bioscience/r/alligator-bioscience-appoints-chief-scientific-officer,c3268220: -40 (Aggregator penalty (news.cision.com) (-100), Preferred TLD boost (.com) (+50), Very long path penalty (-30), Company in title (+20), Company in desc (+10), Weak IR content validation (+10, 2 terms))
Scored https://www.potterclarkson.com:443/case-studies/stra

  ir_url, score = search_ir_url(name)


DuckDuckGo search error for query 'Alligo investors official site': No results found.
Scored https://www.alligo.com/en/investor-relations/: 290 (Preferred TLD boost (.com) (+50), Domain segment exact match (+45), IR path/subdomain boost (+40), Exact IR path match (+60), Company in title (+20), IR keyword in title (+15), Strong IR content validation (+60, 10 terms))
Found high-confidence match (score 290), stopping search.
Best IR URL: https://www.alligo.com/en/investor-relations/ (score 290)
Finding IR URL for Alvotech SDB (S46)
Searching for IR URL with queries: ['Alvotech SDB investor relations', 'Alvotech SDB investors', 'Alvotech SDB investor relations site', 'Alvotech SDB financial reports', 'Alvotech SDB investerare', 'Alvotech SDB investors official site', '"Alvotech SDB" investor relations', 'Alvotech SDB årsredovisning']
Searching with DuckDuckGo...


  ir_url, score = search_ir_url(name)


Scored https://investors.alvotech.com/news-releases/news-release-details/alvotechs-private-placement-completed-delivery-sdrs-and-shares: 145 (Preferred TLD boost (.com) (+50), Subdomain boost (investors) (+40), Very long path penalty (-30), IR keyword in title (+15), IR keyword in desc (+10), Strong IR content validation (+60, 7 terms))
Found high-confidence match (score 145), stopping search.
Best IR URL: https://investors.alvotech.com/news-releases/news-release-details/alvotechs-private-placement-completed-delivery-sdrs-and-shares (score 145)
Finding IR URL for Ambea (S47)
Searching for IR URL with queries: ['Ambea investor relations', 'Ambea investors', 'Ambea investor relations site', 'Ambea financial reports', 'Ambea investerare', 'Ambea investors official site', '"Ambea" investor relations', 'Ambea årsredovisning']
Searching with DuckDuckGo...


  data = " ".join(x.strip() for x in item.xpath(value))


DuckDuckGo search error for query 'Ambea investors official site': No results found.
Scored https://ambea.com/investor-relations/: 300 (Preferred TLD boost (.com) (+50), Domain segment exact match (+45), IR path/subdomain boost (+40), Exact IR path match (+60), Company in title (+20), Company in desc (+10), IR keyword in title (+15), Strong IR content validation (+60, 10 terms))
Found high-confidence match (score 300), stopping search.
Best IR URL: https://ambea.com/investor-relations/ (score 300)
Finding IR URL for Annehem Fastigheter B (S48)
Searching for IR URL with queries: ['Annehem Fastigheter investor relations', 'Annehem Fastigheter investors', 'Annehem Fastigheter investor relations site', 'Annehem Fastigheter financial reports', 'Annehem Fastigheter investerare', 'Annehem Fastigheter investors official site', '"Annehem Fastigheter" investor relations', 'Annehem Fastigheter årsredovisning', 'Annehem Fastigheter investerare']
Searching with DuckDuckGo...


  ir_url, score = search_ir_url(name)


DuckDuckGo search error for query 'Annehem Fastigheter investors official site': No results found.
Scored https://quartr.com/companies/annehem-fastigheter_14810: 65 (Aggregator penalty (quartr.com) (-100), Preferred TLD boost (.com) (+50), Company in title (+20), Company in desc (+10), IR keyword in title (+15), IR keyword in desc (+10), Strong IR content validation (+60, 6 terms))
Scored https://financialreports.eu/companies/annehem-fastigheter/: 30 (Aggregator penalty (financialreports.eu) (-100), Acceptable TLD boost (.eu) (+15), Company in title (+20), Company in desc (+10), IR keyword in title (+15), IR keyword in desc (+10), Strong IR content validation (+60, 6 terms))
Scored https://www.investing.com/equities/annehem-fastigheter-ab: -70 (Aggregator penalty (investing.com) (-100), Preferred TLD boost (.com) (+50), HTTP 403 or non-HTML penalty (-20))
Scored https://www.di.se/bors/aktier/anne-b-5210344/: 70 (Aggregator penalty (di.se) (-100), Preferred TLD boost (.se) (+50), Swedis

  ir_url, score = search_ir_url(name)


DuckDuckGo search error for query 'Anoto Group investors official site': No results found.
Scored https://financialreports.eu/companies/anoto-group/: 30 (Aggregator penalty (financialreports.eu) (-100), Acceptable TLD boost (.eu) (+15), Company in title (+20), Company in desc (+10), IR keyword in title (+15), IR keyword in desc (+10), Strong IR content validation (+60, 7 terms))
Scored https://investorshangout.com/anoto-group-announces-updated-trading-dates-for-rights-issue-134738-/: -90 (Aggregator penalty (investorshangout.com) (-100), Preferred TLD boost (.com) (+50), Long path penalty (-20), HTTP 403 or non-HTML penalty (-20))
Scored https://www.anoto.com/about-us/history/: 145 (Preferred TLD boost (.com) (+50), Domain segment exact match (+45), Company in title (+20), Moderate IR content validation (+30, 3 terms))
Found high-confidence match (score 145), stopping search.
Best IR URL: https://www.anoto.com/about-us/history/ (score 145)
Finding IR URL for Apotea (S50)
Searching for 

  ir_url, score = search_ir_url(name)


DuckDuckGo search error for query 'Apotea investerare': No results found.
DuckDuckGo search error for query 'Apotea investors official site': No results found.
DuckDuckGo search error for query '"Apotea" investor relations': No results found.
DuckDuckGo search error for query 'Apotea årsredovisning': No results found.
Scored https://news.cision.com/apotea/r/number-of-shares-and-votes-in-apotea,c4085944: -20 (Aggregator penalty (news.cision.com) (-100), Preferred TLD boost (.com) (+50), Medium path/query penalty (-10), Company in title (+20), Company in desc (+10), Weak IR content validation (+10, 2 terms))
Scored https://news.cision.com/apotea: 30 (Aggregator penalty (news.cision.com) (-100), Preferred TLD boost (.com) (+50), Company in title (+20), Strong IR content validation (+60, 9 terms))
Scored https://news.cision.com/apotea/r/decision-from-imy-after-inspection-of-apotea-ab--publ-,c4085285: -40 (Aggregator penalty (news.cision.com) (-100), Preferred TLD boost (.com) (+50), Long p

  ir_url, score = search_ir_url(name)


DuckDuckGo search error for query 'Arctic Paper investor relations': No results found.
DuckDuckGo search error for query 'Arctic Paper investors': No results found.
DuckDuckGo search error for query 'Arctic Paper investor relations site': No results found.
DuckDuckGo search error for query 'Arctic Paper financial reports': No results found.
DuckDuckGo search error for query 'Arctic Paper investerare': No results found.
DuckDuckGo search error for query 'Arctic Paper investors official site': No results found.
DuckDuckGo search error for query '"Arctic Paper" investor relations': No results found.
DuckDuckGo search error for query 'Arctic Paper årsredovisning': No results found.
Few results, trying DuckDuckGo with Swedish region...
DuckDuckGo SE search error: No results found.
DuckDuckGo SE search error: No results found.
DuckDuckGo SE search error: No results found.
No valid IR URL found for Arctic Paper
Finding IR URL for Arion Banki SDB (S52)
Searching for IR URL with queries: ['Ario

  self._valid_flags = {flag.name for flag in defined_flags}
  self._valid_flags = {flag.name for flag in defined_flags}


DuckDuckGo search error for query 'Arjo årsredovisning': No results found.
Few results, trying DuckDuckGo with Swedish region...
DuckDuckGo SE search error: No results found.
DuckDuckGo SE search error: No results found.
DuckDuckGo SE search error: No results found.
No valid IR URL found for Arjo B
Finding IR URL for Arla Plast (S54)
Searching for IR URL with queries: ['Arla Plast investor relations', 'Arla Plast investors', 'Arla Plast investor relations site', 'Arla Plast financial reports', 'Arla Plast investerare', 'Arla Plast investors official site', '"Arla Plast" investor relations', 'Arla Plast årsredovisning']
Searching with DuckDuckGo...
DuckDuckGo search error for query 'Arla Plast investor relations': No results found.
DuckDuckGo search error for query 'Arla Plast investors': No results found.
DuckDuckGo search error for query 'Arla Plast investor relations site': No results found.
DuckDuckGo search error for query 'Arla Plast financial reports': No results found.
DuckDuckG

  m = attrfind_tolerant.match(rawdata, k)


Scored https://www.ascelia.com/mfn_news/ascelia-pharma-announces-management-changes-to-support-future-growth/: 120 (Preferred TLD boost (.com) (+50), Long path penalty (-20), Company in title (+20), Company in desc (+10), Strong IR content validation (+60, 7 terms))
Found high-confidence match (score 120), stopping search.
Best IR URL: https://www.ascelia.com/mfn_news/ascelia-pharma-announces-management-changes-to-support-future-growth/ (score 120)
Finding IR URL for Asker Healthcare Group (S56)
Searching for IR URL with queries: ['Asker Healthcare Group investor relations', 'Asker Healthcare investors', 'Asker Healthcare investor relations site', 'Asker Healthcare financial reports', 'Asker Healthcare investerare', 'Asker Healthcare Group investors official site', '"Asker Healthcare" investor relations', 'Asker Healthcare årsredovisning']
Searching with DuckDuckGo...
Scored https://www.asker.com/: 155 (Preferred TLD boost (.com) (+50), Company in title (+20), IR keyword in title (+15)

KeyboardInterrupt: 

In [None]:
# Sync coverage table IR URLs from master
coverage = coverage.merge(master[['CID', 'investor_relations_url']], left_on='Company_Identifier', right_on='CID', how='left')
coverage['IR_URL'] = coverage['investor_relations_url']
coverage.drop(columns=['CID', 'investor_relations_url'], inplace=True)
coverage.to_csv(coverage_path, sep='\t', index=False)
coverage[['CompanyName', 'FiscalYear', 'IR_URL']].head()

Unnamed: 0,CompanyName,FiscalYear,IR_URL
0,ABB Ltd,2019,https://global.abb/group/en/investors/annual-r...
1,ABB Ltd,2020,https://global.abb/group/en/investors/annual-r...
2,ABB Ltd,2021,https://global.abb/group/en/investors/annual-r...
3,ABB Ltd,2022,https://global.abb/group/en/investors/annual-r...
4,ABB Ltd,2023,https://global.abb/group/en/investors/annual-r...
