# **Portfolio Intelligence Through Semantic Analysis**

In [None]:
# Connect to Mongo
import pymongo

db_username = 'project_user_name'
db_password = 'password'
URI = f'url'

client = pymongo.MongoClient(URI)
print("Connected to MongoDB.")

Connected to MongoDB.


In [None]:
# ===============================
# Upload R2K Holdings
# ===============================
from datetime import datetime
import pandas as pd
today_date = datetime.today().strftime('%Y%m%d')
today_date

'20251108'

In [None]:
iwb_url = "https://www.ishares.com/us/products/239707/ishares-russell-1000-etf/1467271812596.ajax?fileType=csv&fileName=IWB_holdings"
iwb_data = pd.read_csv(iwb_url, skiprows=9, thousands=',')

iwb_data = iwb_data[iwb_data['Asset Class'] == 'Equity']
iwb_data = iwb_data.loc[iwb_data['Ticker'].str.len() <= 4]
iwb_data = iwb_data.loc[~iwb_data['Ticker'].str.contains(r'\.')]
iwb_data = iwb_data.drop(columns=['Currency', 'FX Rate', 'Market Currency',
                                  'Accrual Date', 'Notional Value', 'Quantity', 'Asset Class'])
iwb_data.columns = iwb_data.columns.str.lower().str.replace(' (%)', '').str.replace(' ', '_')
iwb_data['etf_holding_date'] = today_date

# Standardize ticker symbols
iwb_data.ticker = iwb_data.ticker.map(lambda k: {
    'BRK.B': 'BRK.B', 'LEN.B': 'LEN.B', 'BFA': 'BFA',
    'BFB': 'BFB', 'HEIA': 'HEIA'
}.get(k, k))

iwb_data

Unnamed: 0,ticker,name,sector,market_value,weight,price,location,exchange,etf_holding_date
0,NVDA,NVIDIA CORP,Information Technology,3.131587e+09,7.13,188.08,United States,NASDAQ,20251108
1,AAPL,APPLE INC,Information Technology,2.797971e+09,6.37,269.77,United States,NASDAQ,20251108
2,MSFT,MICROSOFT CORP,Information Technology,2.622815e+09,5.97,497.10,United States,NASDAQ,20251108
3,AMZN,AMAZON COM INC,Consumer Discretionary,1.658607e+09,3.77,243.04,United States,NASDAQ,20251108
5,AVGO,BROADCOM INC,Information Technology,1.168316e+09,2.66,355.59,United States,NASDAQ,20251108
...,...,...,...,...,...,...,...,...,...
1010,CAI,CARIS LIFE SCIENCES INC,Health Care,3.919142e+05,0.00,24.61,United States,NASDAQ,20251108
1011,UHAL,U HAUL HOLDING,Industrials,3.352734e+05,0.00,53.10,United States,New York Stock Exchange Inc.,20251108
1012,INGM,INGRAM MICRO HOLDING CORP,Information Technology,2.802101e+05,0.00,22.13,United States,New York Stock Exchange Inc.,20251108
1013,-,RAYONIER INC CASH ACCRUAL 2,Real Estate,1.132814e+05,0.00,1.05,United States,NO MARKET (E.G. UNLISTED),20251108


In [None]:
# ===============================
# Insert into MongoDB
# ===============================


In [None]:
collection = client['project3']['wikipedia_holdings']
collection.create_index([('ticker', 1), ('etf_holding_date', 1)], unique=True)

'ticker_1_etf_holding_date_1'

In [None]:
# Upload Wikipedia (Entity Resolution)
import wikipedia
import re
from bs4 import BeautifulSoup
import time

# --- Helper Functions ---
def clean_wiki_content(content):
    """
    Cleans Wikipedia content by removing citations, structural noise, and extra whitespace.
    """
    if not content:
        return ""

    # Remove citation markers [1], [23], etc.
    content = re.sub(r'\[\d+\]', '', content)
    # Remove editorial notes like [citation needed]
    content = re.sub(r'\[[a-zA-Z\s]+\]', '', content)

    # Remove structural sections at the end of article
    end_sections = [
        'See also', 'References', 'External links',
        'Further reading', 'Notes', 'Citations'
    ]
    for section in end_sections:
        content = re.split(rf'\n==\s*{section}\s*==\n', content, flags=re.IGNORECASE)[0]

    # Remove excessive whitespace
    content = re.sub(r'\n{3,}', '\n\n', content)

    return content.strip()


def parse_vcard(html_content):
    """
    Parses the Wikipedia infobox (vcard) from the page's HTML.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    infobox = soup.find('table', class_=['infobox', 'vcard'])

    if not infobox:
        return {}

    vcard_data = {}
    for row in infobox.find_all('tr'):
        header = row.find('th')
        data = row.find('td')
        if header and data:
            key = header.get_text(strip=True).replace('\xa0', ' ')
            value = data.get_text(separator=' ', strip=True).replace('\xa0', ' ')
            value = re.sub(r'\[\d+\]', '', value)
            if key and value:
                vcard_data[key] = value

    return vcard_data


def fetch_wikipedia_data(company_name, ticker, url=''):
    """
    Fetches and validates Wikipedia data (URL, vcard, content) for a company.
    Returns: (url, vcard_dict, cleaned_content)
    """
    # Step 1: Determine Page Title (from URL or search)
    if url:
        try:
            page_title = url.split('/wiki/')[-1]
            if not page_title:
                raise ValueError(f"Invalid URL format: {page_title}")
            print(f"[INFO] Using provided URL. Extracted title: {page_title}")
        except Exception as e:
            print(f"[FAIL] Could not parse provided URL '{url}': {e}")
            return None, None, None
    else:
        print(f"[INFO] No URL provided. Searching for: {company_name}")
        try:
            search_results = wikipedia.search(company_name, results=1)
            if not search_results:
                print(f"[FAIL] No page found via search for: {company_name}")
                return None, None, None
            page_title = search_results[0]
        except Exception as e:
            print(f"[FAIL] Wikipedia search failed for {company_name}: {e}")
            return None, None, None

    # Step 2: Get Page Object
    if not page_title:
        print(f"[FAIL] Could not determine page title for {company_name}.")
        return None, None, None

    try:
        page = wikipedia.page(page_title, auto_suggest=False, redirect=True)
    except wikipedia.exceptions.PageError as e:
        print(f"[FAIL] Page '{page_title}' does not exist ({e}).")
        return None, None, None
    except wikipedia.exceptions.DisambiguationError as e:
        print(f"[FAIL] Page '{page_title}' is ambiguous: {e}")
        return None, None, None
    except Exception as e:
        print(f"[FAIL] Unhandled error getting page '{page_title}': {e}")
        return None, None, None

    # Step 3: Extract Data
    url = page.url
    vcard_dict = parse_vcard(page.html())
    print(vcard_dict)
    cleaned_content = clean_wiki_content(page.content)

    # Step 4: Validate ticker presence
    if ticker not in vcard_dict.get('Traded as', ''):
        print(f"[FAIL] Ticker '{ticker}' not found in vcard or content for page '{url}'.")
        return None, None, None

    return url, vcard_dict, cleaned_content


In [None]:
# Example tests
fetch_wikipedia_data('SentinelOne Inc', 'S')

[INFO] No URL provided. Searching for: SentinelOne Inc
{'Formerly': 'Sentinel Labs, Inc. (2013–2021)', 'Company type': 'Public', 'Traded as': 'NYSE : S Russell 1000 component', 'Industry': 'Cybersecurity', 'Founded': '2013 ; 12 years ago ( 2013 )', 'Founders': 'Ehud Shamir Tomer Weingarten Almog Cohen', 'Headquarters': 'Mountain View, California , U.S.', 'Key people': 'Tomer Weingarten ( chairman & CEO )', 'Revenue': 'US$ 821 million (2025)', 'Operating income': 'US$−329 million (2025)', 'Net income': 'US$−288 million (2025)', 'Total assets': 'US$2.41 billion (2025)', 'Total equity': 'US$1.67 billion (2025)', 'Number of employees': 'c. 2,800 (2024)', 'Subsidiaries': 'Scalyr Attivo Networks PinnacleOne PingSafe', 'Website': 'sentinelone .com'}


('https://en.wikipedia.org/wiki/SentinelOne',
 {'Formerly': 'Sentinel Labs, Inc. (2013–2021)',
  'Company type': 'Public',
  'Traded as': 'NYSE : S Russell 1000 component',
  'Industry': 'Cybersecurity',
  'Founded': '2013 ; 12 years ago ( 2013 )',
  'Founders': 'Ehud Shamir Tomer Weingarten Almog Cohen',
  'Headquarters': 'Mountain View, California , U.S.',
  'Key people': 'Tomer Weingarten ( chairman & CEO )',
  'Revenue': 'US$ 821 million (2025)',
  'Operating income': 'US$−329 million (2025)',
  'Net income': 'US$−288 million (2025)',
  'Total assets': 'US$2.41 billion (2025)',
  'Total equity': 'US$1.67 billion (2025)',
  'Number of employees': 'c. 2,800 (2024)',
  'Subsidiaries': 'Scalyr Attivo Networks PinnacleOne PingSafe',
  'Website': 'sentinelone .com'},
 'SentinelOne, Inc. is an American cybersecurity company listed on NYSE based in Mountain View, California. The company was founded in 2013 by Tomer Weingarten, Almog Cohen and Ehud ("Udi") Shamir. Weingarten acts as the com

In [None]:
fetch_wikipedia_data('Walt Disney', 'DIS')
fetch_wikipedia_data('Walt Disney', 'DIS', 'https://en.wikipedia.org/wiki/The_Walt_Disney_Company')

[INFO] No URL provided. Searching for: Walt Disney
{'Born': '( 1901-12-05 ) December 5, 1901 Chicago , Illinois, U.S.', 'Died': 'December 15, 1966 (1966-12-15) (aged 65) Burbank, California , U.S.', 'Resting place': 'Forest Lawn Memorial Park , Glendale, California , U.S.', 'Occupations': 'Animator film producer voice actor entrepreneur', 'Title': 'President of the Walt Disney Company [ 1 ]', 'Spouse': 'Lillian Bounds \u200b ( m. 1925) \u200b', 'Children': '2, including Diane Disney Miller', 'Relatives': 'Disney family', 'Awards': '26 Academy Awards [ a ] 3 Golden Globe Awards 1 Emmy Award'}
[FAIL] Ticker 'DIS' not found in vcard or content for page 'https://en.wikipedia.org/wiki/Walt_Disney'.
[INFO] Using provided URL. Extracted title: The_Walt_Disney_Company
{'Formerly': 'Disney Brothers Cartoon Studio (1923–1926) Walt Disney Studio (1926–1929) Walt Disney Productions (1929–1986)', 'Company type': 'Public', 'Traded as': 'NYSE : DIS DJIA component S&P 100 component S&P 500 component',

('https://en.wikipedia.org/wiki/The_Walt_Disney_Company',
 {'Formerly': 'Disney Brothers Cartoon Studio (1923–1926) Walt Disney Studio (1926–1929) Walt Disney Productions (1929–1986)',
  'Company type': 'Public',
  'Traded as': 'NYSE : DIS DJIA component S&P 100 component S&P 500 component',
  'ISIN': 'US2546871060',
  'Industry': 'Media entertainment',
  'Predecessor': 'Laugh-O-Gram Studio',
  'Founded': 'October 16, 1923 ; 102 years ago ( 1923-10-16 )',
  'Founders': 'Walt Disney Roy O. Disney',
  'Headquarters': 'Walt Disney Studios , Burbank, California , US',
  'Area served': 'Worldwide',
  'Key people': 'James P. Gorman ( chairman ) Bob Iger ( CEO )',
  'Revenue': 'US$ 91.361 billion (2024)',
  'Operating income': 'US$15.601 billion (2024)',
  'Net income': 'US$4.972 billion (2024)',
  'Total assets': 'US$196.219 billion (2024)',
  'Total equity': 'US$105.522 billion (2024)',
  'Number of employees': '225,000 (2023)',
  'Divisions': 'Disney Entertainment Disney Experiences ESPN (

In [None]:
# =========================================
# Initialize the Wikipedia API
# =========================================

# Set a custom user agent as required by Wikipedia's terms of service
wikipedia.set_user_agent('LRCM_Project (merlin.gemini@fordham.edu)')

# Pull all records that have not been Wikipedia-resolved yet
todo_df = pd.DataFrame(collection.find({"wiki_resolver": {"$exists": False}}))

# Iterate over each company row in the holdings DataFrame
for row in todo_df.itertuples():
    ticker = row.ticker
    company_name = row.name
    print(f"Processing: {ticker} ({company_name})")

    try:
        # Fetch Wikipedia data (url, vcard, content)
        url, vcard, content = fetch_wikipedia_data(company_name, ticker)

        if url:
            doc = {
                "wiki_url": url,
                "wiki_content": content,
                "wiki_vcard": vcard,
                "wiki_resolver": "wikipedia"
            }

            # Update MongoDB record
            collection.update_one(
                {"ticker": ticker, "etf_holding_date": row.etf_holding_date},
                {"$set": doc}
            )

            print(f"[SUCCESS] Fetched data for {ticker}: {url}")
        else:
            print(f"[FAIL] No valid Wikipedia data for {ticker}.")

    except Exception as e:
        print(f"[ERROR] Unhandled exception for {ticker}: {e}")

    # Be polite to Wikipedia’s servers
    time.sleep(0.3)

print("\n--- Fetching Complete ---")



--- Fetching Complete ---


In [None]:
# Validation Checks

# 1️ : Check which records are still unresolved
todo_df = pd.DataFrame(collection.find({"wiki_resolver": {"$exists": False}}))
print(todo_df.shape)


(0, 0)


In [None]:
# 2️: Check which ones are resolved
done_df = pd.DataFrame(collection.find({"wiki_resolver": {"$exists": True}}))

# Check for duplicate URLs
s = done_df.wiki_url.value_counts()
s[s > 1]

wiki_url
https://en.wikipedia.org/wiki/CrowdStrike        2
https://en.wikipedia.org/wiki/Fox_Corporation    2
https://en.wikipedia.org/wiki/Zillow             2
https://en.wikipedia.org/wiki/News_Corp          2
https://en.wikipedia.org/wiki/Under_Armour       2
Name: count, dtype: int64

In [None]:
# Example: view one company’s Mongo record
list(collection.find({"ticker": "S"}))

[{'_id': ObjectId('690c2f0530de78ca9390acdd'),
  'ticker': 'S',
  'name': 'SENTINELONE INC CLASS A',
  'sector': 'Information Technology',
  'market_value': 3309927.74,
  'weight': 0.01,
  'price': 16.87,
  'location': 'United States',
  'exchange': 'New York Stock Exchange Inc.',
  'etf_holding_date': '20251106',
  'wiki_content': 'CrowdStrike Holdings, Inc. is an American cybersecurity technology company based in Austin, Texas. It provides endpoint security, threat intelligence, and cyberattack response services.\nCrowdstrike has investigated several high-profile cyberattacks, including the 2014 Sony Pictures hack, the 2015-16 cyberattacks on the Democratic National Committee (DNC), and the 2016 email leak involving the DNC. On July 19, 2024, it issued a faulty update to its security software that caused global computer outages that disrupted air travel, banking, broadcasting, and other services.\n\n== History ==\nCrowdStrike was co-founded in 2011 by George Kurtz (CEO), Dmitri Alper

In [None]:
# =========================================
# Regex-Based Data Quality Check
# =========================================
# Detect mismatched or incorrect Wikipedia entries
query = {
    "$and": [
        {
            "$expr": {
                "$eq": [
                    False,
                    {
                        "$regexMatch": {
                            "input": {"$replaceAll": {"input": "$wiki_content", "find": "'", "replacement": ""}},
                            "regex": {
                                "$let": {
                                    "vars": {
                                        "firstWord": {"$arrayElemAt": [{"$split": ["$name", " "]}, 0]}
                                    },
                                    "in": {"$substrCP": ["$$firstWord", 0, 6]}
                                }
                            },
                            "options": "i"  # Case-insensitive
                        }
                    }
                ]
            }
        },
        {"wiki_resolver": {"$exists": True}}
    ]
}



In [None]:
# Execute query
mismatched_docs = list(collection.find(query))
mismatched_docs

[{'_id': ObjectId('690c2f0530de78ca9390a990'),
  'ticker': 'BRKB',
  'name': 'BERKSHIRE HATHAWAY INC CLASS B',
  'sector': 'Financials',
  'market_value': 639616318.98,
  'weight': 1.45,
  'price': 487.66,
  'location': 'United States',
  'exchange': 'New York Stock Exchange Inc.',
  'etf_holding_date': '20251106',
  'wiki_content': '',
  'wiki_vcard': {},
  'wiki_resolver': 'yfinance'},
 {'_id': ObjectId('690c2f0530de78ca9390aa73'),
  'ticker': 'GWW',
  'name': 'WW GRAINGER INC',
  'sector': 'Industrials',
  'market_value': 30032479.53,
  'weight': 0.07,
  'price': 955.87,
  'location': 'United States',
  'exchange': 'New York Stock Exchange Inc.',
  'etf_holding_date': '20251106',
  'wiki_content': 'W. W. Grainger, Inc. is an American Fortune 500 industrial supply company founded in 1927 in Chicago by William W. (Bill) Grainger. He founded the company to provide consumers with access to a consistent supply of motors.  The company now serves more than 4.5 million customers worldwide w

In [None]:

# =========================================
# Unset documents that fail data quality check
# =========================================
for doc in mismatched_docs:
    print(doc["ticker"], doc["name"])
    update_operation = {
        "$unset": {
            "wiki_resolver": ""  # Just removes the key
        }
    }
    result = collection.update_one({"_id": doc["_id"]}, update_operation)


BRKB BERKSHIRE HATHAWAY INC CLASS B
GWW WW GRAINGER INC
WRB WR BERKLEY CORP
HEIA HEICO CORP CLASS A
ELS EQUITY LIFESTYLE PROPERTIES REIT I
S SENTINELONE INC CLASS A
BFB BROWN FORMAN CORP CLASS B
BFA BROWN FORMAN CORP CLASS A
LENB LENNAR CORP CLASS B
NIQ NIQ GLOBAL INTELLIGENCE PLC


In [None]:

# =========================================
# Bing Search (Fallback for Missing Wikipedia Pages)

import requests
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import quote
from random_user_agent.user_agent import UserAgent
from random_user_agent.params import SoftwareName, OperatingSystem

software_names = [SoftwareName.CHROME.value]
operating_systems = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]
user_agent_rotator = UserAgent(software_names=software_names, operating_systems=operating_systems, limit=100)

query = quote("Wikipedia W R Berkley Corp")
url = "https://www.bing.com/search?q=" + query
res = requests.get(url, headers={'user-agent': user_agent_rotator.get_random_user_agent()})

if res.status_code == 200:
    soup = BeautifulSoup(res.text, "html.parser")
else:
    print(f"Error: {res.status_code}")

In [None]:

# =========================================
# Selenium Fallback (Stealth Scraper)
# =========================================


from selenium import webdriver
import time
import re
from urllib.parse import quote

software_names = [SoftwareName.CHROME.value]
operating_systems = [OperatingSystem.WINDOWS.value, OperatingSystem.LINUX.value]
user_agent_rotator = UserAgent(software_names=software_names, operating_systems=operating_systems, limit=100)

# Close existing driver
try:
    driver.quit()
except:
    pass

# Initialize Chrome driver
driver = webdriver.Chrome()
driver.execute_cdp_cmd("Network.setUserAgentOverride", {"userAgent": user_agent_rotator.get_random_user_agent()})


{}

In [None]:
# Bing Search Automation Function
# =========================================
def search_bing(query):
    driver.get("https://www.bing.com/")
    time.sleep(2)

    search_box = driver.find_element("name", "q")
    search_box.send_keys(query)
    search_box.submit()
    time.sleep(2)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    try:
        soup = soup.find("main", {"aria-label": "Search Results"})
        tags = [t for t in soup.find_all("a", href=True) if "Wikipedia" in t.text]
        return ''.join([t["href"] for t in tags if "wikipedia.org" in t["href"]])
    except Exception:
        search_results = [a["href"] for a in soup.find_all("a", href=True) if "wikipedia" in a["href"]]
        if search_results:
            return search_results[0]


In [None]:
# Example
search_bing("SENTINELONE INC CLASS A Wikipedia")
# -> 'https://en.wikipedia.org/wiki/SentinelOne'

''

In [None]:
# =========================================
# Determine Exchange Prefix and Run Bing + Wikipedia Fetch
# =========================================
print(todo_df.exchange.unique())  # e.g., ['New York Stock Exchange Inc.', 'NASDAQ']

AttributeError: 'DataFrame' object has no attribute 'exchange'

In [None]:
# Initialize Wikipedia API
wikipedia.set_user_agent('LRCM_Project (merlin.gemini@fordham.edu)')

todo_df = pd.DataFrame(collection.find({"wiki_resolver": {"$exists": False}}))

# Iterate and use Bing fallback
for row in todo_df.itertuples():
    ticker = row.ticker
    company_name = row.name

    # Determine exchange prefix
    if row.exchange == "New York Stock Exchange Inc.":
        tickerexch = f"NYSE:{ticker}"
    else:
        tickerexch = f"NASDAQ:{ticker}"

    print(f"Processing: {tickerexch} {company_name}")

    # Use Bing to search for Wikipedia page
    url = search_bing(f"{tickerexch} {company_name} Wikipedia")

    try:
        # Fetch Wikipedia content
        url, vcard, content = fetch_wikipedia_data(company_name, ticker, url)

        if url:
            doc = {
                "ticker": ticker,
                "company_name": company_name,
                "wiki_url": url,
                "wiki_content": content,
                "wiki_vcard": vcard,
                "wiki_resolver": "bing"
            }

            # Update Mongo
            collection.update_one(
                {"ticker": ticker, "etf_holding_date": row.etf_holding_date},
                {"$set": doc}
            )

            print(f"[SUCCESS] Fetched data for {ticker}: {url}")
        else:
            print(f"[FAIL] No valid Wikipedia data for {ticker}.")
    except Exception as e:
        print(f"[ERROR] Unhandled exception for {ticker}: {e}")

    # Delay to avoid rate limiting
    time.sleep(1)

print("\n--- Fetching Complete ---")


Processing: NYSE:BRKB BERKSHIRE HATHAWAY INC CLASS B
[INFO] No URL provided. Searching for: BERKSHIRE HATHAWAY INC CLASS B
{'Company type': 'Public', 'Traded as': 'NYSE : BRK.A ( Class A ) NYSE : BRK.B ( Class B ) S&P 100 component (BRK.B) S&P 500 component (BRK.B)', 'ISIN': 'US0846707026', 'Industry': 'Conglomerate', 'Predecessor': 'Valley Falls Company (1839–1929) Berkshire Fine Spinning Associates (1929–1955) Hathaway Manufacturing Company (1888–1955)', 'Founded': '1839 ; 186 years ago ( 1839 )', 'Founder': 'Oliver Chace', 'Headquarters': 'Blackstone Plaza , Omaha, Nebraska , U.S.', 'Area served': 'Worldwide', 'Key people': 'Warren Buffett (chairman and CEO) Greg Abel (vice chairman, Non-Insurance) Ajit Jain (vice chairman, Insurance)', 'Products': 'List Property & casualty insurance Reinsurance Rail transport Electric power Natural gas Real estate services Industrial parts & materials Mobile homes Building materials Recreational vehicles Apparel Retail stores & services Aviation se

In [None]:
# =========================================
# Test Individual Wikipedia Fetch
# =========================================
fetch_wikipedia_data("SPOTIFY TECHNOLOGY SA", "S")

[INFO] No URL provided. Searching for: SPOTIFY TECHNOLOGY SA
{'Type of business': 'Public', 'Traded as': 'NYSE : SPOT Russell 1000 component', 'Founded': '23 April 2006 ; 19 years ago ( 2006-04-23 )', 'Headquarters': 'Luxembourg , Luxembourg (registered) [ 1 ] Stockholm , Sweden (operational) [ 1 ]', 'Country of origin': 'Sweden', 'No. of locations': '15 offices [ 2 ]', 'Area served': 'Worldwide (except blocked countries)', 'Founders': 'Daniel Ek Martin Lorentzon', 'Key people': 'Daniel Ek (Chairman & CEO) Martin Lorentzon (treasurer)', 'Industry': 'Audio streaming Podcasting', 'Revenue': '€15.67 billion (2024) [ 1 ]', 'Operating income': '€1.365 billion (2024) [ 1 ]', 'Net income': '€1.138 billion (2024) [ 1 ]', 'Total assets': '€12.01 billion (2024) [ 1 ]', 'Total equity': '€5.525 billion (2024) [ 1 ]', 'Employees': '7,323 (September 2025) [ 3 ]', 'Subsidiaries': 'Spotify AB [ 1 ] :\u200a38 Spotify USA Inc. [ 1 ] :\u200a38 Spotify Ltd (UK) [ 1 ] :\u200a38 Several other regional subsi

('https://en.wikipedia.org/wiki/Spotify',
 {'Type of business': 'Public',
  'Traded as': 'NYSE : SPOT Russell 1000 component',
  'Founded': '23 April 2006 ; 19 years ago ( 2006-04-23 )',
  'Headquarters': 'Luxembourg , Luxembourg (registered) [ 1 ] Stockholm , Sweden (operational) [ 1 ]',
  'Country of origin': 'Sweden',
  'No. of locations': '15 offices [ 2 ]',
  'Area served': 'Worldwide (except blocked countries)',
  'Founders': 'Daniel Ek Martin Lorentzon',
  'Key people': 'Daniel Ek (Chairman & CEO) Martin Lorentzon (treasurer)',
  'Industry': 'Audio streaming Podcasting',
  'Revenue': '€15.67 billion (2024) [ 1 ]',
  'Operating income': '€1.365 billion (2024) [ 1 ]',
  'Net income': '€1.138 billion (2024) [ 1 ]',
  'Total assets': '€12.01 billion (2024) [ 1 ]',
  'Total equity': '€5.525 billion (2024) [ 1 ]',
  'Employees': '7,323 (September 2025) [ 3 ]',
  'Subsidiaries': 'Spotify AB [ 1 ] :\u200a38 Spotify USA Inc. [ 1 ] :\u200a38 Spotify Ltd (UK) [ 1 ] :\u200a38 Several other 

In [None]:
# Recheck Remaining Unresolved Records

todo_df = pd.DataFrame(collection.find({"wiki_resolver": {"$exists": False}}))
display(todo_df.head(2))


Unnamed: 0,_id,ticker,name,sector,market_value,weight,price,location,exchange,etf_holding_date,wiki_content,wiki_vcard
0,690c2f0530de78ca9390a990,BRKB,BERKSHIRE HATHAWAY INC CLASS B,Financials,639616300.0,1.45,487.66,United States,New York Stock Exchange Inc.,20251106,,{}
1,690c2f0530de78ca9390ab30,HEIA,HEICO CORP CLASS A,Industrials,13092110.0,0.03,246.12,United States,New York Stock Exchange Inc.,20251106,,{}


In [None]:

# =========================================
# yfFinance as Backup Plan
# =========================================
import yfinance as yf

for i, row in todo_df.copy().iterrows():
    yftic = yf.Ticker(row.ticker)
    if "longBusinessSummary" not in yftic.info:
        yftic = yf.Ticker(row.ticker.replace(".", "-"))  # handle BRK.B style

    try:
        vcard_cols = ["address1", "city", "state", "zip", "country", "phone",
                      "website", "industry", "industryKey", "industryDisp"]

        vcard_dict = {k: v for k, v in yftic.info.items() if k in vcard_cols}
        content = yftic.info.get("longBusinessSummary", "")

        print(row.ticker, content)

        collection.update_one(
            {"ticker": row.ticker, "etf_holding_date": row.etf_holding_date},
            {
                "$set": {
                    "wiki_resolver": "yfinance",
                    "wiki_content": content,
                    "wiki_vcard": vcard_dict
                }
            }
        )
    except Exception as e:
        print(f"Error with {row.ticker}: {e}")


BRKB 
HEIA 
ELS We are a self-administered, self-managed real estate investment trust (REIT) with headquarters in Chicago. As of September 30, 2025, we own or have an interest in 455 properties in 35 states and British Columbia consisting of 173,341 sites.
BFB 
BFA 


HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: LENB"}}}
HTTP Error 404: {"quoteSummary":{"result":null,"error":{"code":"Not Found","description":"Quote not found for symbol: LENB"}}}


LENB 
NIQ 


In [None]:
# =========================================
# Validation Summary
# =========================================
todo_df = pd.DataFrame(collection.find({"wiki_resolver": {"$exists": False}}))
display(todo_df.head(2))

In [None]:
# Count how many docs were resolved by each method
all_df = pd.DataFrame(collection.find())
all_df.wiki_resolver.value_counts()

# Example Output:
# wikipedia    868
# yfinance      82
# bing          43


wiki_resolver
wikipedia    866
yfinance     125
bing           5
Name: count, dtype: int64

In [None]:
# =========================================
# Detect Long Wikipedia Pages (for further summarization)
# =========================================
all_df.loc[all_df.wiki_content.str.split().str.len() > 14000]

Unnamed: 0,_id,ticker,name,sector,market_value,weight,price,location,exchange,etf_holding_date,wiki_content,wiki_resolver,wiki_url,wiki_vcard,company_name
7,690c2f0530de78ca9390a98f,TSLA,TESLA INC,Consumer Discretionary,888641700.0,2.01,444.26,United States,NASDAQ,20251106,"Tesla, Inc. ( TEZ-lə or TESS-lə), is an Amer...",wikipedia,"https://en.wikipedia.org/wiki/Tesla,_Inc.","{'Formerly': 'Tesla Motors, Inc. (2003–2017)',...",
16,690c2f0530de78ca9390a998,WMT,WALMART INC,Consumer Staples,316412000.0,0.71,102.27,United States,New York Stock Exchange Inc.,20251106,Walmart Inc. is an American multinational reta...,wikipedia,https://en.wikipedia.org/wiki/Walmart,{'Formerly': 'Wal-Mart Discount City (1962–196...,
60,690c2f0530de78ca9390a9c4,INTC,INTEL CORPORATION CORP,Information Technology,115116600.0,0.26,37.03,United States,NASDAQ,20251106,Intel Corporation is an American multinational...,wikipedia,https://en.wikipedia.org/wiki/Intel,"{'Trade name': 'Intel', 'Formerly': 'NM Electr...",


In [None]:
collection.count_documents({"wiki_resolver": {"$exists": False}})


0

In [None]:
# # shows which resolver fetched Walt Disney
collection.find_one({"ticker": "DIS"}, {"wiki_resolver": 1, "company_name": 1, "_id": 0})

{'wiki_resolver': 'yfinance'}

In [None]:
#Resolved for ticker HEIA & NIQ

for ticker in ["HEIA", "NIQ"]:
    doc = collection.find_one({"ticker": ticker}, {"ticker": 1, "wiki_resolver": 1, "_id": 0})
    print(doc)


{'ticker': 'HEIA', 'wiki_resolver': 'yfinance'}
{'ticker': 'NIQ', 'wiki_resolver': 'yfinance'}
