In [2]:
import requests
import yfinance as yf
import pandas as pd
import re
from lxml import etree
import requests

**Get the ticker symbol for a given company name**

In [4]:
def search_ticker_exact(dataframe, search_term):
    """Return the ticker of the first row whose name equals ``search_term``.

    The comparison ignores case and is made against the ``name``,
    ``short_name`` and ``tiny_name`` columns.

    Args:
        dataframe: table with ``name``, ``short_name``, ``tiny_name`` and
            ``ticker`` columns.
        search_term: company name to look up.

    Returns:
        The ``ticker`` value of the first matching row, or ``""`` when no
        row matches.
    """
    wanted = search_term.lower()

    # Start from the full name and widen the match with the shorter aliases.
    hits = dataframe['name'].str.lower() == wanted
    for alias_column in ('short_name', 'tiny_name'):
        hits = hits | (dataframe[alias_column].str.lower() == wanted)

    matches = dataframe[hits]
    if matches.empty:
        return ""
    return matches["ticker"].values[0]


# CSV file holding the company-name-to-ticker table.
ticker_path = "us_symbols.csv"
df = pd.read_csv(ticker_path)  # lookup table of the ticker corresponding to each company name

---

## Scraping the Risk Factors Section from the 10-K Annual Report

**Get the 10-K document URL**

In [60]:
def get_10k_url(ticker, ticker_path):
    """Return the URL of the most recent 10-K filing listed for ``ticker``.

    Args:
        ticker: ticker symbol to look up, e.g. ``"AAPL"``.
        ticker_path: not used by the lookup; accepted so existing calls that
            pass it keep working.

    Returns:
        The value stored under ``['exhibits']['10-K']`` of the 10-K record
        with the greatest ``date``.

    Raises:
        ValueError: no 10-K record is available for ``ticker``.
    """
    company_data = yf.Ticker(ticker)

    # Tolerate a missing filings value so the "nothing found" case is reported
    # by the explicit check below rather than by a failure while iterating.
    filings = company_data.sec_filings or []
    annual_reports = [record for record in filings if record['type'] == "10-K"]
    if not annual_reports:
        raise ValueError(f"No 10-K filing found for ticker {ticker!r}.")

    latest = max(annual_reports, key=lambda record: record['date'])
    return latest['exhibits']['10-K']

# Example: URL of the most recent 10-K filing listed for the ticker "AAPL".
get_10k_url("AAPL", ticker_path)

'https://cdn.yahoofinance.com/prod/sec-filings/0000320193/000032019323000106/aapl-20230930.htm'

**Get Risk Factors Section**

In [133]:
def get_risk_factors_url(url, timeout=30):
    """Download a 10-K filing and return the text of its "Risk Factors" section.

    The first links whose text is exactly "Risk Factors" and "Unresolved Staff
    Comments" supply the ids of the two anchors that bracket the section. The
    text of every ``<span>`` between those anchors is joined and cleaned.

    Args:
        url: address of the 10-K HTML document.
        timeout: seconds to wait for the server before giving up.

    Returns:
        The cleaned section text, one span per line.

    Raises:
        requests.HTTPError: the server replied with an error status.
        IndexError: one of the two links, or the anchor it points to, is not
            present in the document.
    """
    # returns the anchor id targeted by the first in-page link with the given title
    def get_sublink_by_title(title, tree):
        for span in tree.xpath('//span[a]'):
            for link in span.xpath('.//a'):
                if link.xpath('string()').strip() != title:
                    continue
                # Accept "#anchor" as well as "page#anchor"; a link without an
                # href or without a fragment cannot identify a section.
                _, hash_mark, anchor_id = (link.get('href') or "").rpartition('#')
                if hash_mark and anchor_id:
                    return anchor_id
        # Fail here with a readable message instead of handing a sentence back
        # to the caller, which would then search the document for a bogus id.
        raise IndexError(f"No match found for {title}.")

    # collects the spans located after the start anchor and before the end anchor
    def get_para_btw_divs(tree, start_div_id, end_div_id):
        for anchor_id in (start_div_id, end_div_id):
            if not tree.xpath(f'//*[@id="{anchor_id}"]'):
                raise IndexError(f"No element with id {anchor_id!r} found in the document.")
        # NOTE: the end anchor is only recognised when it is a <div>.
        return tree.xpath(
            f'//*[@id="{start_div_id}"]/following::span[following::div[@id="{end_div_id}"]]'
        )

    # clean spans and output text (string)
    def spans_to_text(spans):
        to_remove = ('\xa0\xa0\xa0\xa0', None, '\r', 'Table of Contents')
        kept = [span.text for span in spans if span.text not in to_remove]
        text = "\n".join(kept)
        # Applied in this exact order; later replacements rely on earlier ones.
        replacements = (
            ("Class\xa0B", " "),
            ("Class\xa0A", " "),
            ("•\n", " "),
            ("\xa0", " "),
            ("  ", " "),
        )
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    response = requests.get(url, timeout=timeout)
    # Surface HTTP failures directly rather than parsing an error page.
    response.raise_for_status()
    tree = etree.fromstring(response.content, etree.HTMLParser())

    start_div_id = get_sublink_by_title("Risk Factors", tree)
    end_div_id = get_sublink_by_title("Unresolved Staff Comments", tree)

    spans = get_para_btw_divs(tree, start_div_id, end_div_id)

    return spans_to_text(spans)

def get_risk_factors(company_name):
    """Return the "Risk Factors" text from the latest 10-K of ``company_name``.

    Args:
        company_name: name looked up (exact match, ignoring case) in the
            module-level ``df`` ticker table.

    Returns:
        The section text produced by ``get_risk_factors_url``.

    Raises:
        ValueError: ``company_name`` has no entry in the ticker table.
    """
    ticker = search_ticker_exact(df, company_name)
    # search_ticker_exact signals "not found" with an empty string; stop here
    # rather than querying the filings service with a blank ticker.
    if not ticker:
        raise ValueError(f"No ticker found for company name {company_name!r}.")
    url = get_10k_url(ticker, ticker_path)
    return get_risk_factors_url(url)

# Example: Risk Factors text from the latest 10-K of the company named "nvidia".
get_risk_factors("nvidia")

"ITEM 1A. RISK FACTORS\nIn evaluating NVIDIA, the following risk factors should be considered in addition to the other information in this Annual Report on Form 10-K. Purchasing or owning NVIDIA common stock involves investment risks including, but not limited to, the risks described below. Any one of the following risks could harm our business, financial condition, results of operations or reputation, which could cause our stock price to decline, and you may lose all or a part of your investment. Additional risks, trends and uncertainties not presently known to us or that we currently believe are immaterial may also harm our business, financial condition, results of operations or reputation.\nRisk Factors Summary\nRisks Related to Our Industry and Markets\n Failure to meet the evolving needs of our industry and markets may adversely impact our financial results.\n15\n Competition in our current and target markets could cause us to lose market share and revenue.\nRisks Related to Deman

**[Helper Functions Demonstration]**

In [None]:
# Sample 10-K filings (Apple, NVIDIA, Amazon) used to demonstrate the helpers below.
apple_url = "https://cdn.yahoofinance.com/prod/sec-filings/0000320193/000032019323000106/aapl-20230930.htm"
nvidia_url = "https://cdn.yahoofinance.com/prod/sec-filings/0001045810/000104581024000029/nvda-20240128.htm"
amazon_url = "https://cdn.yahoofinance.com/prod/sec-filings/0001018724/000101872424000008/amzn-20231231.htm"

In [136]:
def get_sublink_by_title(title, url):
    """Fetch ``url`` and return the href of the first link whose text is ``title``.

    Only links nested inside a ``<span>`` that has a direct ``<a>`` child are
    considered; link text is compared after stripping surrounding whitespace.

    Args:
        title: exact link text to look for.
        url: address of the HTML document to download and search.

    Returns:
        The ``href`` attribute of the first matching link, or the sentence
        ``"No match found for <title>."`` when no link matches.
    """
    page = requests.get(url)
    root = etree.fromstring(page.content, etree.HTMLParser())

    matching_hrefs = (
        anchor.get('href')
        for holder in root.xpath('//span[a]')
        for anchor in holder.xpath('.//a')
        if anchor.xpath('string()').strip() == title
    )
    # First hit wins; the sentence below is the "not found" result.
    return next(matching_hrefs, f"No match found for {title}.")

# Prints the href of the first link titled "Risk Factors" (an in-page "#..." anchor for this filing).
print(get_sublink_by_title("Risk Factors", amazon_url))

#i9b49001f922340eeba23291553f14c70_16


In [138]:
# Demonstrate the section lookup on the sample Amazon filing.
url = amazon_url
response = requests.get(url)

# Parse the HTML content
tree = etree.fromstring(response.content, etree.HTMLParser())

# get_sublink_by_title returns the link's href (e.g. "#i9b4..._16"); [1:] drops
# the leading "#" to leave the element id. When the title is not found the
# helper returns a "No match found ..." sentence instead, so [1:] would yield a
# meaningless id. Each call also downloads and parses `url` again.
start_div_id = get_sublink_by_title("Risk Factors", url)[1:]
end_div_id = get_sublink_by_title("Unresolved Staff Comments", url)[1:]

print(f"Start div id: {start_div_id}, End div id: {end_div_id}")

# Helper that prints the spans found between the start and end anchors

def get_para_btw_divs(tree, start_div_id, end_div_id):
    """Print the text of every span located between two anchor elements.

    Args:
        tree: parsed document exposing ``xpath``.
        start_div_id: id of the element that opens the section.
        end_div_id: id of the ``<div>`` that closes the section.

    Raises:
        IndexError: no element carries ``start_div_id`` or ``end_div_id``.
    """
    # Indexing the lookup result is the existence check: it raises IndexError
    # when an anchor is absent from the document.
    for anchor_id in (start_div_id, end_div_id):
        tree.xpath(f'//*[@id="{anchor_id}"]')[0]

    section_query = (
        f'//*[@id="{start_div_id}"]'
        f'/following::span[following::div[@id="{end_div_id}"]]'
    )
    for node in tree.xpath(section_query):
        print(node.text)

# Print the spans between the two anchors located in the Amazon filing parsed in this cell.
get_para_btw_divs(tree, start_div_id, end_div_id)

Start div id: i9b49001f922340eeba23291553f14c70_16, End div id: i9b49001f922340eeba23291553f14c70_19
Item 1A.
Risk Factors
Please carefully consider the following discussion of significant factors, events, and uncertainties that make an investment in our securities risky. The events and consequences discussed in these risk factors could, in circumstances we may or may not be able to accurately predict, recognize, or control, have a material adverse effect on our business, growth, reputation, prospects, financial condition, operating results (including components of our financial results), cash flows, liquidity, and stock price. These risk factors do not identify all risks that we face; our operations could also be affected by factors, events, or uncertainties that are not presently known to us or that we currently do not consider to present significant risks to our operations. In addition to the factors discussed in Item 7 of Part II, “Management’s Discussion and Analysis of Financial 