# Overview
1. Gather a list of each company in the S&P 500
   1. From Wikipedia
      1. Company Name
      2. Ticker
      3. Headquarters Location
      4. GCIS Sub-Industry
      5. Year Founded
   2. From 

# Beautiful Soup Approach

In [101]:
import requests
from bs4 import BeautifulSoup
import pandas as pd

# Get the page's HTML content
URL = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
response = requests.get(URL)

# Convert the table to a Pandas DataFrame
df = pd.read_html(response.content, flavor='lxml')[0]

# Display the first few rows of the table
display(df)

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...
498,XYL,Xylem Inc.,Industrials,Industrial Machinery & Supplies & Components,"White Plains, New York",2011-11-01,1524472,2011
499,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
500,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927


### Find random company

In [102]:
import random
N = len(df)
for i in range(100):
    pick_random_company = random.randint(0,N-1)
    if pick_random_company > 500:
        print(pick_random_company)
        
print(pick_random_company)
display(df.iloc[pick_random_company])

431


Symbol                                     SYF
Security                   Synchrony Financial
GICS Sector                         Financials
GICS Sub-Industry             Consumer Finance
Headquarters Location    Stamford, Connecticut
Date added                          2015-11-18
CIK                                    1601712
Founded                                   2003
Name: 431, dtype: object

### Get wiki link for each company & append it to the dataframe in a new column

In [132]:
response = requests.get(URL)
soup = BeautifulSoup(response.text, 'html.parser')

# Find the table
table = soup.find('table', {'class': 'wikitable'})


# Find all of the links in the second column of the table
links = []
for row in table.find_all('tr')[1:]: # skip the header row
    cells = row.find_all('td')
    if cells:
        link_tag = cells[1].find('a')  # Adjust the index based on the column containing the link
        if link_tag and 'href' in link_tag.attrs:
            links.append(link_tag['href'])

# Create column in existing dataframe with link as element, but with full url
updated_df = df.assign(link=links)
updated_df['wiki_link'] = 'https://en.wikipedia.org' + updated_df['link']
display(updated_df)

# save the dataframe to a CSV file
# updated_df.to_csv('S&P_500_companies.csv', index=False)



Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded,link,wiki_link
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902,/wiki/3M,https://en.wikipedia.org/wiki/3M
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916,/wiki/A._O._Smith,https://en.wikipedia.org/wiki/A._O._Smith
2,ABT,Abbott Laboratories,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888,/wiki/Abbott_Laboratories,https://en.wikipedia.org/wiki/Abbott_Laboratories
3,ABBV,AbbVie,Health Care,Biotechnology,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888),/wiki/AbbVie,https://en.wikipedia.org/wiki/AbbVie
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989,/wiki/Accenture,https://en.wikipedia.org/wiki/Accenture
...,...,...,...,...,...,...,...,...,...,...
498,XYL,Xylem Inc.,Industrials,Industrial Machinery & Supplies & Components,"White Plains, New York",2011-11-01,1524472,2011,/wiki/Xylem_Inc.,https://en.wikipedia.org/wiki/Xylem_Inc.
499,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997,/wiki/Yum!_Brands,https://en.wikipedia.org/wiki/Yum!_Brands
500,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969,/wiki/Zebra_Technologies,https://en.wikipedia.org/wiki/Zebra_Technologies
501,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927,/wiki/Zimmer_Biomet,https://en.wikipedia.org/wiki/Zimmer_Biomet


In [146]:
from bs4 import BeautifulSoup
import requests

# Function to extract revenue from a table
def extract_revenue_from_infobox(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the table with class 'infobox ib-company vcard'
    infobox = soup.find('table', {'class': 'infobox ib-company vcard'})
    
    if infobox:
        print("why not")
        # Loop through each row in the table
        for row in infobox.find_all('tr'):
            # Find the <th> element in the row
            th_element = row.find('th', {'class': 'infobox-label'})
            print(th_element)
            if th_element:
                # Check if the <th> contains the word "revenue"
                if 'revenue' in th_element.get_text(strip=True).lower():
                    # The revenue value is often in the next <td> or <span> element
                    td_element = row.find('td')
                    if td_element:
                        revenue_value = td_element.get_text(strip=True)
                        return revenue_value

    return None

# Example usage:
url = 'https://en.wikipedia.org/wiki/AbbVie	'
response = requests.get(url)
html_content = response.text

revenue = extract_revenue_from_infobox(html_content)
if revenue:
    print(f"Revenue: {revenue}")
else:
    print("Revenue not found.")

Revenue not found.


In [143]:
from bs4 import BeautifulSoup
import requests

# Function to extract revenue and trend (increase/decrease) from a table
def extract_revenue_and_trend_from_infobox(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    
    # Find the table with class 'infobox ib-company vcard'
    infobox = soup.find('table', {'class': 'infobox ib-company vcard'})
    
    if infobox:
        # Loop through each row in the table
        for row in infobox.find_all('tr'):
            # Find the <th> element in the row
            th_element = row.find('th', {'class': 'infobox-label'})
            
            if th_element:
                # Check if the <th> contains the word "revenue"
                if 'revenue' in th_element.get_text(strip=True).lower():
                    # Find the <td> containing the revenue value
                    td_element = row.find('td')
                    if td_element:
                        # Initialize variables to hold the revenue and trend
                        revenue_value = None
                        trend = None

                        # Extract the revenue value directly from the <td>
                        # We assume it's text inside <span> or directly in the <td>
                        revenue_text = td_element.get_text(strip=True)

                        # Check for the trend within nested <span> tags (e.g., <span title="Increase">)
                        for span in td_element.find_all('span'):
                            if span.has_attr('title') and span['title'].lower() in ['increase', 'decrease']:
                                trend = span['title']

                        # Return both revenue and trend (if found)
                        return {
                            'revenue_value': revenue_text,
                            'trend': trend
                        }

    return None

# Example usage:
url = 'https://en.wikipedia.org/wiki/AbbVie	'
response = requests.get(url)
html_content = response.text

result = extract_revenue_and_trend_from_infobox(html_content)
if result:
    print(f"Revenue: {result['revenue_value']}, Trend: {result['trend']}")
else:
    print("Revenue or trend information not found.")

Revenue or trend information not found.


In [None]:
<table class="infobox ib-company vcard"><caption class="infobox-title fn org">AbbVie Inc.</caption><tbody><tr><td colspan="2" class="infobox-image ib-company-logo logo"><span class="mw-default-size skin-invert" typeof="mw:File/Frameless"><a href="/wiki/File:AbbVie_logo.svg" class="mw-file-description"><img src="//upload.wikimedia.org/wikipedia/commons/thumb/c/cc/AbbVie_logo.svg/200px-AbbVie_logo.svg.png" decoding="async" width="200" height="38" class="mw-file-element hoverZoomLink" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/c/cc/AbbVie_logo.svg/300px-AbbVie_logo.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/c/cc/AbbVie_logo.svg/400px-AbbVie_logo.svg.png 2x" data-file-width="559" data-file-height="105"></a></span></td></tr><tr><th scope="row" class="infobox-label">Company type</th><td class="infobox-data category"><a href="/wiki/Public_company" title="Public company">Public</a></td></tr><tr><th scope="row" class="infobox-label"><div style="display: inline-block; line-height: 1.2em; padding: .1em 0;"><a href="/wiki/Ticker_symbol" title="Ticker symbol">Traded as</a></div></th><td class="infobox-data"><style data-mw-deduplicate="TemplateStyles:r1126788409">.mw-parser-output .plainlist ol,.mw-parser-output .plainlist ul{line-height:inherit;list-style:none;margin:0;padding:0}.mw-parser-output .plainlist ol li,.mw-parser-output .plainlist ul li{margin-bottom:0}</style><div class="plainlist"><ul><li><a href="/wiki/New_York_Stock_Exchange" title="New York Stock Exchange">NYSE</a>:&nbsp;<a rel="nofollow" class="external text" href="https://www.nyse.com/quote/XNYS:ABBV">ABBV</a></li><li><a href="/wiki/S%26P_100" title="S&amp;P 100">S&amp;P 100</a> component</li><li><a href="/wiki/S%26P_500" title="S&amp;P 500">S&amp;P 500</a> component</li></ul></div></td></tr><tr><th scope="row" class="infobox-label">Industry</th><td class="infobox-data category"><a href="/wiki/Biopharmaceutical" title="Biopharmaceutical">Biopharmaceutical</a></td></tr><tr><th scope="row" class="infobox-label">Founded</th><td class="infobox-data">April&nbsp;10, 2012<span class="noprint">; 12 years ago</span><span style="display:none">&nbsp;(<span class="bday dtstart published updated">2012-04-10</span>)</span></td></tr><tr><th scope="row" class="infobox-label">Headquarters</th><td class="infobox-data label">1 North Waukegan Road, <a href="/wiki/North_Chicago,_Illinois" title="North Chicago, Illinois">North Chicago, Illinois</a>, United States</td></tr><tr><th scope="row" class="infobox-label"><div style="display: inline-block; line-height: 1.2em; padding: .1em 0;">Area served</div></th><td class="infobox-data">170+ countries worldwide</td></tr><tr><th scope="row" class="infobox-label"><div style="display: inline-block; line-height: 1.2em; padding: .1em 0;">Key people</div></th><td class="infobox-data agent"><link rel="mw-deduplicated-inline-style" href="mw-data:TemplateStyles:r1126788409"><div class="plainlist"><ul><li>Richard A. Gonzalez (<a href="/wiki/Chairman" class="mw-redirect" title="Chairman">chairman</a> and <a href="/wiki/Chief_executive_officer" title="Chief executive officer">CEO</a>)</li><li>Robert Michael (<a href="/wiki/President_(corporate_title)" title="President (corporate title)">president</a> and <a href="/wiki/Chief_operating_officer" title="Chief operating officer">COO</a>)</li></ul></div></td></tr><tr><th scope="row" class="infobox-label">Products</th><td class="infobox-data"><style data-mw-deduplicate="TemplateStyles:r1129693374">.mw-parser-output .hlist dl,.mw-parser-output .hlist ol,.mw-parser-output .hlist ul{margin:0;padding:0}.mw-parser-output .hlist dd,.mw-parser-output .hlist dt,.mw-parser-output .hlist li{margin:0;display:inline}.mw-parser-output .hlist.inline,.mw-parser-output .hlist.inline dl,.mw-parser-output .hlist.inline ol,.mw-parser-output .hlist.inline ul,.mw-parser-output .hlist dl dl,.mw-parser-output .hlist dl ol,.mw-parser-output .hlist dl ul,.mw-parser-output .hlist ol dl,.mw-parser-output .hlist ol ol,.mw-parser-output .hlist ol ul,.mw-parser-output .hlist ul dl,.mw-parser-output .hlist ul ol,.mw-parser-output .hlist ul ul{display:inline}.mw-parser-output .hlist .mw-empty-li{display:none}.mw-parser-output .hlist dt::after{content:": "}.mw-parser-output .hlist dd::after,.mw-parser-output .hlist li::after{content:" · ";font-weight:bold}.mw-parser-output .hlist dd:last-child::after,.mw-parser-output .hlist dt:last-child::after,.mw-parser-output .hlist li:last-child::after{content:none}.mw-parser-output .hlist dd dd:first-child::before,.mw-parser-output .hlist dd dt:first-child::before,.mw-parser-output .hlist dd li:first-child::before,.mw-parser-output .hlist dt dd:first-child::before,.mw-parser-output .hlist dt dt:first-child::before,.mw-parser-output .hlist dt li:first-child::before,.mw-parser-output .hlist li dd:first-child::before,.mw-parser-output .hlist li dt:first-child::before,.mw-parser-output .hlist li li:first-child::before{content:" (";font-weight:normal}.mw-parser-output .hlist dd dd:last-child::after,.mw-parser-output .hlist dd dt:last-child::after,.mw-parser-output .hlist dd li:last-child::after,.mw-parser-output .hlist dt dd:last-child::after,.mw-parser-output .hlist dt dt:last-child::after,.mw-parser-output .hlist dt li:last-child::after,.mw-parser-output .hlist li dd:last-child::after,.mw-parser-output .hlist li dt:last-child::after,.mw-parser-output .hlist li li:last-child::after{content:")";font-weight:normal}.mw-parser-output .hlist ol{counter-reset:listitem}.mw-parser-output .hlist ol>li{counter-increment:listitem}.mw-parser-output .hlist ol>li::before{content:" "counter(listitem)"\a0 "}.mw-parser-output .hlist dd ol>li:first-child::before,.mw-parser-output .hlist dt ol>li:first-child::before,.mw-parser-output .hlist li ol>li:first-child::before{content:" ("counter(listitem)"\a0 "}</style><div class="hlist">
<ul><li><a href="/wiki/Adalimumab" title="Adalimumab">Humira</a></li>
<li><a href="/wiki/Ibrutinib" title="Ibrutinib">Imbruvica</a></li>
<li><a href="/wiki/Lopinavir" title="Lopinavir">Kaletra</a></li>
<li><a href="/wiki/Glecaprevir/pibrentasvir" title="Glecaprevir/pibrentasvir">Mavyret/Maviret</a></li>
<li><a href="/wiki/Ritonavir" title="Ritonavir">Norvir</a></li>
<li><a href="/wiki/Risankizumab" title="Risankizumab">Skyrizi</a></li>
<li><a href="/wiki/Venetoclax" title="Venetoclax">Venclexta</a></li>
<li><a href="/wiki/Daclizumab" title="Daclizumab">Zinbryta</a></li></ul>
</div></td></tr><tr><th scope="row" class="infobox-label">Revenue</th><td class="infobox-data"><span typeof="mw:File"><span title="Decrease"><img alt="Decrease" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/11px-Decrease2.svg.png" decoding="async" width="11" height="11" class="mw-file-element hoverZoomLink" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/17px-Decrease2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/22px-Decrease2.svg.png 2x" data-file-width="300" data-file-height="300"></span></span> <span style="white-space: nowrap"><a href="/wiki/United_States_dollar" title="United States dollar">US$</a>54.32 billion</span> (2023)</td></tr><tr><th scope="row" class="infobox-label"><div style="display: inline-block; line-height: 1.2em; padding: .1em 0;"><a href="/wiki/Earnings_before_interest_and_taxes" title="Earnings before interest and taxes">Operating income</a></div></th><td class="infobox-data"><span typeof="mw:File"><span title="Decrease"><img alt="Decrease" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/11px-Decrease2.svg.png" decoding="async" width="11" height="11" class="mw-file-element hoverZoomLink" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/17px-Decrease2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/22px-Decrease2.svg.png 2x" data-file-width="300" data-file-height="300"></span></span> US$12.76 billion (2023)</td></tr><tr><th scope="row" class="infobox-label"><div style="display: inline-block; line-height: 1.2em; padding: .1em 0;"><a href="/wiki/Net_income" title="Net income">Net income</a></div></th><td class="infobox-data"><span typeof="mw:File"><span title="Decrease"><img alt="Decrease" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/11px-Decrease2.svg.png" decoding="async" width="11" height="11" class="mw-file-element hoverZoomLink" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/17px-Decrease2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/22px-Decrease2.svg.png 2x" data-file-width="300" data-file-height="300"></span></span> US$4.873 billion (2023)</td></tr><tr><th scope="row" class="infobox-label"><span class="nowrap"><a href="/wiki/Asset" title="Asset">Total assets</a></span></th><td class="infobox-data"><span class="nowrap"><span typeof="mw:File"><span title="Decrease"><img alt="Decrease" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/11px-Decrease2.svg.png" decoding="async" width="11" height="11" class="mw-file-element hoverZoomLink" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/17px-Decrease2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/22px-Decrease2.svg.png 2x" data-file-width="300" data-file-height="300"></span></span> US$134.7 billion (2023)</span></td></tr><tr><th scope="row" class="infobox-label"><span class="nowrap"><a href="/wiki/Equity_(finance)" title="Equity (finance)">Total equity</a></span></th><td class="infobox-data"><span typeof="mw:File"><span title="Decrease"><img alt="Decrease" src="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/11px-Decrease2.svg.png" decoding="async" width="11" height="11" class="mw-file-element hoverZoomLink" srcset="//upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/17px-Decrease2.svg.png 1.5x, //upload.wikimedia.org/wikipedia/commons/thumb/e/ed/Decrease2.svg/22px-Decrease2.svg.png 2x" data-file-width="300" data-file-height="300"></span></span> US$10.36 billion (2023)</td></tr><tr><th scope="row" class="infobox-label"><div style="display: inline-block; line-height: 1.2em; padding: .1em 0;">Number of employees</div></th><td class="infobox-data"><abbr title="circa">c.</abbr><span style="white-space:nowrap;"> 50,000</span> (2024)</td></tr><tr><th scope="row" class="infobox-label">Website</th><td class="infobox-data"><span class="url"><a rel="nofollow" class="external text" href="http://abbvie.com">abbvie<wbr>.com</a></span></td></tr><tr><td colspan="2" class="infobox-below"><b>Footnotes&nbsp;/ references</b><br><sup id="cite_ref-10K_1-0" class="reference"><a href="#cite_note-10K-1"><span class="cite-bracket">[</span>1<span class="cite-bracket">]</span></a></sup><sup class="reference nowrap"><span title="Pages: facepage, 51–55">: facepage, 51–55 </span></sup></td></tr></tbody></table>

In [None]:
url = 'https://en.wikipedia.org/wiki/AbbVie	'