In [38]:
import re

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

Note: The URL can easily be modified to loop over multiple years; the data scraped for each year can then be read into a pandas DataFrame.

In [39]:
# Launch Chrome through a locally installed chromedriver and open the search page.
# The chromedriver location is machine-specific: edit this path before running.
# NOTE(review): `executable_path` is deprecated in Selenium 4 - confirm the installed version.
chromedriver_path = r'C:\Users\johnw\Desktop\Python\Chromedriver\chromedriver_win32\chromedriver'
url = 'https://fortune.com/fortune500/2019/search/'
driver = webdriver.Chrome(executable_path=chromedriver_path)
# the page content is read later from driver.page_source, not from `source`
source = driver.get(url)

In [40]:
# Give the page time to load: with an implicit wait set, element lookups
# keep polling for up to this many seconds before giving up.
implicit_wait_seconds = 10
driver.implicitly_wait(implicit_wait_seconds)

In [41]:
# If the sign-up ad pops up, dismiss it by clicking "No, Thanks".
# This is best effort: the ad is not always shown, so a failed lookup or click
# is ignored. Only Selenium errors are swallowed - a bare `except:` would also
# hide KeyboardInterrupt and genuine programming errors such as NameError.
try:
    driver.find_element(By.XPATH, '//*[@id="bx-element-1186146-JWp4eVy"]/button').click()
except WebDriverException:
    pass

In [42]:
# Expand the table to 100 companies per page.
# The page-size options live in the <select> at index 4 (i.e. the fifth
# <select> element found on the page).
# Walk its <option> elements and click the one labelled "100 Rows".
rows = driver.find_elements(By.TAG_NAME, 'select')
rows_100 = rows[4].find_elements(By.TAG_NAME, 'option')
for row in rows_100:
    if row.text == '100 Rows':
        print('Selecting page length: ' + row.text)
        row.click()
        # The wanted option has been chosen; stop instead of reading the
        # remaining options, which may have gone stale after the click.
        break

Selecting page length: 100 Rows


In [43]:
# fields to populate with the web scrape

# keys for the all_data dictionary, one per scraped data cell of a table row.
# The order matters: format_scrape() hands out every 9th element of `data`
# to these keys by position, so the list must follow the cell order in a row.
# NOTE: the name is a misspelling of "categories", but format_scrape() refers
# to it by this spelling, so it is left unchanged here.
catagories = [
    'rev_MM',
    'rev_pct_change',
    'profit_MM',
    'profit_pct_change',
    'assets_MM',
    'mkt_value_MM',
    'rank_change_1000',
    'employees',
    'rank_change_500',
]

# accumulators filled by scrape(); they are module-level, so every call to
# scrape() keeps appending to the same lists
data = []  # text of each data cell, flattened row after row
rank = []  # rank string of each table row
name = []  # company name of each table row

# define the function that will be used to scrape the website
def scrape():
    """Scrape the table currently shown in the browser into the module lists.

    Parses ``driver.page_source`` and, for every ``rt-tr-group`` row group,
    appends the company rank to ``rank``, the company name to ``name`` and the
    text of each data cell to ``data``. Row groups that contain no digits at
    all (no rank) are skipped entirely so the three lists stay aligned.

    Returns:
        None. The results are accumulated in the module-level lists.
    """
    # load in the html for the page that is currently displayed
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')

    # Pattern the data cells are checked against.
    # NOTE: this is a single character class followed by `*`, so it also
    # matches the empty string and therefore accepts every cell (including
    # '-'). It is deliberately left as is: format_scrape() relies on every
    # cell of a row being kept.
    pattern = re.compile(r'[\$-*\d{,3},*\d{1,3}.\d]*')

    # each table row is rendered as a <div class="rt-tr-group">
    for group in soup.find_all('div', class_='rt-tr-group'):
        text = group.text

        # Prefer the first text node as the rank. `text` is the concatenation
        # of all text nodes, so a digit run taken from it would merge the rank
        # with a name that starts with a digit ("95" + "3M" -> "953").
        first_string = next(iter(group.stripped_strings), '')
        if first_string.isdigit():
            company_rank = first_string
        else:
            # fall back to the first run of digits in the row text
            digits = re.search(r'\d+', text)
            if digits is None:
                # no rank in this group (e.g. an empty row): nothing to record
                continue
            company_rank = digits.group(0)

        rank.append(company_rank)
        # The name sits between the rank and the first '$'. Split on the first
        # occurrence of the rank only, so a name that itself contains those
        # digits is not cut short.
        name.append(text.split(company_rank, 1)[1].split('$')[0])

        # collect the text of every data cell of this row
        cells = group.find_all('div', class_='rt-td searchResults__cell--2Y7Ce')
        for cell in cells:
            if pattern.match(cell.text):
                data.append(cell.text)
        


# loops through the result pages to get the fortune 500 data
# find button --> scrape data --> click button
def run_scrape(pages=5):
    """Scrape ``pages`` consecutive result pages, then shut the browser down.

    For each page the NEXT button is located, the current page is scraped with
    ``scrape()`` and the button is clicked to advance.

    Args:
        pages: number of pages to scrape. The default of 5 matches the
            original hard-coded loop count.

    The browser session is ended with ``driver.quit()`` in a ``finally``
    block, so it is released even when scraping raises part-way through.
    """
    try:
        for _ in range(pages):
            for button in driver.find_elements(By.CLASS_NAME, '-next'):
                if button.text == 'NEXT':
                    scrape()
                    button.click()
                    # One scrape + click per page: do not keep iterating over
                    # elements that were looked up before the click.
                    break
    finally:
        # quit() ends the whole session; close() would only close the window
        driver.quit()
            

    
# populate dictionary with scraped data
all_data = {}
def format_scrape():
    """Arrange the scraped lists into the ``all_data`` dictionary.

    ``rank`` and ``name`` are stored under their own keys. ``data`` is a flat
    list holding one value per category for each row, so the values of the
    i-th category are every ``len(catagories)``-th element starting at i.

    Returns:
        The module-level ``all_data`` dictionary, updated in place.
    """
    all_data['rank'] = rank
    all_data['name'] = name
    # derive the stride from the category list instead of repeating its length
    stride = len(catagories)
    for offset, category in enumerate(catagories):
        all_data[category] = data[offset::stride]
    return all_data
        
            
# Run the scrape, arrange the results, then show the collected keys
# and the first ten (rank, name) pairs as a quick sanity check.
run_scrape()
format_scrape()
print(all_data.keys(), '\n')
first_ten = zip(all_data['rank'][:10], all_data['name'][:10])
print(list(first_ten))

dict_keys(['rank', 'name', 'rev_MM', 'rev_pct_change', 'profit_MM', 'profit_pct_change', 'assets_MM', 'mkt_value_MM', 'rank_change_1000', 'employees', 'rank_change_500']) 

[('1', 'Walmart'), ('2', 'Exxon Mobil'), ('3', 'Apple'), ('4', 'Berkshire Hathaway'), ('5', 'Amazon.com'), ('6', 'UnitedHealth Group'), ('7', 'McKesson'), ('8', 'CVS Health'), ('9', 'AT&T'), ('10', 'AmerisourceBergen')]


In [44]:
# Load the scraped columns into a pandas DataFrame, indexed by rank, for display
scraped_frame = pd.DataFrame(all_data)
df = scraped_frame.set_index('rank')
df

Unnamed: 0_level_0,name,rev_MM,rev_pct_change,profit_MM,profit_pct_change,assets_MM,mkt_value_MM,rank_change_1000,employees,rank_change_500
rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Walmart,"$514,405.0",2.8%,"$6,670.0",-32.4%,"$219,295.0","$279,880.3",-,2200000,-
2,Exxon Mobil,"$290,212.0",18.8%,"$20,840.0",5.7%,"$346,196.0","$342,172.0",-,71000,-
3,Apple,"$265,595.0",15.9%,"$59,531.0",23.1%,"$365,725.0","$895,667.4",1,132000,1
4,Berkshire Hathaway,"$247,837.0",2.4%,"$4,021.0",-91.1%,"$707,794.0","$493,870.3",-1,389000,-1
5,Amazon.com,"$232,887.0",30.9%,"$10,073.0",232.1%,"$162,648.0","$874,709.5",3,647500,3
...,...,...,...,...,...,...,...,...,...,...
496,Simon Property Group,"$5,657.9",2.2%,"$2,440.1",25.3%,"$30,686.2","$56,301.7",-3,4150,-3
497,Navient,"$5,610.0",8.3%,$395.0,35.3%,"$104,176.0","$2,828.9",18,6500,-
498,Western Union,"$5,589.9",1.2%,$851.9,-,"$8,996.8","$8,050.9",-4,12000,-4
499,Peabody Energy,"$5,581.8",0.1%,$646.9,-,"$7,423.7","$3,065.6",-8,7400,-8
