In [24]:
# This notebook is a regression test for the get_statement web scraping function. If running it raises an error, the HTML
# structure of the Yahoo Finance pages may have changed and the function will have to be adjusted.

In [25]:
import pandas as pd 
import requests
import re
from datetime import date

import bs4
from bs4 import BeautifulSoup

import plotly
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Report the version of every third-party library this notebook relies on.
for library_label, library_module in (
    ('Pandas', pd),
    ('Requests', requests),
    ('Beautiful Soup', bs4),
    ('Plotly', plotly),
):
    print(f'{library_label} {library_module.__version__}')

Pandas 0.25.1
Requests 2.22.0
Beautiful Soup 4.8.2
Plotly 4.5.0


In [26]:
def get_statement(stock_ticker, statement_type, table_start=12, timeout=30):
    """Scrape one financial statement for a ticker from Yahoo Finance.

    Parameters
    ----------
    stock_ticker : str
        Ticker symbol as used in Yahoo Finance URLs, e.g. 'NCLH'.
    statement_type : str
        One of 'income statement', 'balance sheet' or 'cash flow'.
    table_start : int, optional
        Position of the first table cell within the list of non-empty
        ``<div>`` strings on the page. Adjust this if the page layout changes.
    timeout : float, optional
        Seconds to wait for Yahoo Finance before giving up.

    Returns
    -------
    pandas.DataFrame
        One row per reporting period (index = calendar year, oldest first;
        the 'ttm' column of the page is filed under the current year), one
        numeric column per line item, plus a 'Co' column holding the company
        name. Scraped figures are multiplied by 1000.

    Raises
    ------
    KeyError
        If ``statement_type`` is not one of the supported names.
    requests.HTTPError
        If Yahoo Finance answers with an error status.
    IndexError
        If no date headers are found, i.e. the table could not be located.
    """
    statement_paths = {
        'income statement': 'financials',
        'balance sheet': 'balance-sheet',
        'cash flow': 'cash-flow',
    }
    try:
        path = statement_paths[statement_type]
    except KeyError:
        valid = ', '.join(repr(name) for name in statement_paths)
        raise KeyError(
            f'Unknown statement_type {statement_type!r}; expected one of: {valid}'
        ) from None

    url = f'https://finance.yahoo.com/quote/{stock_ticker}/{path}?p={stock_ticker}'

    # A timeout keeps the notebook from hanging forever, and raise_for_status
    # stops us from parsing an error page as if it were a statement.
    page = requests.get(url, timeout=timeout)
    page.raise_for_status()

    # NOTE(review): 'html' lets Beautiful Soup choose among the installed HTML
    # parsers, so the parse (and therefore table_start) may depend on the
    # environment. Left unchanged so the existing offset keeps working.
    soup = BeautifulSoup(page.text, 'html')

    # Text of every <div> that directly holds a single string; empty ones are dropped.
    div_strings = [div.string for div in soup.find_all('div')]
    cells = [text for text in div_strings if text][table_start:]

    # Some financial statements have fewer columns than others. The position
    # of the last date header tells us how many columns the table has.
    date_positions = [
        position for position, text in enumerate(cells)
        if re.search(r'(\d+/\d+/\d+)', text)
    ]
    if not date_positions:
        raise IndexError(
            f'No date headers found at {url}; the page layout may have changed '
            f'(table_start={table_start}).'
        )
    n_columns = date_positions[-1] + 1

    # Chunk the flat list of cells into rows of n_columns cells each
    # (a trailing incomplete row is discarded by zip).
    rows = list(zip(*[iter(cells)] * n_columns))

    df = pd.DataFrame(rows)
    df.columns = df.iloc[0]  # First row holds the header (label column + periods)
    df = df.iloc[1:]

    # At this point the frame mirrors the layout of the Yahoo Finance page:
    # one row per line item. Transpose so each reporting period is a row.
    df = df.T
    df.columns = df.iloc[0]  # First row now holds the line-item names
    df = df.drop(df.index[0])
    df.index.name = ''  # Remove the index name

    # Rename 'ttm' to the current date so that, once the index is reduced to
    # years, 'TTM' ends up farthest to the right on graphs.
    df = df.rename(index={'ttm': date.today().strftime('%m/%d/%Y')})
    df.index = pd.to_datetime(df.index)

    # Strip thousands separators, then convert; anything non-numeric becomes NaN.
    df = df.apply(lambda column: column.str.replace(',', ''))
    df = df.apply(pd.to_numeric, errors='coerce')

    df = df.dropna(how='all', axis=1)  # Drop line items with no values at all
    df = df.dropna(thresh=6, axis=0)   # Keep only periods with at least 6 non-missing values
    df = df.fillna(0)                  # Fill remaining gaps with zeros

    df = df.sort_index()
    df.index = df.index.year  # Change index to year only
    df = df.multiply(1000)    # Yahoo Finance - all numbers are in thousands

    # Add company name, taken from the page's <h1> heading: the text after
    # '- ', cut at the first '&' or 'Ltd.'.
    df['Co'] = soup.h1.text.split('- ')[1].split('&')[0].split('Ltd.')[0].strip()

    # Yahoo Finance currently has a typo in the cash flow statement. Let's fix it.
    # DataFrame.rename ignores labels that are not present, so no try/except is
    # needed; once the typo on the website is corrected this becomes a no-op
    # and can be removed.
    if statement_type == 'cash flow':
        df = df.rename(columns={
            'Net cash used privided by (used for) financing activities':
                'Net cash provided by (used for) financing activities',
        })

    return df

In [27]:
### DATA TEST ###
# Scrape the three statements for each cruise line, always in the order
# income statement -> balance sheet -> cash flow.
(Norwegian_Income_Statement,
 Norwegian_Balance_Sheet,
 Norwegian_Cash_Flow) = [
    get_statement('NCLH', kind)
    for kind in ('income statement', 'balance sheet', 'cash flow')
]

(Royal_Caribbean_Income_Statement,
 Royal_Caribbean_Balance_Sheet,
 Royal_Caribbean_Cash_Flow) = [
    get_statement('RCL', kind)
    for kind in ('income statement', 'balance sheet', 'cash flow')
]

(Carnival_Income_Statement,
 Carnival_Balance_Sheet,
 Carnival_Cash_Flow) = [
    get_statement('CCL', kind)
    for kind in ('income statement', 'balance sheet', 'cash flow')
]


In [28]:
###### NORWEGIAN TEST #######

# Expected figures, organised as (error label, scraped statement, {line item: {year: value}}).
# The line-item keys must match the scraped column labels exactly, which is
# why the 'activites' spelling is kept as-is.
norwegian_expected = [
    # Norwegian Income Statement Test
    ('Norwegian Income Statement Error', Norwegian_Income_Statement, {
        'Total Revenue': {2018: 6055126000, 2017: 5396175000, 2016: 4874340000},
        'Total Operating Expenses': {2018: 1458989000, 2017: 1283712000, 2016: 1098651000},
        'Net Income': {2018: 954843000, 2017: 759872000, 2016: 633085000},
    }),
    # Norwegian Balance Sheet Test - No 2019 column listed as of yet
    ('Norwegian Balance Sheet Error', Norwegian_Balance_Sheet, {
        'Total Current Assets': {2018: 550313000, 2017: 518337000, 2016: 411093000},
        'Net property, plant and equipment': {2018: 12119253000, 2017: 11040488000, 2016: 10117689000},
        'Total Assets': {2018: 15205970000, 2017: 14094869000, 2016: 12973911000},
    }),
    # Norwegian Cash Flow Test
    ('Norwegian Cash Flow Error', Norwegian_Cash_Flow, {
        'Net cash provided by operating activites': {2018: 2075171000, 2017: 1585741000, 2016: 1239666000},
        'Net cash used for investing activites': {2018: -1502708000, 2017: -1404898000, 2016: -1128914000},
        'Free Cash Flow': {2018: 508375000, 2017: 213527000, 2016: 147575000},
    }),
]

# Compare every expected figure with the scraped one; the failure message
# names the exact line item and year so a mismatch is easy to track down.
for error_label, statement, line_items in norwegian_expected:
    for line_item, value_by_year in line_items.items():
        for year, expected in value_by_year.items():
            actual = statement[line_item][year]
            assert actual == expected, (
                f'{error_label}: {line_item} {year} expected {expected}, got {actual}'
            )




###### ROYAL CARIBBEAN TEST #######

# Expected figures, organised as (error label, scraped statement, {line item: {year: value}}).
# The line-item keys must match the scraped column labels exactly, which is
# why the 'activites' spelling is kept as-is.
royal_caribbean_expected = [
    # Royal Caribbean Income Statement Test
    ('Royal Caribbean Income Statement Error', Royal_Caribbean_Income_Statement, {
        'Total Revenue': {2018: 9493849000, 2017: 8777845000, 2016: 8496401000},
        'Total Operating Expenses': {2018: 2336841000, 2017: 2137210000, 2016: 1995205000},
        'Net Income': {2018: 1811042000, 2017: 1625133000, 2016: 1283388000},
    }),
    # Royal Caribbean Balance Sheet Test (Total Assets also checks the 2019 column)
    ('Royal Caribbean Balance Sheet Error', Royal_Caribbean_Balance_Sheet, {
        'Total Current Assets': {2018: 1242044000, 2017: 843028000, 2016: 748305000},
        'Net property, plant and equipment': {2018: 23466163000, 2017: 19735180000, 2016: 20161427000},
        'Total Assets': {2019: 30320284000, 2018: 27698270000, 2017: 22296317000, 2016: 22310324000},
    }),
    # Royal Caribbean Cash Flow Test
    ('Royal Caribbean Cash Flow Error', Royal_Caribbean_Cash_Flow, {
        'Net cash provided by operating activites': {2018: 3479139000, 2017: 2874566000, 2016: 2516690000},
        'Net cash used for investing activites': {2018: -4489158000, 2017: -213592000, 2016: -2724892000},
        'Free Cash Flow': {2018: -180889000, 2017: 2310428000, 2016: 22327000},
    }),
]

# Compare every expected figure with the scraped one; the failure message
# names the exact line item and year so a mismatch is easy to track down.
for error_label, statement, line_items in royal_caribbean_expected:
    for line_item, value_by_year in line_items.items():
        for year, expected in value_by_year.items():
            actual = statement[line_item][year]
            assert actual == expected, (
                f'{error_label}: {line_item} {year} expected {expected}, got {actual}'
            )



###### CARNIVAL TEST #######

# Expected figures, organised as (error label, scraped statement, {line item: {year: value}}).
# The line-item keys must match the scraped column labels exactly, which is
# why the 'activites' spelling is kept as-is.
carnival_expected = [
    # Carnival Income Statement Test
    ('Carnival Income Statement Error', Carnival_Income_Statement, {
        'Total Revenue': {2018: 18881000000, 2017: 17510000000, 2016: 16389000000},
        'Total Operating Expenses': {2018: 4467000000, 2017: 4111000000, 2016: 3935000000},
        'Net Income': {2018: 3152000000, 2017: 2606000000, 2016: 2779000000},
    }),
    # Carnival Balance Sheet Test
    ('Carnival Balance Sheet Error', Carnival_Balance_Sheet, {
        'Total Current Assets': {2018: 2225000000, 2017: 1596000000, 2016: 1689000000},
        'Net property, plant and equipment': {2018: 35336000000, 2017: 34430000000, 2016: 32429000000},
        'Total Assets': {2018: 42401000000, 2017: 40778000000, 2016: 38936000000},
    }),
    # Carnival Cash Flow Test
    ('Carnival Cash Flow Error', Carnival_Cash_Flow, {
        'Net cash provided by operating activites': {2018: 5549000000, 2017: 5322000000, 2016: 5134000000},
        'Net cash used for investing activites': {2018: -3502000000, 2017: -3089000000, 2016: -3323000000},
        'Free Cash Flow': {2018: 1800000000, 2017: 2378000000, 2016: 2072000000},
    }),
]

# Compare every expected figure with the scraped one; the failure message
# names the exact line item and year so a mismatch is easy to track down.
for error_label, statement, line_items in carnival_expected:
    for line_item, value_by_year in line_items.items():
        for year, expected in value_by_year.items():
            actual = statement[line_item][year]
            assert actual == expected, (
                f'{error_label}: {line_item} {year} expected {expected}, got {actual}'
            )

print('Test complete. Good to go!')

Test complete. Good to go!
