# Scrapping financial information

In [185]:
%load_ext blackcellmagic
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
from itertools import chain
import pandas as pd
from tabulate import tabulate
import os

The blackcellmagic extension is already loaded. To reload it, use:
  %reload_ext blackcellmagic


In [186]:
def flatMap(function_to_apply, list_of_inputs, filter_to_apply=lambda x: True):
    return list(
        chain.from_iterable(
            [function_to_apply(el) for el in list_of_inputs if filter_to_apply(el)]
        )
    )


In [197]:
companies = {"LVMH": "LVMH.PA", "Aribus": "AIR.PA", "Danone": "DANO.PA"}
companiesData = []

In [193]:
# create a new Chrome session
driver = webdriver.Chrome("../../chromedriver")
driver.implicitly_wait(30)

In [198]:
%%time
for company in companies:
    # New record with company name
    companyData = {"company": company}
    # Build url with company stock name
    url = (
        "https://www.reuters.com/finance/stocks/financial-highlights/"
        + companies[company]
    )
    # get url and feed it to beautifulSoup
    driver.get(url)
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # Get all tables with class dataTable
    tables = soup.find_all("table", class_="dataTable")
    for table in tables:
        # Parse Sales table
        if "SALES (in millions)" in table.text:
            dfs = pd.read_html(str(table), header=[0])
            if len(dfs) < 1:
                print("No sales found for company " + company)
            else:
                # Suppose there is one possible dataframe and thus get first one in list
                df = dfs[0]
                # Get index of line after "SALES (in millions)"
                q4_sales_index = (
                    df.loc[df[df.columns[0]] == "SALES (in millions)"].index.tolist()[0]
                    + 1
                )
                companyData["sales_mean"] = df.at[q4_sales_index, "Mean"]
                companyData["sales_low"] = df.at[q4_sales_index, "High"]
                companyData["sales_high"] = df.at[q4_sales_index, "Low"]

        elif "Dividend Yield" in table.text:
            dfs = pd.read_html(str(table), header=[0])
            if len(dfs) < 1:
                print("No dividend data found for company " + company)
            else:
                df = dfs[0]
                dividend_yield_index = df.loc[
                    df[df.columns[0]] == "Dividend Yield"
                ].index.tolist()[0]
                companyData["dividend_yield_company"] = df.at[
                    dividend_yield_index, "Company"
                ]
                companyData["dividend_yield_industry"] = df.at[
                    dividend_yield_index, "industry"
                ]
                companyData["dividend_yield_sector"] = df.at[
                    dividend_yield_index, "sector"
                ]

        elif "% Shares Owned:" in table.text:
            dfs = pd.read_html(str(table), header=None)
            if len(dfs) < 1:
                print("No shares data found for company " + company)
            else:
                df = dfs[0]
                shares_index = df.loc[
                    df[df.columns[0]] == "% Shares Owned:"
                ].index.tolist()[0]
                companyData["institutional_shares"] = df.at[shares_index, df.columns[1]]

    divs = soup.find_all("div", class_="sectionQuoteDetail")

    price_regex = re.compile("[0-9]{1,9}[.,][0-9]{1,3}$")
    prices = flatMap(
        lambda div: div.findAll("span", text=price_regex),
        divs,
        lambda div: "on Paris Stock Exchange" in div.text,
    )
    if len(prices) < 1:
        print("No price found for company " + company)
    else:
        companyData["stock_price"] = float(prices[0].text)

    change_regex = re.compile("[-+][0-9]{1,9}[.,][0-9]{1,3}%")
    changes = flatMap(
        lambda div: div.findAll("span", text=change_regex),
        divs,
        lambda div: "Change" in div.text,
    )
    if len(changes) < 1:
        print("No change found for company " + company)
    else:
        companyData["stock_change"] = re.findall(change_regex, changes[0].text)[0]

    companiesData.append(companyData)


CPU times: user 362 ms, sys: 17.2 ms, total: 379 ms
Wall time: 2.49 s


In [199]:
companiesDf = pd.DataFrame(companiesData)
companiesDf.head()

Unnamed: 0,company,dividend_yield_company,dividend_yield_industry,dividend_yield_sector,institutional_shares,sales_high,sales_low,sales_mean,stock_change,stock_price
0,LVMH,1.92,1.7,2.6,20.57%,13575.0,13769.0,13667.7,-2.04%,259.9
1,Aribus,1.45,1.34,1.64,43.53%,21431.0,26073.4,23493.0,-1.27%,97.73
2,Danone,2.9,2.78,2.48,50.60%,6025.0,6142.0,6072.6,-1.52%,64.34


In [200]:
driver.quit()

In [201]:
companiesDf.to_csv("./companies.csv")