# Reuters web crawler


In [1]:
import requests
import unittest
from bs4 import BeautifulSoup


In [2]:
# Base URL of the Reuters financial-highlights pages; get_reuters_data
# appends the page name it is given to this prefix.
website_prefix = "https://www.reuters.com/finance/stocks/financial-highlights/"

In [3]:
def _handle_request_result_and_build_soup(request_result):
    if request_result.status_code == 200:
        html_doc =  request_result.text
        soup = BeautifulSoup(html_doc,"html.parser")
        return soup


In [4]:
def _convert_string_to_float(string):
    temp = string.strip()
    if temp[0]== '(' : # small negative percentage
        return float(temp[1:-2])
    if temp[-1]== '-' : # -- NA value
        return 0.0
    temp = ''.join(temp.split(',')) # remove thousand's comma
    return float(temp)


In [5]:
def get_reuters_data(page, *, timeout=10):
    """Scrape the financial-highlights figures for one Reuters page.

    Args:
        page: Page name appended to ``website_prefix`` to form the URL.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.
            Without one, the request can block indefinitely on an
            unresponsive server.

    Returns:
        A list of four lists, in this order:

        * quarter estimates taken from the first ``dataTable``
          (first and last data cells of the striped row are skipped)
        * ``[nasdaqChange, priceChange]`` read from the quote header
        * the last three data cells of the second ``dataTable``
        * the data cells of the striped row of the third ``dataTable``

    Raises:
        requests.HTTPError: If the response status is not 200, so no page
            could be parsed.
        requests.RequestException: On network failure or timeout.
    """
    res = requests.get(website_prefix + page, timeout=timeout)
    soup = _handle_request_result_and_build_soup(res)
    if soup is None:
        # The helper returns None for non-200 responses; fail here with a
        # clear message instead of an AttributeError on the first lookup.
        raise requests.HTTPError(
            "Unexpected HTTP status {} for {}".format(res.status_code, res.url),
            response=res,
        )

    def to_floats(cells):
        # Convert the text of each table cell to a float.
        return [_convert_string_to_float(cell.text) for cell in cells]

    nasdaq_div = soup.find("div", class_="sectionQuote nasdaqChange")
    nasdaq_spans = nasdaq_div.find_all("span", limit=2)
    # The value of interest is in the second span of the block.
    nasdaq_change = _convert_string_to_float(nasdaq_spans[1].text)

    price_div = soup.find("div", class_="sectionQuote priceChange")
    percent_span = price_div.find("span", class_="valueContentPercent")
    price_change = _convert_string_to_float(percent_span.span.text)

    tables = soup.find_all("table", class_="dataTable", limit=3)
    # tables[0] has Quarter Ending Dec-18
    # tables[1] has % Owned Institutions
    # tables[2] has Dividend Yield
    quarter_row = tables[0].find("tr", class_="stripe")
    quarters = to_floats(quarter_row.find_all("td", class_="data")[1:-1])

    owned = to_floats(tables[1].find_all("td", class_="data")[-3:])

    yield_row = tables[2].find("tr", class_="stripe")
    yields = to_floats(yield_row.find_all("td", class_="data"))

    return [quarters, [nasdaq_change, price_change], owned, yields]

# Result lists

Each call to `get_reuters_data` returns four lists, in this order:

- Quarter Ending Dec-18 [Mean, High, Low]
- Change [value, %]
- % Owned Institutions [Company, industry, sector]
- Dividend Yield [Company, industry, sector]


In [6]:
# Page names handed to get_reuters_data, which appends each one to
# website_prefix. NOTE(review): these look like Reuters codes for
# Paris-listed stocks — confirm.
sources = ["AIR.PA", "LVMH.PA", "DANO.PA"]

In [7]:
# Scrape every configured page; the resulting list is the cell's output.
[get_reuters_data(source) for source in sources]

[[[23493.0, 26073.4, 21431.0],
  [103.62, -0.37],
  [0.0, 0.0, 0.0],
  [1.44, 1.32, 1.62]],
 [[13667.7, 13769.0, 13575.0],
  [281.85, -1.52],
  [0.0, 0.0, 0.0],
  [1.89, 1.69, 2.55]],
 [[6072.6, 6142.0, 6025.0],
  [65.45, -0.05],
  [0.0, 0.18, 0.16],
  [2.49, 2.35, 2.23]]]