# Data Collection
All KMB data was manually exported from various Morningstar tabs ([here](https://www.morningstar.com/stocks/xnas/kmb/quote)).  
The figures use the original statements, not restatements.  

# Share Information  
No spinoffs or stock splits since 2014

In [16]:
import undetected_chromedriver as uc

# Open the dividends page once in a headless Chrome session. Nothing is read
# from the page in this cell.
options = uc.ChromeOptions()
options.add_argument('--headless')

driver = uc.Chrome(options=options, version_main=144)
dividend_url = r"https://www.morningstar.com/stocks/xnas/kmb/dividends"
try:
    driver.get(dividend_url)
finally:
    # Always shut this session down: `driver` is rebound to a brand-new
    # browser further down, and the final `driver.quit()` only closes that
    # second one, so this Chrome process would otherwise be left running.
    driver.quit()

## yields

In [17]:
# Standard library
import re
import time
from pathlib import Path

# Third-party
import pandas as pd
import undetected_chromedriver as uc
from bs4 import BeautifulSoup

# Morningstar dividends page for KMB.
dividend_url = r"https://www.morningstar.com/stocks/xnas/kmb/dividends"

# Headless Chrome session; `get_html` reads this module-level `driver`.
options = uc.ChromeOptions()
options.add_argument('--headless')
driver = uc.Chrome(options=options, version_main=144)

In [18]:
def get_html(dividend_url: str) -> str:
    """Load a page in the shared browser and return its HTML source.

    Navigates the module-level ``driver`` to ``dividend_url``, pauses for a
    fixed three seconds so the page has time to load, and returns
    ``driver.page_source``.
    """
    driver.get(dividend_url)
    time.sleep(3)  # fixed pause; gives the page time to finish loading
    return driver.page_source

In [19]:
def parse_table(yield_html: str) -> list:
    """Extract the cell text of the page's report table.

    Looks up the first ``<div class="report-table">`` in ``yield_html`` and
    reads the ``<thead>`` and ``<tbody>`` found inside it.

    Args:
        yield_html: HTML source of the page.

    Returns:
        A list of rows, each a list of whitespace-stripped cell strings.
        Every ``<tr>`` in the table body contributes one row built from its
        ``<td>`` cells only (``<th>`` cells in body rows are not included).
        The ``<th>`` texts of the first header row are appended as the LAST
        element of the list.

    Raises:
        ValueError: If the report table, its head, its body, or the header
            row cannot be found in ``yield_html``.
    """
    # Name the parser explicitly so the result does not depend on which
    # optional parsers are installed (and to avoid bs4's parser warning).
    soup = BeautifulSoup(yield_html, "html.parser")

    div_table = soup.find("div", class_="report-table")
    if div_table is None:
        raise ValueError('no <div class="report-table"> found in the HTML')

    header_html = div_table.find("thead")
    body_html = div_table.find("tbody")
    if header_html is None or body_html is None:
        raise ValueError("report table is missing its <thead> or <tbody>")

    header_row = header_html.find("tr")
    if header_row is None:
        raise ValueError("report table header contains no <tr>")

    # header cells (the caller decides what they mean, e.g. years)
    headers = [th.text.strip() for th in header_row.find_all("th")]

    # one list of cell texts per body row
    parsed_data = [
        [td.text.strip() for td in tr.find_all("td")]
        for tr in body_html.find_all("tr")
    ]

    # the header row always goes last
    parsed_data.append(headers)

    return parsed_data

def parse_yield(parsed_data: list) -> dict:
    """Turn table rows into a ``{row label: [values]}`` mapping.

    Each row is processed as follows:

    1. Cells equal to the em dash ``"—"`` (missing values) become ``None``.
    2. The last three cells are dropped (summary statistics).
    3. The row is discarded if nothing is left, if its first cell is empty
       or missing, or if its first cell starts with a digit ``1``-``9``.
    4. Otherwise the first cell becomes the key and the remaining cells the
       value. A later row with the same label overwrites an earlier one.

    Args:
        parsed_data: List of rows, each a list of cell strings.

    Returns:
        Dict mapping each kept row's first cell to the list of its other
        cells (strings or ``None``).
    """
    labelled = {}
    for row in parsed_data:
        cells = [None if cell == "—" else cell for cell in row]  # rename missing values
        cells = cells[:-3]  # remove summary statistics

        # Rows with three or fewer cells are empty after trimming; skip them
        # instead of failing on cells[0].
        if not cells:
            continue

        label = cells[0]
        if not label or label[0] in "123456789":
            continue

        labelled[label] = cells[1:]

    return labelled

In [20]:
def export_yields(parsed_data: dict) -> None:
    """Write the yield table to ``cleaned/yields.csv``.

    Builds a DataFrame from ``parsed_data`` (one column per key), renames
    the columns, converts them to numeric types, rescales the percentage
    columns to fractions, and writes the result as CSV.

    Args:
        parsed_data: Mapping of row label to list of values. The columns
            are renamed BY POSITION, so the mapping must hold exactly six
            entries in this order: annual per-share dividend, trailing
            dividend yield, buyback yield, total yield, payout ratio,
            fiscal year.

    Raises:
        ValueError: If ``parsed_data`` does not yield exactly six columns or
            a value cannot be converted to a number.
    """
    percent_cols = ["trailing_div_yield", "buyback_yield", "total_yield", "payout_ratio"]
    float_cols = ["annual_per_share_div"] + percent_cols

    # prepare for export
    df = pd.DataFrame(parsed_data)
    df.columns = float_cols + ["fiscal_year"]
    df[float_cols] = df[float_cols].astype(float)
    df["fiscal_year"] = df["fiscal_year"].astype(int)
    df[percent_cols] = df[percent_cols] / 100  # percent -> fraction

    cleaned_fp = Path("cleaned")
    # Create the output directory on first use; to_csv does not create it.
    cleaned_fp.mkdir(parents=True, exist_ok=True)
    df.to_csv(cleaned_fp / "yields.csv", float_format=r"%.12f", index=False)


In [21]:
# Yield pipeline: fetch the dividends page, extract the report-table rows,
# reshape them into a label -> values mapping, and write cleaned/yields.csv.
yield_html = get_html(dividend_url)
parsed_table = parse_table(yield_html)
parsed_yields = parse_yield(parsed_table)
export_yields(parsed_yields)


## ownership

In [22]:
# Morningstar ownership page for KMB; fetched with get_html further down.
ownership_url = r"https://www.morningstar.com/stocks/xnas/kmb/ownership"

In [23]:
def parse_ownership(parsed_table):
    """Normalise the text cells of the ownership table.

    A cell becomes ``None`` when it is empty, contains the text ``Unlock``,
    or contains an em dash. Every other cell is kept with its commas
    removed. The row/column layout of ``parsed_table`` is preserved.
    """
    def _clean(cell):
        if len(cell) == 0 or 'Unlock' in cell or "—" in cell:
            return None
        return cell.replace(',', '')

    cleaned_rows = []
    for row in parsed_table:
        cleaned_rows.append([_clean(cell) for cell in row])
    return cleaned_rows

def export_ownership(parsed_ownership):
    """Clean the ownership rows and write them to ``cleaned/ownership.csv``.

    Args:
        parsed_ownership: List of rows of cleaned cell values (strings or
            ``None``). The LAST element is the header row; its first label
            is skipped, so it must hold one more entry than each data row.
            After all-empty columns are dropped, exactly six columns must
            remain, in the order of the names assigned below.

    The written ``percent_position_change`` is ``shares_bought /
    shares_owned`` computed here, not the percentage text from the table.

    Raises:
        ValueError: If the column counts do not line up or a value cannot be
            converted to a number.
    """
    percent_cols = ["percent_ownership", "percent_portfolio_allocated", "percent_position_change"]
    share_cols = ["shares_owned", "shares_bought"]

    df = pd.DataFrame(parsed_ownership[:-1])
    df.columns = parsed_ownership[-1][1:]
    df = df.dropna(axis=1, how="all")  # remove paid features
    df.columns = ["percent_ownership", "percent_portfolio_allocated", "shares_owned", "shares_bought", "percent_position_change", "recent_ownership_date"]

    # remove ownership summary statistics (rows without a date); take an
    # explicit copy so the assignments below modify an independent frame
    # rather than a slice of the unfiltered one.
    df = df[~df["recent_ownership_date"].isna()].copy()

    # if all shares were purchased in the recent transaction, consider that
    # as buying 100% of the portfolio's shares
    df["percent_position_change"] = df["percent_position_change"].fillna("100.00%")
    # top holder, but didn't change portfolio allocation
    df["shares_bought"] = df["shares_bought"].fillna("0")
    df["percent_position_change"] = df["percent_position_change"].str.replace("%", "")

    df[percent_cols] = df[percent_cols].astype(float) / 100  # percent -> fraction
    df[share_cols] = df[share_cols].astype(int)
    # recalculate manually for better precision
    df["percent_position_change"] = df["shares_bought"] / df["shares_owned"]
    df["recent_ownership_date"] = pd.to_datetime(df["recent_ownership_date"])

    cleaned_fp = Path("cleaned")
    # Create the output directory on first use; to_csv does not create it.
    cleaned_fp.mkdir(parents=True, exist_ok=True)
    df.to_csv(cleaned_fp / "ownership.csv", float_format=r"%.12f", index=False)

In [24]:
# Ownership pipeline: fetch the ownership page, extract the report-table
# rows, normalise the cells, and write cleaned/ownership.csv.
ownership_html = get_html(ownership_url)
parsed_table = parse_table(ownership_html)
parsed_ownership = parse_ownership(parsed_table)
export_ownership(parsed_ownership)

In [25]:
# Scraping is done: close the browser session bound to `driver`.
driver.quit()