# Scrape sector information for multiple mutual funds from Yahoo Finance

This works by using splinter to open the URL in a browser.  We must do it this way because the data we are looking for is not in the HTML if you look at it outside of a browser. The data is loaded into the web page by JavaScript, so if we just did a "requests.get(url)" the data we are after would not be there.  We must 
load a browser and scrape/parse that.

### Next Steps in Python
* Fix the issue where it is not going to the next mutual fund
* Convert the values in the data frame from strings to values
* Export that data frame as a CSV file

### Next step in Excel 
* Import CSV file in my financial spreadsheet
* Report my exposure to each of these sectors.


In [3]:
# Website:  https://finance.yahoo.com/quote/VTSAX/holdings?p=VTSAX

# Import Splinter and BeautifulSoup
from splinter import Browser
from bs4 import BeautifulSoup
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import os
import json

In [4]:
# Ask webdriver_manager to install chromedriver and wrap whatever install()
# returns (presumably the driver's path - confirm against the library docs)
# in the keyword argument that is handed to splinter's Browser.
executable_path = dict(executable_path=ChromeDriverManager().install())

# Open a visible (non-headless) Chrome session.  The scraping functions in
# this notebook use this module-level `browser` object.
browser = Browser('chrome', headless=False, **executable_path)

---
## Create Functions

## Build a function that splits a string at the first number

In [5]:
# FUNCTION
# DESCRIPTION:  Cuts a string in two at its first digit and pairs the two pieces up in a dictionary.
#               If the string holds no digit at all, the whole string is the key and the value is ''.
# ARGUMENTS:  string - text made of a label followed directly by a value, e.g. 'Technology51.32%'
# RETURN:  A one-entry dictionary: the text before the first digit is the key,
#          the text from the first digit onward is the value.

def split_string_at_first_number(string):
    # Locate the first digit; default to the end of the string so that the
    # slices below still behave when no digit is present.
    split_at = next(
        (position for position, character in enumerate(string) if character.isdigit()),
        len(string),
    )

    # Label on the left of the cut, value on the right.
    return {string[:split_at]: string[split_at:]}

## Build a function that scrapes and returns the sector weights for one ticker

In [6]:
# FUNCTION
# DESCRIPTION:  Find the industry sector loading of a mutual fund by scraping its Yahoo Finance holdings page.
#               Uses the module-level `browser` object and prints the visited URL for diagnostics.
# ARGUMENTS:  mutual_fund - the ticker symbol of the fund to be scraped
# RETURN:  a one-row data frame: 'Fund_Code' holds the ticker, the other columns are the
#          label/value pairs scraped from the page (values are kept as strings)

def one_sector(mutual_fund):
    # Build the full URL to be scraped from the constant pieces of the
    # Yahoo Finance address and the ticker symbol.
    url = f'https://finance.yahoo.com/quote/{mutual_fund}/holdings?p={mutual_fund}'

    # Print the full URL - this is done only for diagnostic purposes
    print(url)

    # The data we want is loaded into the page by JavaScript, so a plain
    # "requests.get(url)" would not contain it.  Open the page in the browser
    # and parse the HTML the browser ends up with.
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'html.parser')

    # Start a dictionary with the key-value pair for the fund name.
    # Use the function's own argument here, not the loop variable of the
    # calling cell, so the function also works when called on its own.
    industry_sec = {'Fund_Code': mutual_fund}

    # The search string comes from "inspecting" the website where the data is
    # located.  It does not uniquely identify sector information, so extra
    # entries are picked up as well and must be filtered out by the caller.
    sector_rows = soup.find_all(
        'div',
        class_='Bdbw(1px) Bdbc($seperatorColor) Bdbs(s) H(25px) Pt(10px)',
    )

    for sector_row in sector_rows:
        # Each row's text is a name immediately followed by a value; split it
        # and add the resulting key-value pair to the dictionary.
        industry_sec.update(split_string_at_first_number(str(sector_row.text)))

    # Build a one-row data frame from the dictionary.
    return pd.DataFrame(industry_sec, index=[0])

## Build function that scrapes and returns the 1, 3, 5-year returns of one ticker

In [7]:
# FUNCTION
# DESCRIPTION:  Scrape the trailing returns of a fund from its Yahoo Finance performance page.
#               Uses the module-level `browser` object and prints the visited URL for diagnostics.
# ARGUMENTS:  mutual_fund - the ticker symbol of the fund to be scraped
# RETURN:  a one-row data frame: 'Fund_Code' holds the ticker and there is one column per
#          period ('1-Month' ... '10-Year').  Values are the scraped strings, or None when
#          the period could not be found on the page.

def one_return(mutual_fund):
    # Build the full URL to be scraped from the constant pieces of the
    # Yahoo Finance address and the ticker symbol.
    url = f'https://finance.yahoo.com/quote/{mutual_fund}/performance?p={mutual_fund}'

    # Print the full URL - this is done only for diagnostic purposes
    print(url)

    # The data we want is loaded into the page by JavaScript, so a plain
    # "requests.get(url)" would not contain it.  Open the page in the browser
    # and parse the HTML the browser ends up with.
    browser.visit(url)
    soup = BeautifulSoup(browser.html, 'html.parser')

    # Start a dictionary with the key-value pair for the fund name.
    # Use the function's own argument here, not the loop variable of the
    # calling cell, so the function also works when called on its own.
    ticker_returns = {'Fund_Code': mutual_fund}

    periods = [
        '1-Month',
        '3-Month',
        '1-Year',
        '3-Year',
        '5-Year',
        '10-Year',
    ]

    for period in periods:
        # Find the <span> whose text is the period label, then step forward
        # three <span> elements to reach the one holding the fund's return.
        # (`string=` is the current name of bs4's deprecated `text=` argument.)
        cell = soup.find('span', string=period)
        for _ in range(3):
            if cell is None:
                break
            cell = cell.find_next('span')

        # A label that is absent from the page is recorded as None instead of
        # raising AttributeError and aborting the scrape of this fund.
        ticker_returns[period] = cell.text if cell is not None else None

    # Build a one-row data frame from the dictionary.
    return pd.DataFrame(ticker_returns, index=[0])


---
# Start the search for my information

In [8]:
# Ticker symbols of the funds to scrape, in the order they are visited.
funds = [
    'BDBKX',
    'DIA',
    'QQQ',
    'RERGX',
    'RWMGX',
    'VAW',
    'VBK',
    'VDC',
    'VDE',
    'VEMPX',
    'VEUSX',
    'VEXAX',
    'VFIAX',
    'VGK',
    'VHT',
    'VIGIX',
    'VIS',
    'VMVAX',
    'VNQ',
    'VOO',
    'VOT',
    'VPU',
    'VTI',
    'VTIAX',
    'VTPSX',
    'VXF',
    'VXUS',
    'WFSPX',
]

# The scraped sector data frame often carries more columns than just the
# industry sectors.  These are the columns to keep: the fund code plus the
# columns that really are industry sectors.
industry_sector_col = [
    'Fund_Code',
    'Basic Materials',
    'Consumer Cyclical',
    'Financial Services',
    'Real Estate',
    'Consumer Defensive',
    'Healthcare',
    'Utilities',
    'Communication Services',
    'Energy',
    'Industrials',
    'Technology',
]

# Performance periods, used as the column headings of the returns rollup.
periods = [
    '1-Month',
    '3-Month',
    '1-Year',
    '3-Year',
    '5-Year',
    '10-Year',
]

# Empty rollup data frames that only carry the column headings; one row per
# fund is appended to each of them while scraping.
rollup_sector_df = pd.DataFrame(columns=industry_sector_col)
rollup_return_df = pd.DataFrame(columns=periods)

In [9]:

# Move through the list of funds, scraping each one and adding its row to
# the two rollup data frames.
for fund in funds:

    # Go scrape the industry sector information for one ticker.
    sector_df = one_sector(fund)

    # Clean the sector_df: keep only the needed columns.  reindex() is used
    # rather than sector_df[industry_sector_col] because plain column
    # selection raises KeyError - and stops the whole loop - as soon as one
    # expected sector was not scraped for a fund.  With reindex() such a
    # column is simply filled with NaN for that fund.
    sector_df = sector_df.reindex(columns=industry_sector_col)

    # Add the sector information from this ticker to the rollup data frame.
    # The rollup is extended on every pass so that the rows gathered so far
    # are still available if a later fund fails.
    rollup_sector_df = pd.concat([rollup_sector_df, sector_df])

    # Go scrape the returns information for one ticker.
    return_df = one_return(fund)

    # Add the return information from this ticker to the rollup data frame.
    rollup_return_df = pd.concat([rollup_return_df, return_df])


https://finance.yahoo.com/quote/BDBKX/holdings?p=BDBKX
https://finance.yahoo.com/quote/BDBKX/performance?p=BDBKX
https://finance.yahoo.com/quote/DIA/holdings?p=DIA
https://finance.yahoo.com/quote/DIA/performance?p=DIA
https://finance.yahoo.com/quote/QQQ/holdings?p=QQQ
https://finance.yahoo.com/quote/QQQ/performance?p=QQQ
https://finance.yahoo.com/quote/RERGX/holdings?p=RERGX
https://finance.yahoo.com/quote/RERGX/performance?p=RERGX
https://finance.yahoo.com/quote/RWMGX/holdings?p=RWMGX
https://finance.yahoo.com/quote/RWMGX/performance?p=RWMGX
https://finance.yahoo.com/quote/VAW/holdings?p=VAW
https://finance.yahoo.com/quote/VAW/performance?p=VAW
https://finance.yahoo.com/quote/VBK/holdings?p=VBK
https://finance.yahoo.com/quote/VBK/performance?p=VBK
https://finance.yahoo.com/quote/VDC/holdings?p=VDC
https://finance.yahoo.com/quote/VDC/performance?p=VDC
https://finance.yahoo.com/quote/VDE/holdings?p=VDE
https://finance.yahoo.com/quote/VDE/performance?p=VDE
https://finance.yahoo.com/quote

In [10]:
# Combine sector weights and returns into one frame, matching rows on the
# fund code (inner join, which is also merge()'s default).
ticker_info_df = rollup_sector_df.merge(
    right=rollup_return_df,
    how='inner',
    on='Fund_Code',
)

In [11]:
# Show up to the first 30 rows of the combined frame.
ticker_info_df.head(n=30)

Unnamed: 0,Fund_Code,Basic Materials,Consumer Cyclical,Financial Services,Real Estate,Consumer Defensive,Healthcare,Utilities,Communication Services,Energy,Industrials,Technology,1-Month,3-Month,1-Year,3-Year,5-Year,10-Year
0,BDBKX,4.23%,10.27%,15.86%,7.66%,4.21%,16.08%,3.46%,2.56%,6.66%,14.71%,13.51%,6.34%,6.65%,11.26%,11.26%,4.19%,8.26%
1,DIA,0.98%,13.48%,20.18%,0.00%,6.98%,20.09%,0.00%,2.47%,3.01%,13.94%,18.79%,3.35%,4.82%,12.26%,12.15%,9.24%,11.02%
2,QQQ,0.00%,14.48%,0.49%,0.23%,5.36%,5.78%,1.03%,17.14%,0.36%,3.81%,51.32%,4.57%,16.50%,29.11%,15.12%,17.03%,18.78%
3,RERGX,8.12%,13.81%,11.12%,0.35%,6.18%,13.17%,0.94%,3.07%,8.21%,13.65%,14.95%,0.94%,2.65%,13.49%,4.98%,4.04%,6.17%
4,RWMGX,2.71%,6.96%,14.01%,1.98%,8.46%,18.38%,2.75%,5.08%,5.04%,11.17%,18.99%,3.59%,7.51%,12.10%,14.78%,10.59%,11.44%
5,VAW,86.30%,10.93%,0.00%,0.00%,0.00%,0.82%,0.00%,0.00%,0.00%,1.57%,0.00%,8.14%,4.10%,13.56%,17.07%,8.57%,9.79%
6,VBK,2.18%,11.60%,3.52%,5.87%,3.77%,20.09%,0.42%,2.82%,4.49%,15.53%,27.02%,6.33%,7.97%,14.93%,5.60%,5.92%,9.11%
7,VDC,0.20%,0.62%,0.00%,0.00%,97.47%,0.93%,0.00%,0.00%,0.00%,0.16%,0.00%,1.56%,1.34%,6.67%,11.79%,10.22%,9.32%
8,VDE,0.37%,0.00%,0.00%,0.00%,0.00%,0.00%,0.13%,0.00%,98.90%,0.00%,0.00%,3.89%,-0.13%,14.81%,37.23%,5.45%,3.40%
9,VEMPX,3.46%,11.30%,13.22%,6.60%,2.85%,12.94%,1.90%,3.76%,4.06%,13.99%,22.92%,6.90%,8.28%,13.05%,9.63%,6.02%,9.29%


In [12]:
# Shut down the Chrome browser session that splinter opened earlier in the
# notebook (ChromeDriverManager only supplied the driver for it).
browser.quit()

In [13]:
# Write the combined data frame as a CSV file

# Build the path and file name for the CSV file.
file_one = os.path.join('.', 'Resources', 'Industry_Sector_Return_Info.csv')

# Make sure the target folder exists: to_csv() does not create missing
# directories and would fail on a fresh checkout without ./Resources.
os.makedirs(os.path.dirname(file_one), exist_ok=True)

# Write out the file as a CSV file with headers but without an index.
ticker_info_df.to_csv(file_one, index=False, header=True)