# Scrapping algorithm to exract Nasdaq data

In [89]:
from bs4 import BeautifulSoup
from selenium import webdriver
import os
import pandas as pd

In [106]:
# set a chrome driver that will take all information from page
driver = webdriver.Chrome(os.getcwd() + "\chromedriver.exe")
driver.get(url)

df = pd.DataFrame(columns=['Insider', 'Relation', 'Last date', 
                            'Transaction', 'Owner type', 'Shares traded', 
                            'Price', 'Shares held', 'Capitalization'])


  driver = webdriver.Chrome(os.getcwd() + "\chromedriver.exe")


In [107]:
# form a "soup" file, from where can be extracted all the data
soup = BeautifulSoup(driver.page_source, "html.parser")

In [108]:
table = soup.findAll("table", attrs={'class': 'insider-activity__table'})
for record in table[2].findAll("tr", attrs={"class": "insider-activity__row"}):
    cells = record.findAll("td", attrs={"class": "insider-activity__cell"})
    number = cells[6].text.replace(",", "")
    number = cells[6].text.replace("$", "")
    new_row = {'Insider': cells[0].text, 
                'Relation': cells[1].text,
                'Last date': cells[2].text,
                'Transaction': cells[3].text,
                'Owner type': cells[4].text,
                'Shares traded': int(cells[5].text.replace(",", "")),
                'Price': float(number),
                'Shares held': int(cells[7].text.replace(",", "")),
                'Capitalization': int(cells[5].text.replace(",", "")) * float(number)}
    df = df.append(new_row, ignore_index=True)


In [109]:
df

Unnamed: 0,Insider,Relation,Last date,Transaction,Owner type,Shares traded,Price,Shares held,Capitalization
0,KIRKHORN ZACHARY,Officer,10/18/2021,Automatic Sell,Direct,1250,851.47,54912,1064337.5
1,TANEJA VAIBHAV,Officer,10/18/2021,Automatic Sell,Direct,7000,851.98,21212,5963860.0
2,TANEJA VAIBHAV,Officer,10/18/2021,Option Execute,Direct,7000,54.66,28212,382620.0
3,BAGLINO ANDREW D,Officer,09/27/2021,Automatic Sell,Direct,1005,773.42,18380,777287.1
4,BAGLINO ANDREW D,Officer,09/27/2021,Option Execute,Direct,1000,51.64,19385,51640.0
5,KIRKHORN ZACHARY,Officer,09/17/2021,Automatic Sell,Direct,1250,756.95,56162,946187.5
6,BAGLINO ANDREW D,Officer,09/07/2021,Sell,Direct,896,752.9,18385,674598.4
7,TANEJA VAIBHAV,Officer,09/07/2021,Sell,Direct,1767,752.9,21212,1330374.3
8,KIRKHORN ZACHARY,Officer,09/07/2021,Sell,Direct,2780,752.9,57412,2093062.0
9,BAGLINO ANDREW D,Officer,09/05/2021,Option Execute,Direct,1784,0.0,19281,0.0


# Scrapping algorithm to extract Yahoo data

In [111]:
from bs4 import BeautifulSoup
from selenium import webdriver
import os
import pandas as pd

In [123]:
url = "https://finance.yahoo.com/quote/TSLA/insider-transactions?p=TSLA"

# set a chrome driver that will take all information from page
driver = webdriver.Chrome(os.getcwd() + "\chromedriver.exe")
driver.get(url)

df = pd.DataFrame(columns=['Insider', 'Type', 'Value', 'Date', 'Shares', 'Price'])

# form a "soup" file, from where can be extracted all the data
soup = BeautifulSoup(driver.page_source, "html.parser")

  driver = webdriver.Chrome(os.getcwd() + "\chromedriver.exe")


In [124]:
table = soup.findAll("table", attrs={'class': 'W(100%) BdB Bdc($seperatorColor)'})
for record in table[0].findAll("tr", attrs={"class": "BdT Bdc($seperatorColor) Bgc($hoverBgColor):h Whs(nw) H(45px)"}):
    insider = record.findAll("a", attrs={"class": "Tt(u)"})[0].text
    cells = record.findAll("td", attrs={"class": "Ta(end) Pstart(10px)"})
    if cells[2].text == "":
        value = 0
    else:
        value = int(cells[2].text.replace(",", ""))
    new_row = {'Insider': insider, 'Type': cells[1].text,
                'Value': value, 'Date': cells[3].text, 'Shares': int(cells[4].text.replace(",", "")),
                'Price': float(value /
                    float(cells[4].text.replace(",", ""))
                )}
    df = df.append(new_row, ignore_index=True)


In [125]:
df

Unnamed: 0,Insider,Type,Value,Date,Shares,Price
0,TANEJA VAIBHAV,Direct,1330193,"Sep 07, 2021",1767,752.797397
1,KIRKHORN ZACHARY,Direct,2092885,"Sep 07, 2021",2780,752.836331
2,BAGLINO ANDREW D,Direct,674602,"Sep 07, 2021",896,752.904018
3,TANEJA VAIBHAV,Direct,0,"Sep 03, 2021",3666,0.000000
4,KIRKHORN ZACHARY,Direct,0,"Sep 03, 2021",5935,0.000000
...,...,...,...,...,...,...
145,BAGLINO ANDREW D,Direct,418734,"Jul 10, 2020",300,1395.780000
146,BAGLINO ANDREW D,Direct,48386,"Jul 10, 2020",200,241.930000
147,GUILLEN JEROME M,Direct,3250500,"Jul 01, 2020",3000,1083.500000
148,GUILLEN JEROME M,Direct,774540,"Jul 01, 2020",3000,258.180000


# Setting scrapper class

In [137]:
class WebSharesScrapper:
    def __init__(self, list_of_shares: list, path_to_chrome_driver: str):
        """
        initialization of web scrapper
        
        Keyword arguments:
        list_of_shares (list) -- list of string short shares names, they
            will be used for finding correct links on site for writing
            transactions into csv files with respective names
        path_to_chrome_driver (str) -- path to chrome driver, must
            correspond with current version of installed chrome
        """
        self.shares_list = list_of_shares
        self.first_part_url = "https://finance.yahoo.com/quote/"
        self.second_part_url = "/insider-transactions?p="
        self.driver = webdriver.Chrome(path_to_chrome_driver)
        self.columns = ['Insider', 'Type', 'Value', 'Date', 'Shares', 'Price']


    def get_share_dataframe(self, share: str) -> pd.DataFrame:
        """
        get pandas dataframe for given share name
        
        Keyword arguments:
        share (str) -- short share name
        """
        self.driver.get(self.first_part_url + share + self.second_part_url + share)

        df = pd.DataFrame(columns=['Insider', 'Type', 'Value', 'Date', 'Shares', 'Price'])
        soup = BeautifulSoup(self.driver.page_source, "html.parser")

        table = soup.findAll("table", attrs={'class': 'W(100%) BdB Bdc($seperatorColor)'})
        for record in table[0].findAll("tr", attrs={"class": "BdT Bdc($seperatorColor) Bgc($hoverBgColor):h Whs(nw) H(45px)"}):
            insider = record.findAll("a", attrs={"class": "Tt(u)"})[0].text
            cells = record.findAll("td", attrs={"class": "Ta(end) Pstart(10px)"})
            if cells[2].text == "":
                value = 0
            else:
                value = int(cells[2].text.replace(",", ""))
            new_row = {'Insider': insider, 'Type': cells[1].text,
                        'Value': value, 'Date': cells[3].text,
                        'Info': cells[0].text, 'Shares': int(cells[4].text.replace(",", "")),
                        'Price': float(value /
                            float(cells[4].text.replace(",", ""))
                        )}
            df = df.append(new_row, ignore_index=True)

        return df
    
    
    def set_shares_list(self, shares_list: list):
        """
        reset list of shares short names
        
        Keyword arguments:
        shares_list (list) -- list of shares short names
        """
        self.shares_list = shares_list


    def make_all_shares_to_csv(self):
        """
        write generated for each share (from previously defined list 
        of short share names) dataframes into csv files
        """
        for share in self.shares_list:
            df = self.get_share_dataframe(share)
            df.to_csv(share + ".csv")

In [140]:
# make history for (respectively) Tesla, Microsoft Corporation, Apple, Google, Amazon, Facebook, Netflix
scrapper = WebSharesScrapper(list_of_shares=['TSLA', 'MSFT', 'AAPL', 'GOOG', 'AMZN', 'FB', 'NFLX'], 
                             path_to_chrome_driver=os.getcwd() + "\chromedriver.exe")
scrapper.make_all_shares_to_csv()

  self.driver = webdriver.Chrome(path_to_chrome_driver)


IndexError: list index out of range