# Web Scraper - Wikipedia

This module scrapes the Wikipedia page containing the list of S&P 500 companies:

https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#Selected_changes_to_the_list_of_S&P_500_components

It then creates a JSON file with years as keys and lists of S&P 500 company tickers as values.

In [3]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import time
from datetime import datetime

In [50]:
class Wiki_sp500_scraper:
    """
    Scrape Wikipedia's "List of S&P 500 companies" page.

    The page is downloaded and parsed once, in the constructor.  The
    ``scrape_*`` methods then read two tables out of the parsed page:

    * ``id='constituents'`` -> ``self.tickers`` (first cell of every row)
    * ``id='changes'`` -> ``self.changes_table`` (added / removed tickers
      grouped by date)

    Building the time series and writing the JSON file are not implemented
    yet; ``json_save_path`` is only stored on the instance.
    """

    _MAX_RETRIES = 10        # extra attempts after the first request
    _RETRY_DELAY = 0.2       # seconds to wait between attempts
    _TIMEOUT = 10            # seconds before a single request gives up
    _DATE_FORMAT = '%B %d, %Y'   # e.g. "October 12, 2020"

    def __init__(self, json_save_path):
        """
        Download and parse the Wikipedia page.

        Args:
            json_save_path: Destination for the JSON output.  Stored as
                ``self.json_save_path``; nothing in this class reads it yet.

        Raises:
            requests.HTTPError: If the final response still carries an
                HTTP error status after all retries.
        """
        self.json_save_path = json_save_path
        url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#Selected_changes_to_the_list_of_S&P_500_components'

        r = requests.get(url, timeout=self._TIMEOUT)
        for _ in range(self._MAX_RETRIES):
            if r.status_code == 200:
                break
            time.sleep(self._RETRY_DELAY)
            r = requests.get(url, timeout=self._TIMEOUT)

        # Fail loudly instead of parsing the body of an error response.
        r.raise_for_status()
        self.soup = BeautifulSoup(r.content, 'html.parser')

    @staticmethod
    def _cell_text(cell):
        """Return the text of a table cell without newlines or outer spaces."""
        return cell.text.replace('\n', '').strip()

    def _find_table(self, table_id):
        """Return the ``<table>`` with the given id, or raise ``ValueError``."""
        table = self.soup.find('table', id=table_id)
        if table is None:
            raise ValueError(
                f"No table with id={table_id!r} found in the scraped page"
            )
        return table

    def scrape_stocks_table(self) -> list:
        """
        Collect the first cell of every data row of the 'constituents' table.

        Rows without any ``<td>`` cell are skipped.  The result is stored in
        ``self.tickers`` and also returned.

        Raises:
            ValueError: If the page has no table with ``id='constituents'``.
        """
        table = self._find_table('constituents')
        self.tickers = []

        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if cells:
                self.tickers.append(self._cell_text(cells[0]))

        return self.tickers

    def scrape_changes_table(self) -> dict:
        """
        Group added and removed tickers of the 'changes' table by date.

        For every row with more than four ``<td>`` cells, cell 0 is parsed
        as a date (``'%B %d, %Y'``), cell 1 is recorded under ``'A'`` and
        cell 3 under ``'R'``.  Blank cells are kept as empty strings so the
        two lists of a date stay the same length.

        The result, ``{datetime: {'A': [...], 'R': [...]}}``, is stored in
        ``self.changes_table`` and also returned.

        Raises:
            ValueError: If the page has no table with ``id='changes'``.
        """
        table = self._find_table('changes')
        self.changes_table = {}

        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if len(cells) <= 4:
                # Too few <td> cells to hold date + added + removed columns.
                continue

            try:
                date = datetime.strptime(
                    self._cell_text(cells[0]), self._DATE_FORMAT
                )
            except ValueError:
                # Skip only this row: one unparseable date must not throw
                # away every row that follows it.
                continue

            entry = self.changes_table.setdefault(date, {'A': [], 'R': []})
            entry['A'].append(self._cell_text(cells[1]))
            entry['R'].append(self._cell_text(cells[3]))

        return self.changes_table

    def create_timeseries_of_sp500_stocks(self):
        """Placeholder: not implemented yet, currently does nothing."""
        # TODO: build the time series from self.tickers / self.changes_table.
        pass

    def save_to_json(self):
        """Placeholder: not implemented yet, currently does nothing."""
        # TODO: write the result to self.json_save_path.
        pass

    def run(self):
        """Run both scrapers, then the (still empty) build and save steps."""
        self.scrape_stocks_table()
        self.scrape_changes_table()
        self.create_timeseries_of_sp500_stocks()
        self.save_to_json()
    
    
    

In [51]:
# Build a scraper (the constructor downloads the page) and parse only the
# changes table; the save path is not read by this call.
scraper = Wiki_sp500_scraper("dsadsa")
scraper.scrape_changes_table()

{datetime.datetime(2020, 10, 12, 0, 0): {'A': [''], 'R': ['NBL']}, datetime.datetime(2020, 10, 9, 0, 0): {'A': ['VNT'], 'R': ['']}, datetime.datetime(2020, 10, 7, 0, 0): {'A': ['POOL'], 'R': ['ETFC']}, datetime.datetime(2020, 9, 21, 0, 0): {'A': ['ETSY', 'TER', 'CTLT'], 'R': ['HRB', 'COTY', 'KSS']}, datetime.datetime(2020, 6, 22, 0, 0): {'A': ['BIO', 'TDY', 'TYL'], 'R': ['ADS', 'HOG', 'JWN']}, datetime.datetime(2020, 5, 22, 0, 0): {'A': ['WST'], 'R': ['HP']}, datetime.datetime(2020, 5, 12, 0, 0): {'A': ['DPZ', 'DXCM'], 'R': ['CPRI', 'AGN']}, datetime.datetime(2020, 4, 6, 0, 0): {'A': ['', ''], 'R': ['M', 'RTN']}, datetime.datetime(2020, 4, 3, 0, 0): {'A': ['OTIS', 'CARR'], 'R': ['', '']}, datetime.datetime(2020, 3, 3, 0, 0): {'A': ['GDI'], 'R': ['XEC']}, datetime.datetime(2020, 1, 28, 0, 0): {'A': ['PAYC'], 'R': ['WCG']}, datetime.datetime(2019, 12, 23, 0, 0): {'A': ['LYV', 'ZBRA', 'STE'], 'R': ['AMG', 'TRIP', 'MAC']}, datetime.datetime(2019, 12, 9, 0, 0): {'A': ['ODFL'], 'R': ['STI']}