# Web Scraper - Wikipedia

This module scrapes wikipedia page containing list of S&P 500 companies 

https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#Selected_changes_to_the_list_of_S&P_500_components

And creates json file of years as keys and list of S&P500 companies tickers as values

In [3]:
import requests
from bs4 import BeautifulSoup

import time
from datetime import datetime

import json

In [16]:
class Wiki_sp500_scraper:
    """
    Scrapes wiki and saves timeseries of sp500 stocks to a json file
    """
    
    def __init__(self, json_save_path):
        self.json_save_path = json_save_path
        url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies#Selected_changes_to_the_list_of_S&P_500_components'
        
        r = requests.get(url)
        counter = 10
        while r.status_code != 200 and counter > 0:
            counter -= 1
            time.sleep(0.2)
            r = requests.get(url)
           
        self.soup = BeautifulSoup(r.content, 'html.parser')
        self.all_tickers = []
    
    def scrape_stocks_table(self):
        table = self.soup.find('table', id='constituents')
        self.tickers = []
        
        for tr in table.find_all('tr'):
            row = tr.find_all('td')
            if len(row) > 0:
                self.tickers.append(row[0].text.replace('\n', ''))
                
        self.all_tickers = self.tickers
                
    
    def scrape_changes_table(self):
        table = self.soup.find('table', id='changes')
        
        # Date, added, removed as key tickers as values
        self.changes = {}
        
        for tr in table.find_all('tr'):
            row = tr.find_all('td')
            if len(row) > 4:
                try:
                    date = datetime.strptime(row[0].text, '%B %d, %Y')
                except ValueError:
                    continue
                
                if date not in self.changes:
                    self.changes[date] = {'A': [], 'R': []}
                    
                self.changes[date]['A'].append(row[1].text)
                self.changes[date]['R'].append(row[3].text)
                self.all_tickers.append(row[1].text)
                self.all_tickers.append(row[3].text)
        
        self.all_tickers = set(self.all_tickers)
        self.all_tickers.discard(' ')
        self.all_tickers.discard('')        
                            
    
    def create_timeseries_of_sp500_stocks(self):
        self.timeseries = {}
        
        ticker_name_change = {
            'GDI': 'IR',  'BMS': 'AMCR', 'Q': 'IQV', 'KORS': 'CPRI', 'DLPH': 'APTV', 'TYC': '???',
            'SAIC': 'SAI', 'PCLN': 'BKNG', 'HRS': 'LHX', 'JEC': 'J', 'TSO': 'ANDV', 'LUK': 'JEF',
            'KFT': 'KHC'
        }
        
        end_date = max(self.changes)
        start_date = min(self.changes)
        
        sorted(self.changes)
        sp500_stocks = set(self.tickers)
        current_year = end_date.year
        
        for date in self.changes:
            year = date.year
            while year != current_year:
                sp500_stocks.discard(' ')
                sp500_stocks.discard('')
                self.timeseries[current_year] = sorted(list(sp500_stocks))
                current_year -= 1
               
            # What was added in some year need to be removed to get to the state of that year
            for added in self.changes[date]['A']:
                if added not in sp500_stocks:
                    added = ticker_name_change.get(added, "")
                    print(f'year={year}, added={added}')
                sp500_stocks.discard(added)
            for removed in self.changes[date]['R']:
                if removed in sp500_stocks:
                    print(f'year={year}, removed={removed}')
                sp500_stocks.add(removed)
                
        while current_year >= start_date.year:
            sp500_stocks.discard(' ')
            sp500_stocks.discard('')
            self.timeseries[current_year] = sorted(list(sp500_stocks))
            current_year -= 1
                
        for key, value in self.timeseries.items():
            print(f'Key: {key}, len={len(value)}')
            
            
            
    
    def save_to_json(self):
        with open("all_tickers_in_sp500.json", "w") as json_file:
            json.dump(list(self.all_tickers), json_file)
        
    
    def run(self):
        self.scrape_stocks_table()
        self.scrape_changes_table()
        self.create_timeseries_of_sp500_stocks()
        self.save_to_json()
    
    
    

In [17]:
Wiki_sp500_scraper("test.json").run()

755
year=2020, removed=NBL
year=2020, removed=ETFC
year=2020, removed=HRB
year=2020, removed=COTY
year=2020, removed=KSS
year=2020, removed=ADS
year=2020, removed=HOG
year=2020, removed=JWN
year=2020, removed=HP
year=2020, removed=CPRI
year=2020, removed=AGN
year=2020, added=
year=2020, removed=M
year=2020, removed=RTN
year=2020, removed=
year=2020, removed=XEC
year=2020, removed=WCG
year=2019, removed=AMG
year=2019, removed=TRIP
year=2019, removed=MAC
year=2019, removed=STI
year=2019, removed=VIAB
year=2019, removed=CELG
year=2019, removed=NKTR
year=2019, removed=JEF
year=2019, removed=TSS
year=2019, removed=APC
year=2019, removed=FL
year=2019, removed=RHT
year=2019, removed=LLL
year=2019, removed=MAT
year=2019, removed=FLR
year=2019, removed=BHF
year=2019, removed=GT
year=2019, removed=NFX
year=2019, removed=PCG
year=2019, removed=SCG
year=2018, removed=ESRX
year=2018, removed=COL
year=2018, removed=AET
year=2018, removed=SRCL
year=2018, removed=EQT
year=2018, removed=CA
year=2018, r