In [53]:

class SP100TickerGetter:
    """Scrape the S&P 100 constituent tickers from the English Wikipedia article.

    Parameters
    ----------
    n_tickers : int
        Upper bound on the number of table rows read by ``get_tickers``.
    """

    def __init__(self, n_tickers = 1000):
        self.n_tickers = n_tickers

    def get_tickers(self, current_date: date = None):
        '''
        Return the tickers listed in the article as of ``current_date``.

        adapted from: https://stackoverflow.com/questions/44232578/automating-getting-the-sp-500-list

        Parameters
        ----------
        current_date : datetime.date, optional
            Day of interest. ``None`` (the default) means "today", resolved
            at call time rather than when the class was defined. A
            ``datetime``/``pd.Timestamp`` is reduced to its calendar date.
            For any day other than today, the revision with the latest date
            strictly before ``current_date`` is used.

        Returns
        -------
        pd.Series or None
            First-column cell text of up to ``n_tickers`` table rows, or
            ``None`` when the table is missing or a row has no ``<td>`` cell.

        Raises
        ------
        ValueError
            If the history page lists no revision dated before
            ``current_date``.
        '''
        today = date.today()
        if current_date is None:
            current_date = today
        elif isinstance(current_date, datetime):
            # datetime (and pd.Timestamp) cannot be ordered against the plain
            # date keys parsed below, so keep only the calendar day.
            current_date = current_date.date()

        url = "https://en.wikipedia.org/wiki/S%26P_100"
        if current_date != today:
            response = requests.get("https://en.wikipedia.org/w/index.php?title=S%26P_100&offset=&limit=500&action=history")
            history = BeautifulSoup(response.text, 'html.parser')
            links = history.find_all('a', {'class': 'mw-changeslist-date'})
            # Map revision day -> permalink; when several revisions share a
            # day the link seen last wins, as in the original implementation.
            revisions = {}
            for link in links:
                revisions[datetime.strptime(link.text, "%H:%M, %d %B %Y").date()] = link["href"]
            earlier = [day for day in revisions if day < current_date]
            if not earlier:
                raise ValueError(
                    f"no S&P 100 article revision found before {current_date}"
                )
            # Every candidate is before current_date, so the closest is the max.
            nearest_in_past = max(earlier)
            url = "https://en.wikipedia.org/" + revisions[nearest_in_past]

        response = requests.get(url)
        bs_object = BeautifulSoup(response.text, 'html.parser')
        table = bs_object.find('table', {'class': 'wikitable sortable'})
        if table is None:
            return None

        tickers = []
        try:
            # Skip the header row, then stop after n_tickers rows.
            for row in table.find_all('tr')[1:][:self.n_tickers]:
                tickers.append(row.find_all('td')[0].text.strip())
        except (AttributeError, IndexError):
            # A row without any <td> cell means the layout is not the one
            # this scraper understands.
            return None

        return pd.Series(tickers)

In [54]:
# Build a scraper with the default n_tickers cap of 1000 rows.
sp100 = SP100TickerGetter()

In [55]:
# Tickers from the closest article revision dated strictly before 2010-01-01.
sp100.get_tickers(date(2010,1,1))



0        AA
1      AAPL
2       ABT
3       AEP
4       ALL
       ... 
96      WMB
97      WMT
98       WY
99      XOM
100     XRX
Length: 101, dtype: object

In [28]:
from urllib.parse import urlencode
from simple_back.price_providers import DailyDataProvider
from abc import abstractmethod
from datetime import date, datetime
from bs4 import BeautifulSoup
import pandas as pd
import requests


# NOTE(review): the import cell above brings in DailyDataProvider, not
# DataProvider; this definition relies on DataProvider already being bound in
# the kernel — confirm which base class is intended.
class WikipediaProvider(DataProvider):
    """Provider that serves a past revision of a Wikipedia article.

    Subclasses implement ``get_from_html`` to turn the revision's HTML into
    data. Caching goes through ``in_cache``/``get_cache``/``set_cache``, which
    are not defined in this class (presumably inherited from the base class).
    """

    @property
    def columns(self):
        return ["tickers"]

    @property
    def columns_order(self):
        return [0]

    def get(
        self, symbol: str, date: pd.Timestamp
    ) -> pd.DataFrame:
        """Fetch the article ``symbol`` as it looked just before ``date``.

        Parameters
        ----------
        symbol : str
            Wikipedia page title, e.g. ``'S&P_100'``.
        date : pd.Timestamp or slice
            Point in time of interest; for a slice its ``stop`` is used.

        Returns
        -------
        Whatever ``get_from_html`` returns for the selected revision.

        Raises
        ------
        ValueError
            If the history page lists no revision dated before ``date``.
        """
        title = urlencode({'title':symbol})
        hist_url = f"https://en.wikipedia.org/w/index.php?{title}&offset=&limit=500&action=history"
        history = BeautifulSoup(requests.get(hist_url).text, 'html.parser')
        links = history.find_all('a', {'class': 'mw-changeslist-date'})

        # Map revision day -> permalink. Keys are midnight pd.Timestamps so
        # they can be ordered against the pivot (plain datetime.date objects
        # are not reliably comparable with pd.Timestamp). When several
        # revisions share a day, the link seen last wins, as before.
        revisions = {}
        for link in links:
            day = datetime.strptime(link.text, "%H:%M, %d %B %Y").date()
            revisions[pd.Timestamp(day)] = link["href"]

        if isinstance(date, slice):
            date = date.stop
        pivot = pd.Timestamp(date)
        if pivot.tzinfo is not None:
            # Revision keys are tz-naive. NOTE(review): this assumes the
            # history page shows UTC times — confirm.
            pivot = pivot.tz_convert(None)

        earlier = [stamp for stamp in revisions if stamp < pivot]
        if not earlier:
            raise ValueError(f"no revision of {symbol!r} found before {pivot}")
        # Every candidate is before the pivot, so the closest is the max.
        nearest_in_past = max(earlier)
        url = "https://en.wikipedia.org/"+revisions[nearest_in_past]

        if self.in_cache(url):
            html = self.get_cache(url)
        else:
            html = requests.get(url).text
            self.set_cache(url, html)
        # Hand the plain page title to the parser, not the urlencoded
        # 'title=...' query string, so subclasses can compare against titles.
        return self.get_from_html(html, symbol)

    @abstractmethod
    def get_from_html(self, html, title):
        """Turn the HTML of one revision into this provider's data.

        Parameters
        ----------
        html : str
            Raw HTML of the selected revision.
        title : str
            Page title that was requested through ``get``.
        """
        pass

In [29]:
class SpProvider(WikipediaProvider):
    """Extract S&P index constituents from a Wikipedia article revision."""

    @staticmethod
    def _is_page(title, page):
        """Return True when ``title`` names ``page``.

        Accepts both the bare page title and the urlencoded ``title=...``
        query string, since either form may be handed to ``get_from_html``.
        """
        return title in (page, urlencode({'title': page}))

    def get_from_html(self, html, title):
        """Return the first-column cell text of the constituents table.

        Parameters
        ----------
        html : str
            Raw HTML of the article revision.
        title : str
            ``'S&P_100'`` or ``'S&P_500'`` (bare or urlencoded).

        Returns
        -------
        pd.Series or None
            One ticker per table row; ``None`` for an unsupported title, a
            missing table, or a data row without any ``<td>`` cell.
        """
        bs_object = BeautifulSoup(html, 'html.parser')
        if self._is_page(title, 'S&P_100'):
            table = bs_object.find('table', {'class': 'wikitable sortable'})
        elif self._is_page(title, 'S&P_500'):
            # The tag name is required here; passing only the attribute dict
            # would be interpreted as the tag-name filter.
            table = bs_object.find('table', {'id': 'constituents'})
        else:
            return None
        if table is None:
            return None

        tickers = []
        try:
            # The first <tr> is the header row.
            for row in table.find_all('tr')[1:]:
                tickers.append(row.find_all('td')[0].text.strip())
        except (AttributeError, IndexError):
            return None
        return pd.Series(tickers, dtype='str')

In [30]:
# Instantiate the SpProvider defined in the cell above.
sp = SpProvider()

In [32]:
# Subscript lookup for the 'S&P_500' page. __getitem__ is not defined in this
# notebook; presumably it comes from the DataProvider base class — confirm.
sp['S&P_500']

In [None]:
# clear_cache is not defined in this notebook; presumably it is inherited from
# the DataProvider base class and drops previously cached pages — confirm.
sp.clear_cache()

In [1]:
import requests
import re
from urllib.parse import urlencode
import pandas as pd
from simple_back.price_providers import DataProvider
from abc import abstractmethod
from bs4 import BeautifulSoup
from dateutil.relativedelta import relativedelta

class WikipediaProvider(DataProvider):
    """Provider that serves historical revisions of Wikipedia articles.

    Revision metadata comes from the MediaWiki ``action=query`` API and page
    HTML from ``index.php?oldid=...``. Subclasses implement ``process_html``
    and may override ``transform_symbol`` to map a symbol to a page title.
    ``in_cache``/``get_cache``/``set_cache`` and ``debug`` are not defined
    here (presumably provided by the base class).
    """

    def get_revisions(self, title):
        """Return ``(timestamp, revision id)`` pairs for one article.

        Parameters
        ----------
        title : str
            Urlencoded ``titles=...`` query fragment appended to the API URL;
            also used as the cache key.

        Returns
        -------
        list of (pd.Timestamp, str)
            Pairs in the order the API returned them.
        """
        if self.in_cache(title):
            return self.get_cache(title)

        url = "https://en.wikipedia.org/w/api.php?action=query&format=xml&prop=revisions&rvlimit=500&" + title
        revisions = []
        next_params = ''
        # Follow the rvcontinue token until the API stops sending one.
        while True:
            response = requests.get(url + next_params).text
            revisions += re.findall('<rev [^>]*>', response)
            cont = re.search('<continue rvcontinue="([^"]+)"', response)
            if not cont:
                break
            next_params = "&rvcontinue=" + cont.group(1)

        results = []
        for rev in revisions:
            # Match whole attribute names: a bare 'id="' pattern also matches
            # parentid and only finds revid when it is the first attribute.
            stamp = re.search(r'\btimestamp="([^"]+)"', rev)
            rev_id = re.search(r'\brevid="([^"]+)"', rev)
            if stamp is None or rev_id is None:
                continue
            results.append((pd.Timestamp(stamp.group(1)), rev_id.group(1)))

        # Third argument forwarded unchanged; presumably a cache lifetime —
        # confirm against DataProvider.set_cache.
        self.set_cache(title, results, 1)
        return results

    @property
    def name(self):
        return 'Wikipedia Provider'

    def _page_title(self, symbol):
        """Resolve ``symbol`` to a page title, keeping it if the transform yields None."""
        new_symbol = self.transform_symbol(symbol)
        return symbol if new_symbol is None else new_symbol

    def get(
        self, datetime: pd.Timestamp, symbol: str
    ):
        """Return processed data from the first listed revision at or before ``datetime``.

        Parameters
        ----------
        datetime : pd.Timestamp
            Point in time of interest. A tz-naive value is interpreted as
            UTC, because the parsed revision timestamps are tz-aware and a
            naive/aware comparison would raise ``TypeError``.
        symbol : str
            Symbol handed to ``transform_symbol`` and ``process_html``.

        Returns
        -------
        The result of ``process_html`` for the first revision in
        ``get_revisions`` order whose timestamp is ``<= datetime``, or
        ``None`` when there is none.
        """
        when = pd.Timestamp(datetime)
        if when.tzinfo is None:
            when = when.tz_localize('UTC')

        page = self._page_title(symbol)
        titles = urlencode({'titles':page})
        title = urlencode({'title':page})
        for stamp, rev_id in self.get_revisions(titles):
            if stamp > when:
                continue
            if self.debug:
                print(stamp)
            cache_key = title+rev_id
            if self.in_cache(cache_key):
                html = self.get_cache(cache_key)
            else:
                url = f'https://en.wikipedia.org/w/index.php?{title}&oldid={rev_id}'
                if self.debug:
                    print(url)
                html = requests.get(url).text
                self.set_cache(cache_key,html)
            return self.process_html(html, symbol)
        return None

    def dates(self, symbol):
        """Return the revision timestamps of ``symbol``'s page in reverse ``get_revisions`` order."""
        titles = urlencode({'titles':self._page_title(symbol)})
        revs = [stamp for stamp, _ in self.get_revisions(titles)]
        revs.reverse()
        return revs

    def transform_symbol(self, symbol):
        """Map a symbol to a Wikipedia page title; the default is the identity."""
        return symbol

    @abstractmethod
    def process_html(self, html, symbol):
        """Turn one revision's HTML into data for ``symbol`` (subclass hook)."""
        pass

In [2]:
class SpProvider(WikipediaProvider):
    """Extract S&P 100 / S&P 500 constituent tickers from Wikipedia revisions."""

    def transform_symbol(self, symbol):
        """Map ``'S&P_500'`` to its list article; other symbols pass through."""
        if symbol == 'S&P_500':
            return "List_of_S&P_500_companies"
        return symbol

    def process_html(self, html, symbol):
        """Return the ticker column of the constituents table.

        Parameters
        ----------
        html : str
            Raw HTML of the article revision.
        symbol : str
            ``'S&P_100'`` or ``'S&P_500'``.

        Returns
        -------
        pd.Series or None
            One ticker per table row; ``None`` for an unsupported symbol, a
            missing table, or a data row with too few ``<td>`` cells.
        """
        bs_object = BeautifulSoup(html, 'html.parser')
        td_i = 0  # index of the <td> holding the ticker
        if symbol == 'S&P_100':
            table = bs_object.find('table', {'class': 'wikitable sortable'})
        elif symbol == 'S&P_500':
            # Try progressively looser selectors for the constituents table.
            table = bs_object.find('table', {'id': 'constituents'})
            if table is None:
                table = bs_object.find('table', {'class': 'wikitable sortable'})
            if table is None:
                table = bs_object.find('table', {'class': 'wikitable'})
                # For the plain 'wikitable' fallback the ticker is read from
                # the second cell.
                td_i = 1
        else:
            return None
        if table is None:
            return None

        try:
            # The first <tr> is the header row.
            tickers = [
                row.find_all('td')[td_i].text.strip()
                for row in table.find_all('tr')[1:]
            ]
        except (AttributeError, IndexError):
            return None
        return pd.Series(tickers)

In [9]:
# Instantiate the SpProvider defined in the cell above.
sp = SpProvider()