# Scraping info on stock splits using fake_useragent and ThreadPoolExecutor

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import pandas as pd
from fake_useragent import UserAgent

In [2]:
# One shared HTTP session, reused by every request made in this notebook.
# requests.session() is a deprecated alias kept only for backwards compatibility;
# requests.Session() is the supported constructor.
session = requests.Session()
session.proxies = {}  # no explicit proxies configured on the session

# Sanity check: print the origin IP that httpbin sees for this session.
# The explicit timeout keeps the cell from hanging forever if httpbin is unreachable.
r = session.get("http://httpbin.org/ip", timeout=10)
print(r.text)

{
  "origin": "46.216.27.102, 46.216.27.102"
}



In [10]:
# It turned out that a fake User-Agent is enough; routing through Tor with rotating IPs is not necessary.
# !service tor start

# session = requests.session()
# session.proxies = {}

# session.proxies['http'] = 'socks4://localhost:9051'
# session.proxies['https'] = 'socks4://localhost:9051'

# r = session.get("http://httpbin.org/ip")
# print(r.text)

In [3]:
# names=['ticker'] makes pandas treat the first line as data (no header row) and
# labels the column 'ticker'.
# dtype=str + keep_default_na=False stop pandas from converting symbols such as
# "NA", "NAN" or "NULL" into float NaN, which would fail on `.lower()` in the scraper.
ticker_df = pd.read_csv("tickers.csv", names=['ticker'], dtype=str, keep_default_na=False)
tickers = ticker_df['ticker'].tolist()
print(len(tickers))

13502


13,502 tickers is a lot. Scraping them sequentially, one after another, takes a couple of hours.

In [4]:
def get_splits_data(t: str) -> "pd.DataFrame | None":
    """Scrape the stock-split history of a single ticker from stocksplithistory.com.

    Parameters
    ----------
    t : str
        Ticker symbol in any letter case; it is lower-cased for the URL and the
        page-text search, and upper-cased again in the result.

    Returns
    -------
    pd.DataFrame or None
        One row per split with columns ``ticker`` (upper-case symbol), ``date``
        (the ``dd/dd/dddd`` string exactly as it appears on the page), ``now`` and
        ``was`` (the two sides of the "<now> for <was>" ratio, as floats).
        ``None`` when the page has no "<ticker> split history ... stock splits"
        section, or when that section lists no splits.

    Raises
    ------
    Whatever ``session.get`` raises (e.g. on timeout or refused connection) is
    propagated unchanged, so the caller can detect the failure and retry.
    """
    ticker = t.lower()
    # "<date> <now> for <was>": groups 0, 1 and 3 are consumed below;
    # group 2 is just the " for " separator of the split ratio.
    regex_split_info = re.compile(r"(\d{2}/\d{2}/\d{4})\s*([\d.]+?)(\s*for\s*)(\s*[\d.]+)")

    # A new UserAgent().chrome string is generated for every request.
    r = session.get("https://www.stocksplithistory.com/{}/".format(ticker), timeout=10,
                    headers={'User-Agent': UserAgent().chrome})

    soup = BeautifulSoup(r.text, 'html.parser')

    # Flatten the page to plain text; find_all(string=True) is the current spelling
    # of the deprecated findAll(text=True).
    all_text = ' '.join(soup.find_all(string=True)).replace('\n\n', '')

    # The ticker is data, not a pattern: escape it so symbols containing regex
    # metacharacters (".", "^", "+", ...) are matched literally.
    section_pattern = r"{}\s*split\s*history\s*.*?stock\s*splits".format(re.escape(ticker))
    section = re.search(section_pattern, all_text.lower(), flags=re.DOTALL)
    if section is None:
        return None  # no info on this ticker at all

    splits = regex_split_info.findall(section.group(0))
    if not splits:
        return None  # section exists but lists no "<x> for <y>" entries

    return pd.DataFrame(
        [(ticker.upper(), x[0], float(x[1]), float(x[3])) for x in splits],
        columns=['ticker', 'date', 'now', 'was'],
    )

In [5]:
# Try the scraper on one ticker before launching the full scrape.
get_splits_data("AAPL")

Unnamed: 0,ticker,date,now,was
0,AAPL,06/16/1987,2.0,1.0
1,AAPL,06/21/2000,2.0,1.0
2,AAPL,02/28/2005,2.0,1.0
3,AAPL,06/09/2014,7.0,1.0


In [6]:
from concurrent.futures import ThreadPoolExecutor
from time import sleep

In [7]:
%%time
data = None
while 1:
    with ThreadPoolExecutor(max_workers=70) as pool:  # arbitrary value - too much causes problems as well
        future = [pool.submit(get_splits_data, ticker) for ticker in tickers]
    
    try:
        data_temp = pd.concat([x.result() for x in future if x.done()
                        and not x.exception()
                        and x.result() is not None], ignore_index=True)  # concatenation of data for tickers
    except ValueError:
        data_temp = None
    
    if data_temp is not None:
        if data is None:
            data = data_temp
        else:
            data = pd.concat([data, data_temp], ignore_index=True)
        
    number_of_runs_with_errors = len([x.exception() for x in future if x.exception()])  # timeout errors, connection refuses
    print(number_of_runs_with_errors)
    if number_of_runs_with_errors == 0:
        break
        
    errored = list(filter(lambda x: x[0] is not None, zip([x.exception() for x in future], tickers)))
    tickers = [x[1] for x in errored]  # new tickers list - those that with errors. Scraping them once again
    sleep(10)

10
1
1
1
0
CPU times: user 15min 24s, sys: 2min 42s, total: 18min 7s
Wall time: 17min 16s


In [8]:
# (rows, columns) of the combined result; each row is one split of one ticker
data.shape

(8246, 4)

In [9]:
# Number of distinct tickers that appear in the result
data['ticker'].unique().shape

(3446,)

This result is actually pretty nice: the whole scrape finishes in under 20 minutes.