# The Beermile

## Retrieve Data

In [None]:
import sys
import requests
import numpy as np
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen

In [None]:
# Root URL shared by every event page; the event identifier is appended to it
url_base = "https://www.beermile.com/display/"
# Bound on the sequential event IDs probed by the scan loop
max_event_id = 151_381

In [None]:
%%time

# Find which events are actually races
# Running tallies: IDs that resolved to a race page vs. the placeholder page
is_event = 0
not_event = 0


# Look for all events by numerical ID
# The bound is named as the maximum ID, so it has to be probed as well
for i in range(1, max_event_id + 1):
    # Website uses sequential integer IDs in its event URLs
    event = f'event_{i}'
    url = f'{url_base}{event}'
    # Download each page exactly once and reuse the body; the timeout keeps a
    # stalled connection from hanging a scan of this many requests forever
    result = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of classifying an error page
    result.raise_for_status()
    page = result.content
    a = bs(page, 'html.parser')

    # A page without a <title> cannot be the placeholder page, so it is
    # classified like any other page rather than raising AttributeError
    title = a.title.string if a.title is not None else None
    if title == 'Race Not Available':
        not_event += 1
        # Append immediately so progress survives an interrupted run
        with open('not_events.txt', 'a') as g:
            g.write(f'{i}\n')
    else:
        is_event += 1
        with open('events.txt', 'a') as f:
            f.write(f'{i}\n')

    # '\r' rewrites the same console line to show live progress
    print(
        f'\r #{i:06d} Events: {is_event:<6} - Not Events: {not_event:<6}',
        end='',
        flush=True,
    )


## Clean Data Into DataFrame

In [None]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen

In [None]:
def clean_soup_list(soup_list):
    '''
    Return the text of each element in soup_list with surrounding whitespace removed
    '''
    return [node.text.strip() for node in soup_list]

def get_new_cols(headers, columns=None):
    '''
    Use headers (from a table) Soup to find columns not already defined

    headers: iterable of Soup elements whose stripped text is a column name
    columns: collection of already known column names (None means none are known)

    Returns the unknown column names in header order, each listed once
    '''
    # Compare against None explicitly: a pandas Index has no single truth
    # value, so `columns or []` would raise for the df.columns callers pass in
    known = [] if columns is None else columns
    new_columns = []
    for col in clean_soup_list(headers):
        # Second check keeps a header repeated in the table from being
        # reported as new more than once
        if col not in known and col not in new_columns:
            new_columns.append(col)
    return new_columns

def get_data_rows(event_rows):
    '''
    Return the stripped cell text of every row after the first, as a list of lists
    '''
    remaining = iter(event_rows)
    # Discard the first row (since it's the header); no-op on empty input
    next(remaining, None)
    # TODO: Determine how to "clean" string
    return [
        [cell.text.strip() for cell in tr.find_all('td')]
        for tr in remaining
    ]

def add_soup_to_df(events_soup, df):
    '''
    Add data (and potentially new columns) from Soup to df

    events_soup: parsed page; its first <tr> supplies the column names and
        the remaining <tr> elements supply the data rows
    df: DataFrame accumulated so far (left unmodified)

    Returns a new DataFrame with the page's rows added below df's rows, or
    df itself when the page contains no <tr> at all
    '''
    # Query the rows once and reuse them for both the header and the data
    rows = events_soup.find_all('tr')
    if not rows:
        # Nothing to add; also avoids an IndexError on rows[0]
        return df

    headers = rows[0].find_all('td')
    event_df = pd.DataFrame(
        get_data_rows(rows),
        columns=clean_soup_list(headers)
    )
    # DataFrame.append was removed in pandas 2.0, so concat is used instead.
    # concat takes the union of both column sets and fills the gaps with NaN,
    # so new columns need no pre-creation and the caller's df is not mutated.
    return pd.concat([df, event_df], sort=False)

### Iterate through events list to get data into dataframe

In [None]:
count = 0
# TEST: cap on how many events are ingested while experimenting
max_test_events = 5
url_event_base = 'https://www.beermile.com/display/event_'

df = pd.DataFrame()

# events.txt holds one event ID per line; the context manager closes the file
with open('events.txt') as events_file:
    for n_event in events_file:
        # Drop the trailing newline so it does not end up inside the URL
        n_event = n_event.strip()
        if not n_event:
            continue
        url = f'{url_event_base}{n_event}'
        # Close each HTTP response as soon as its body has been read
        with urlopen(url) as response:
            page = response.read()
        soup = bs(page, 'html.parser')

        df = add_soup_to_df(soup, df)
        # TEST: Only go through 5 results
        count += 1
        if count >= max_test_events:
            break

In [None]:
# Display DataFrame created
# `display` is not imported in this notebook; presumably the interactive
# environment provides it
display(df.head())
# Left as the cell's final bare expression; presumably the notebook renders it
df.describe()

### Process data in parallel (experiment)

In [None]:
from multiprocessing import Pool

In [None]:
def parallel_df(event_ids, max_events=5):
    '''
    Download the given event pages and collect their tables into one DataFrame

    event_ids: iterable of event IDs; surrounding whitespace (such as the
        newline kept when reading lines from a file) is ignored
    max_events: TEST limit on how many IDs are processed; None removes the limit

    Returns a DataFrame holding the rows of every processed event
    '''
    url_event_base = 'https://www.beermile.com/display/event_'

    df = pd.DataFrame()

    processed = 0
    for n_event in event_ids:
        # TEST: Only go through max_events results
        if max_events is not None and processed >= max_events:
            break
        # Remove whitespace so a trailing newline never reaches the URL
        n_event = str(n_event).strip()
        if not n_event:
            continue
        url = f'{url_event_base}{n_event}'
        # Close each HTTP response as soon as its body has been read
        with urlopen(url) as response:
            page = response.read()
        soup = bs(page, 'html.parser')

        df = add_soup_to_df(soup, df)
        processed += 1

    return df

In [None]:
# Load every event ID (one per line), closing the file afterwards and dropping
# the newline / blank lines so the IDs can be placed directly into URLs
with open('events.txt') as events_file:
    all_events = [line.strip() for line in events_file if line.strip()]

In [None]:
# TODO: Read from file

# Number of event IDs handed to each worker task, and number of tasks
process_n = 5
n_chunks = 4
chunks = [
    all_events[k * process_n:(k + 1) * process_n]
    for k in range(n_chunks)
]

# Leaving the `with` block terminates the pool, so every result is collected
# inside it; this also releases the worker processes when a task fails
with Pool() as pool:
    pending = [pool.apply_async(parallel_df, [chunk]) for chunk in chunks]
    frames = [job.get(timeout=20) for job in pending]

# DataFrame.append was removed in pandas 2.0; a single concat stacks the
# per-chunk frames in submission order
df = pd.concat(frames, sort=False)

In [None]:
# Show summary statistics of the DataFrame assembled from the worker results
display(df.describe())