# The Beermile

## Retrieve Data

In [None]:
import sys
import requests
import numpy as np
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen

In [None]:
url_base = 'https://www.beermile.com/display/'
max_event_id = 151381

In [None]:
%%time

# Find which events are actually races
is_event = 0
not_event = 0


# Look for all events by numerical ID
for i in range(1, max_event_id):
    # Website uses sequential 
    event = f'event_{i}'
    url = f'{url_base}{event}'
    result = requests.get(url)
    page = urlopen(result.url).read()
    a = bs(page, 'html.parser')
    
    eventType = False
    if (a.title.string == 'Race Not Available'):
        not_event += 1
        with open('not_events.txt', 'a') as g:
            g.write(f'{i}\n')
    else:
        is_event += 1
        eventType = True
        with open('events.txt', 'a') as f:
            f.write(f'{i}\n')
            
    print(f'\r #{i:06d} Events: {is_event:<6} - Not Events: {not_event:<6}', end='') 
    sys.stdout.flush()


## Clean Data Into DataFrame

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib.request import urlopen

In [4]:
def clean_soup_list(soup_list):
    clean_soup_list = [s.text.strip() for s in soup_list]
    return clean_soup_list

def get_new_cols(headers, columns=[]):
    '''
    Use headers (from a table) Soup to find columns not already defined  
    '''
    new_columns = []
    headers = clean_soup_list(headers)
    for col in headers:
        if col not in columns:
            new_columns.append(col)
    return new_columns

def get_data_rows(event_rows):
    data = []
    is_header = True
    for row in event_rows:
        # Skip over first row (since it's the header)
        if (is_header):
            is_header = False
        else:
            cols = row.find_all('td')
            # TODO: Determine how to "clean" string
            cols = [ele.text.strip() for ele in cols]
            row = [ele for ele in cols]
            data.append(row)
    return data

def add_soup_to_df(events_soup, df):
    '''
    Add data (and potentially new columns) from Soup to df
    '''
    # Add new columns to df
    columns = df.columns
    headers = events_soup.find_all('tr')[0].find_all('td')
    new_columns = get_new_cols(headers, columns)
    for c in new_columns:
        df[c] = np.nan
    # Add data to df
    data_rows = events_soup.find_all('tr')
    new = get_data_rows(data_rows)
    df =df.append(
        pd.DataFrame(new, columns=clean_soup_list(headers)),
        sort=False
    )
    
    return df

### Iterate through events list to get data into dataframe

In [5]:
count = 0
url_event_base = 'https://www.beermile.com/display/event_'
cols = []

df = pd.DataFrame()

for n_event in open('events.txt'):
    url = f'{url_event_base}{n_event}'
    page = urlopen(url).read()
    soup = bs(page, 'html.parser')
    
    df = add_soup_to_df(soup,df)
    # TEST: Only go through 5 results
    count += 1
    if count >5:
        break

Unnamed: 0,Unnamed: 1,Name,Time,Gender,Beer,Age,Notes
0,1,Tyler Murphy,9:01,M,,,
1,2,Neil O'Reilly,10:02,M,,,
2,3,Chris McKie,10:03,M,,,
3,4,Curtis Gadula,11:13,M,,,
4,5,Andy Guitard,11:44,M,,,


In [6]:
# Display DataFrame created
display(df.head())
df.describe()

Unnamed: 0,Unnamed: 1,Name,Time,Gender,Beer,Age,Notes
count,55,55,55,55,55.0,5,33
unique,28,50,52,3,2.0,4,8
top,1,Bruce Perlow,8:23,M,,22,Wesleyan
freq,5,2,2,45,54.0,2,10
