## A collection of Powerball drawings from the USA and ZAF in csv format.

I used BeautifulSoup to scrape the data from <a href="https://lotto.net">lotto.net</a>, sorted it, loaded it into a pandas DataFrame, and exported it as a CSV file.

In [None]:
import requests
from bs4 import BeautifulSoup

import pandas as pd
import numpy as np

import time
from datetime import datetime

import re

## Helper Functions

In [None]:
def simple_listsort(input_list, n=5):
    """Un-flatten a sequence of ball numbers into per-drawing groups.

    The input is cut into consecutive slices of length ``n``; each slice is
    one drawing. If the length is not a multiple of ``n`` the final group is
    simply shorter.
    """
    return [input_list[start:start + n] for start in range(0, len(input_list), n)]
    
def winner_numbering(input_list):
    """Convert drawing outcome texts into numeric flags.

    Each entry of ``input_list`` maps to:
      * 0 if the text is exactly 'Rollover!'
      * 1 if the text is exactly 'Jackpot Won!'

    Returns a list of ints with the same length and order as the input.

    Raises AssertionError for any other text. The error is raised explicitly
    (instead of via an ``assert`` statement) so the check is still performed
    when Python runs with ``-O``; otherwise an unknown outcome would be
    skipped silently and the result would no longer line up with the draws.
    """
    output_list = []

    for outcome in input_list:
        if outcome == 'Rollover!':
            output_list.append(0)
        elif outcome == 'Jackpot Won!':
            output_list.append(1)
        else:
            # Include the offending text so a change on the site is easy to diagnose.
            raise AssertionError('Unexpected outcome text: %r' % (outcome,))

    return output_list

def jackpot_cleaner(input_list):
    """Convert jackpot strings into a list of ints.

    For every text in ``input_list`` the first run of digits (optionally
    containing thousands-separator commas) is extracted, the commas are
    removed, and the result is converted to ``int``.

    Raises IndexError if a text contains no digits at all.
    """
    # The match must START with a digit. A plain '[0-9,]+' would also match a
    # lone comma that appears before the amount (e.g. 'Jackpot, $1,000'),
    # which then fails in int('').
    pattern = re.compile(r'[0-9][0-9,]*')

    output_list = []
    for text in input_list:
        # findall returns every match; the first one is the jackpot amount
        jackpot_str = pattern.findall(text)[0]
        # drop the thousands separators before converting to int
        output_list.append(int(jackpot_str.replace(',', '')))

    return output_list

def date_sorter(input_soup):
    """Split each draw's date element into a weekday and a full date.

    For every element in ``input_soup`` the weekday is read from the ``text``
    of its second child (``contents[1]``) and the date from its third child
    (``contents[2]``), with carriage returns, newlines and tabs removed.

    Returns a tuple ``(weekdays, dates)`` of two lists in input order.
    """
    weekdays = []
    dates = []
    for entry in input_soup:
        children = entry.contents
        weekdays.append(children[1].text)
        # plain string replacement instead of a regex to drop layout whitespace
        date_text = children[2]
        for junk in ('\r', '\n', '\t'):
            date_text = date_text.replace(junk, '')
        dates.append(date_text)

    return weekdays, dates

## Dataset Creator

In [None]:
def powerball_dataset(start_year=1992, current_year=None, base_url='https://www.lotto.net/powerball/numbers/',
                      n=5):
    """Scrape yearly result pages and collect every draw into a DataFrame.

    One page is requested per year from ``base_url + str(year)`` for every
    year from ``start_year`` to ``current_year`` (inclusive), pausing one
    second after each request.

    Parameters
    ----------
    start_year : int
        First year to download.
    current_year : int or None
        Last year to download (inclusive). ``None`` (the default) means the
        current calendar year, resolved when the function is called.
    base_url : str
        URL prefix to which the year is appended.
    n : int
        Number of main balls per draw; used to regroup the flat ball list.

    Returns
    -------
    pandas.DataFrame
        Columns 'DrawingNumbers', 'Powerball', 'Result', 'Jackpot',
        'Weekday' and 'Date' ('Date' is converted with ``pd.to_datetime``).

    Raises
    ------
    Whatever ``requests.get`` raises (including a timeout error if the site
    does not answer in time), plus any error from the parsing helpers.
    """
    # Resolve the default at call time: a `datetime.now().year` default in the
    # signature is evaluated once, when the function is defined, and would go
    # stale in a long-running session.
    if current_year is None:
        current_year = datetime.now().year

    drawing_numbers_list, powerball_list, winner_list = [], [], []
    jackpot_list, weekdays_list, dates_list = [], [], []

    for year in range(start_year, current_year + 1):
        year_url = base_url + str(year)
        # Without a timeout a stalled connection would block the scrape forever.
        # NOTE(review): the HTTP status is not checked; consider
        # html_pull.raise_for_status() if silently empty years are not wanted.
        html_pull = requests.get(year_url, timeout=30)
        soup = BeautifulSoup(html_pull.text, 'html.parser')

        # gathering the ball numbers: the page yields one entry per ball, so this
        # flat list is n times longer than the number of drawings
        results = soup.find_all('li', attrs={'class': 'ball ball'})
        ball_numbers = [int(result.span.text) for result in results]
        drawing_numbers = simple_listsort(ball_numbers, n=n)
        drawing_numbers_list.extend(drawing_numbers)

        # gathering the powerball numbers
        results = soup.find_all('li', attrs={'class': 'ball powerball'})
        powerballs = [int(result.span.text) for result in results]
        powerball_list.extend(powerballs)

        # TODO add power-play numbers

        # gathering the result of the drawing and converting it into numbers:
        # 0 if Rollover, 1 if Winner
        results = soup.find_all('span', attrs={'class': 'rollover'})
        winner_result = [result.text for result in results]
        winner_num = winner_numbering(winner_result)
        winner_list.extend(winner_num)

        # gathering the jackpot total
        results = soup.find_all('div', attrs={'class': 'jackpot'})
        # Workaround: each page started returning ~4 additional jackpot listings
        # at the end of the page. They are not visible in the browser or in the
        # page's HTML view, only in the requests output (possibly an ad).
        # Because these rogue jackpots appear at the end of the page, the list is
        # cut to the number of powerballs found on THIS page (not the accumulated
        # list, which also holds values from previous iterations).
        # If the extra listings disappear, this still works.
        jackpot_strs = [result.span.text for result in results][:len(powerballs)]
        jackpot_vals = jackpot_cleaner(jackpot_strs)
        jackpot_list.extend(jackpot_vals)

        # gathering the date data: the weekday and the complete date
        results = soup.find_all('div', attrs={'class': 'date'})
        weekdays, compl_dates = date_sorter(results)
        weekdays_list.extend(weekdays)
        dates_list.extend(compl_dates)

        # sleep so we don't bother the website
        time.sleep(1)

    dataset = {'DrawingNumbers': drawing_numbers_list, 'Powerball': powerball_list,
               'Result': winner_list, 'Jackpot': jackpot_list, 'Weekday': weekdays_list, 'Date': dates_list}
    df = pd.DataFrame(dataset, columns=['DrawingNumbers', 'Powerball', 'Result', 'Jackpot', 'Weekday', 'Date'])
    df['Date'] = pd.to_datetime(df['Date'])
    return df

In [None]:
if __name__ == "__main__":
    # USA Powerball: the URL is passed explicitly (it was previously assigned
    # but never used); start year and ball count use the function defaults.
    usa_url = 'https://www.lotto.net/powerball/numbers/'
    usa_df = powerball_dataset(base_url=usa_url)
    usa_df.to_csv('powerball_usa.csv', index=False)

    # South African Powerball, scraped from 2009 onwards with 5 main balls per draw.
    zaf_url = 'https://www.lotto.net/south-africa-powerball/results/'
    zaf_df = powerball_dataset(start_year=2009, base_url=zaf_url, n=5)
    zaf_df.to_csv('powerball_zaf.csv', index=False)
    
    
###
# 1996 - mid 2018 do not have jackpot values listed
###
#     aus_url = 'https://www.lotto.net/australia-powerball/results/'
#     aus_df = powerball_dataset(start_year=1996, base_url=aus_url, n=7)
#     aus_df.to_csv('powerball_aus.csv', index=False)

###
# 2 jackpot values not listed
###
#     nzl_url = 'https://www.lotto.net/new-zealand-powerball/results/'
#     nzl_df = powerball_dataset(start_year=2011, base_url=nzl_url, n=6)
#     nzl_df.to_csv('powerball_nzl.csv', index=False)