In [178]:
# Setting up dependencies to be utilized later here.
import pandas as pd
from pathlib import Path

import matplotlib.pyplot as plt
import scipy.stats as st
import numpy as np


#We can add more dependencies... or we can remove dependencies we haven't
#utilized when we finish the project.

In [187]:
# Relative location of the 2020 draw history (a CSV file under Resources/).
wa_lottery_path_2020 = Path("Resources/2020_lottery_data.csv")

# Load the CSV into a DataFrame; the analysis cells below read from this frame.
wa_lottery_df_2020 = pd.read_csv(filepath_or_buffer=wa_lottery_path_2020)

# Leave the frame as the cell's last expression so the notebook renders a preview of it.
wa_lottery_df_2020

Unnamed: 0,date,weekday,winning_numbers,powerball,powerplay,jackpot
0,1/1/2020,Wed,49-53-57-59-62,26,2,220000000
1,1/4/2020,Sat,1-11-21-25-54,7,2,237000000
2,1/8/2020,Wed,2-4-7-43-56,22,4,258000000
3,1/11/2020,Sat,3-21-23-31-59,3,2,277000000
4,1/15/2020,Wed,39-41-53-55-68,19,2,296000000
...,...,...,...,...,...,...
100,12/16/2020,Wed,4-23-37-61-67,7,2,287000000
101,12/19/2020,Sat,27-32-34-43-52,13,2,304000000
102,12/23/2020,Wed,6-13-38-39-53,6,3,321000000
103,12/26/2020,Sat,10-24-27-35-53,18,2,341000000


In [213]:
# Keep just the draw weekday and the Powerball number from the full 2020 frame.
weekday_powerball_2020 = wa_lottery_df_2020.loc[:, ['weekday', 'powerball']]

# Tally, per weekday, how often each Powerball number was drawn.  The result is a
# Series of counts indexed by (weekday, powerball).
powerball_grouped_2020 = (
    weekday_powerball_2020
    .groupby(by=['weekday'])['powerball']
    .value_counts()
)

# For every weekday, pick the index label that carries the largest count.  Because the
# counts are indexed by (weekday, powerball), each entry is a (weekday, number) tuple.
powerball_common_2020 = powerball_grouped_2020.groupby(level='weekday').idxmax()

powerball_common_2020

weekday
Sat    (Sat, 18)
Wed     (Wed, 2)
Name: powerball, dtype: object

In [259]:
# Keep just the draw weekday and the Powerplay value from the full 2020 frame.
weekday_powerplay_2020 = wa_lottery_df_2020.loc[:, ['weekday', 'powerplay']]

# Tally, per weekday, how often each Powerplay value was drawn.  The result is a
# Series of counts indexed by (weekday, powerplay).
powerplay_grouped_2020 = (
    weekday_powerplay_2020
    .groupby(by=['weekday'])['powerplay']
    .value_counts()
)

# For every weekday, pick the index label that carries the largest count.  Because the
# counts are indexed by (weekday, powerplay), each entry is a (weekday, value) tuple.
powerplay_common_2020 = powerplay_grouped_2020.groupby(level='weekday').idxmax()

powerplay_common_2020

weekday
Sat    (Sat, 2)
Wed    (Wed, 2)
Name: powerplay, dtype: object

In [183]:
# Horizontal bar chart of most_played_df, with the y axis labelled 'weekday'.
# NOTE(review): most_played_df is not created in any of the cells shown above this one;
# confirm it is defined before this cell when the notebook is run top to bottom.
data = most_played_df.plot.barh(ylabel='weekday', figsize=(10, 5))
data.set_xlabel('Most Played Number')

# Annotate every bar with its value (no decimals), placed at the bar's outer edge.
for bar_group in data.containers:
    data.bar_label(bar_group, fmt='%.0f', label_type='edge')

# Add horizontal padding -- presumably so the edge labels have room past the longest bar.
data.margins(x=0.1)

# Anchor the legend's upper-left corner at (1, 1.02) in axes coordinates.
data.legend(title='Lottery', bbox_to_anchor=(1, 1.02), loc='upper left')

In [256]:
# Work on a copy that holds only the draw date and the Powerball number, so adding
# the 'month' column below never modifies (or warns about) the original frame.
date_powerball_2020 = wa_lottery_df_2020[['date', 'powerball']].copy()

# Parse the 'date' strings (mm/dd/yyyy in the CSV) into datetime values.
# [https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html]
date_powerball_2020['date'] = pd.to_datetime(date_powerball_2020['date'])

# Derive the calendar month number (January == 1, ..., December == 12).
# [https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.html]
date_powerball_2020['month'] = date_powerball_2020['date'].dt.month

# Group the draws by month number.
month = date_powerball_2020.groupby('month')

# Most frequently drawn Powerball number for each month, keyed by month number.
# NOTE: this rebinds powerball_common_2020, which an earlier cell uses for the
# per-weekday result; after this cell runs the name refers to this dictionary.
powerball_common_2020 = {}

# Iterate over the months that actually occur in the data (in ascending order)
# instead of hard-coding 1-12, so a month without draws cannot raise a KeyError.
for i, month_data in month:

    # Series.mode() returns the most frequent value(s) sorted ascending, so .iloc[0]
    # is the most common number, and the smallest one when several numbers tie.
    # (The previous .max() returned the largest number drawn, not the most common.)
    powerball_common_2020[i] = month_data['powerball'].mode().iloc[0]

# Turn the {month: number} dictionary into a one-column DataFrame indexed by month.
# [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html]
powerball_common_months_2020 = pd.DataFrame.from_dict(powerball_common_2020, orient='index', columns=['Common Powerball'])

Across the 12 months, three Powerball numbers came up most frequently: 18, 5, and 4.  Choosing one of these numbers increases your odds of winning at most $4, even if your other winning numbers don't match.

In [254]:
# Work on a copy that holds only the draw date and the Powerplay value, so adding
# the 'month' column below never modifies (or warns about) the original frame.
date_powerplay_2020 = wa_lottery_df_2020[['date', 'powerplay']].copy()

# Parse the 'date' strings (mm/dd/yyyy in the CSV) into datetime values.
# [https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html]
date_powerplay_2020['date'] = pd.to_datetime(date_powerplay_2020['date'])

# Derive the calendar month number (January == 1, ..., December == 12).
# [https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.html]
date_powerplay_2020['month'] = date_powerplay_2020['date'].dt.month

# Group the draws in date_powerplay_2020 by month number.
month = date_powerplay_2020.groupby('month')

# Most frequently drawn Powerplay value for each month, keyed by month number.
# NOTE: this rebinds powerplay_common_2020, which an earlier cell uses for the
# per-weekday result; after this cell runs the name refers to this dictionary.
powerplay_common_2020 = {}

# Iterate over the months that actually occur in the data (in ascending order)
# instead of hard-coding 1-12, so a month without draws cannot raise a KeyError.
for i, month_data in month:

    # Series.mode() returns the most frequent value(s) sorted ascending, so .iloc[0]
    # is the most common value, and the smallest one when several values tie.
    # (The previous .max() returned the largest value drawn, not the most common.)
    powerplay_common_2020[i] = month_data['powerplay'].mode().iloc[0]

# Turn the {month: value} dictionary into a one-column DataFrame indexed by month.
# [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html]
powerplay_common_months_2020 = pd.DataFrame.from_dict(powerplay_common_2020, orient='index', columns=['Common Powerplay'])


In [None]:
# Plot bar chart of some sort



It appears that, during 2020, choosing either 2 or 3 as your Powerplay number would at least have enabled some non-jackpot prize winnings.

In [260]:
# Build a working frame from the original data: assign() returns a new DataFrame, so
# the original stays untouched.  'date' is re-parsed from its mm/dd/yyyy strings into
# datetimes, and 'month' holds the calendar month number (January == 1, ...).
# [https://pandas.pydata.org/docs/reference/api/pandas.to_datetime.html]
# [https://pandas.pydata.org/docs/reference/api/pandas.Series.dt.html]
month_lottery_df_2020 = wa_lottery_df_2020.assign(
    date=lambda frame: pd.to_datetime(frame['date']),
    month=lambda frame: frame['date'].dt.month,
)

# Group the draws in month_lottery_df_2020 by month number.
month = month_lottery_df_2020.groupby('month')

# Largest jackpot seen in each month, keyed by month number 1-12.
# get_group raises a KeyError if a month has no draws.
# NOTE(review): the column label below says 'Common', but the value stored is the
# monthly maximum -- confirm which one is intended.
jackpot_common_2020 = {
    month_number: month.get_group(month_number)['jackpot'].max()
    for month_number in range(1, 13)
}

# Turn the {month: jackpot} dictionary into a one-column DataFrame indexed by month.
# [https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.from_dict.html]
jackpot_common_months_2020 = pd.DataFrame.from_dict(
    jackpot_common_2020,
    orient='index',
    columns=['Common Jackpot'],
)


It appears that November, December, and January are the highest-value months to win the jackpot.  Possibly people have more spending money because of holiday bonuses leading into the holiday season, so more money is poured into the gambling system.

In [None]:
# Independent copy of the original DataFrame.
# NOTE(review): jackpot_lottery_df_2020 is not used by the rest of this cell, which
# repeats the weekday/Powerplay tally -- presumably a jackpot analysis was intended
# here; confirm before relying on this cell.
jackpot_lottery_df_2020 = wa_lottery_df_2020.copy()

# Keep just the draw weekday and the Powerplay value from the full 2020 frame.
weekday_powerplay_2020 = wa_lottery_df_2020.loc[:, ['weekday', 'powerplay']]

# Tally, per weekday, how often each Powerplay value was drawn.
powerplay_grouped_2020 = (
    weekday_powerplay_2020
    .groupby(by=['weekday'])['powerplay']
    .value_counts()
)

# For every weekday, pick the index label that carries the largest count.
powerplay_common_2020 = powerplay_grouped_2020.groupby(['weekday']).idxmax()

powerplay_common_2020
