In [78]:
import pandas as pd
import numpy as npy
pd.set_option('display.max_columns', 20)
pd.set_option('display.width', 1000)
import datetime
import random
import sqlite3

conn_mb = sqlite3.connect('assets/mb_data.db')
conn_pb = sqlite3.connect('assets/pb_data.db')
cursor_mb = conn_mb.cursor()
cursor_pb = conn_pb.cursor()

###The following two code cells meet the first feature requirements:
1. Read two data files (CSV). 
    - Four CSV files are read in and cleaned (transformed). 
2. Set up a local database and read data in with SQLite. 
    - After transformation the files are read into a SQLite DB to be used for futher analysis.

In [79]:
mega_draw_file = ("assets/megamillions.csv")
mega_weather_file = ("assets/mega_weather.csv")


mega_draw_df = pd.read_csv(mega_draw_file, engine='python', parse_dates= {"draw_date" : ["year","month","day"]})
mega_weather_df = pd.read_csv(mega_weather_file, engine='python')

#Split the "DATE_TIME" column into 2 seperate columns "Date" and "Time".

mega_weather_df['Date'] = pd.to_datetime(mega_weather_df['DATE_TIME']).dt.date
mega_weather_df['Time'] = pd.to_datetime(mega_weather_df['DATE_TIME']).dt.time

pd.to_datetime(mega_draw_df['draw_date'], errors='coerce')
mega_draw_df['month'] = pd.DatetimeIndex(mega_draw_df['draw_date']).month_name()
mega_draw_df['day_name'] = pd.DatetimeIndex(mega_draw_df['draw_date']).day_name()
mega_draw_df['day_num'] = pd.DatetimeIndex(mega_draw_df['draw_date']).dayofweek
mega_draw_df['day_date'] = pd.DatetimeIndex(mega_draw_df['draw_date']).day

mega_draw_df.to_sql('mb_draw', conn_mb, if_exists='append', index = False)
mega_weather_df.to_sql('mb_weather', conn_mb, if_exists='append', index = False)

print(mega_draw_df)
print(mega_weather_df)

      draw_date           game  num_1  num_2  num_3  num_4  num_5  mb     month day_name  day_num  day_date
0    2003-12-05  Mega Millions     12     44     15     18      1  42  December   Friday        4         5
1    2003-12-09  Mega Millions     14     15     48      4     24  41  December  Tuesday        1         9
2    2003-12-12  Mega Millions     16     32     46      9     45  26  December   Friday        4        12
3    2003-12-16  Mega Millions     47     16     31     24     46  47  December  Tuesday        1        16
4    2003-12-19  Mega Millions      5     10     39     17     35  38  December   Friday        4        19
...         ...            ...    ...    ...    ...    ...    ...  ..       ...      ...      ...       ...
2006 2023-02-24  Mega Millions     22     49      2     65     67   7  February   Friday        4        24
2007 2023-02-28  Mega Millions     59     52     40     14     16  13  February  Tuesday        1        28
2008 2023-03-03  Mega Millio

In [80]:
pb_draw_file = ("assets/powerball.csv")
pb_weather_file = ("assets/pb_weather.csv")

pb_draw_df = pd.read_csv(pb_draw_file, engine='python', parse_dates= {"draw_date" : ["year","month","day"]})
pb_weather_df = pd.read_csv(pb_weather_file, engine='python')

pb_weather_df['Date'] = pd.to_datetime(pb_weather_df['DATE_TIME']).dt.date
pb_weather_df['Time'] = pd.to_datetime(pb_weather_df['DATE_TIME']).dt.time

pd.to_datetime(pb_draw_df['draw_date'], errors='coerce')
pb_draw_df['month'] = pd.DatetimeIndex(pb_draw_df['draw_date']).month_name()
pb_draw_df['day_name'] = pd.DatetimeIndex(pb_draw_df['draw_date']).day_name()
pb_draw_df['day_num'] = pd.DatetimeIndex(pb_draw_df['draw_date']).dayofweek
pb_draw_df['day_date'] = pd.DatetimeIndex(pb_draw_df['draw_date']).day

pb_draw_df.to_sql('pb_draw', conn_pb, if_exists='append', index = False)
pb_weather_df.to_sql('pb_weather', conn_pb, if_exists='append', index = False)

print(pb_draw_df)
print(pb_weather_df)

      draw_date       game  num_1  num_2  num_3  num_4  num_5  pb     month   day_name  day_num  day_date
0    2010-02-03  Powerball     37     52     22     36     17  24  February  Wednesday        2         3
1    2010-02-06  Powerball     22     54     52     14     59   4  February   Saturday        5         6
2    2010-02-10  Powerball     29      8     37     38      5  34  February  Wednesday        2        10
3    2010-02-13  Powerball     14     10     40     51     30   1  February   Saturday        5        13
4    2010-02-17  Powerball     36      7     26      8     19  15  February  Wednesday        2        17
...         ...        ...    ...    ...    ...    ...    ...  ..       ...        ...      ...       ...
1445 2023-03-04  Powerball     10     40     16     18     66  16     March   Saturday        5         4
1446 2023-03-06  Powerball     69     58     13     29      2   4     March     Monday        0         6
1447 2023-03-08  Powerball     43     69     2

In [81]:
add_time = "ALTER TABLE mb_draw ADD COLUMN draw_time varchar (32)"
cursor_mb.execute(add_time)

OperationalError: duplicate column name: draw_time

In [None]:
#weather_df = weather_df.astype({"HourlyRelativeHumidity": float})
weather_filter_df = mega_weather_df.query('DBTemp == 42 & RHumid == 73')
#weather_df.dtypes
#weather_filter_df
weather_date_unique = weather_filter_df['Date'].unique()
weather_date_unique_df = pd.DataFrame(weather_date_unique, columns=['date_time'])
weather_date_unique_df['date_time'] = pd.to_datetime(weather_date_unique_df['date_time'])

weather_date_unique_df['time_only'] = pd.to_datetime(weather_date_unique_df['date_time']).dt.time
weather_date_unique_df['draw_date'] = pd.to_datetime(weather_date_unique_df['date_time']).dt.date
weather_date_unique_df['draw_date'] = pd.to_datetime(weather_date_unique_df['draw_date'])
weather_date_unique_df['draw_date'] = weather_date_unique_df['draw_date'].dt.strftime('%m/%d/%Y')



dates_list = weather_date_unique_df['draw_date'].values.tolist()

#with pd.option_context('display.max_rows', None,
#                       'display.max_columns', None,
#                       'display.precision', 3,
#                       ):
#    print(weather_date_unique_df)
dates_list


In [None]:
arr = mega_draw_df.iloc[:, 2:7].values

mega_draw_df['check'] = npy.any(((arr[:, 1:] - arr[:, :-1]) == 1),axis=1).astype(int)
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(mega_draw_df)

mega_draw_df.dtypes

In [None]:
counts_1 = mega_draw_df['num_1'].value_counts()
counts_2 = mega_draw_df['num_2'].value_counts()
counts_3 = mega_draw_df['num_3'].value_counts()
counts_4 = mega_draw_df['num_4'].value_counts()
counts_5 = mega_draw_df['num_5'].value_counts()
counts_MB = mega_draw_df['MB'].value_counts()
#counts_df.columns = ['num_1', 'counts_1']
#with pd.option_context('display.max_rows', None,
                       #'display.max_columns', None,
                       #'display.precision', 3,
                       #):
    #print(counts_1, counts_2)
counts_df = pd.concat([counts_1, counts_2, counts_3, counts_4, counts_5, counts_MB], axis=1)
with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(counts_df)


In [None]:
mega_draw_df[0:2][['num_1', 'num_5']] #Slice the dataframe by selected row indexes and a list of column names.
mega_draw_df['num_3'][:4] #create a series from the first 4 rows (0 - 3) of the column string "num_3".
dft['num_2' : 'num_5'] #Slice transposed dataframe with row name from 'num_2 up to and including 'num_5'.
dft.loc['num_3', 2] #Row/Column based indexing. loc uses lables to slice.
mega_draw_df.iloc[1:100, :] #Slice dataframe by rows 1 - 99 and all columns. iloc uses integers as index positions to slice.


In [None]:
mega_draw_df['draw_date'] = pd.to_datetime(mega_draw_df['draw_date'])
date_df = mega_draw_df[mega_draw_df['draw_date'].dt.month == 12]

counts_1 = date_df['num_1'].value_counts()
counts_2 = date_df['num_2'].value_counts()
counts_3 = date_df['num_3'].value_counts()
counts_4 = date_df['num_4'].value_counts()
counts_5 = date_df['num_5'].value_counts()
counts_MB = date_df['MB'].value_counts()
counts_df = pd.concat([counts_1, counts_2, counts_3, counts_4, counts_5, counts_MB], axis=1)

with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(counts_df)
    

In [None]:
mega_draw_df['draw_date'].astype(str)
df2 = mega_draw_df[mega_draw_df['draw_date'].isin(dates_list)]




with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(df2)



In [None]:
counts_1 = df2['num_1'].value_counts()
counts_2 = df2['num_2'].value_counts()
counts_3 = df2['num_3'].value_counts()
counts_4 = df2['num_4'].value_counts()
counts_5 = df2['num_5'].value_counts()
counts_MB = df2['MB'].value_counts()
counts_MB_df= pd.DataFrame(counts_MB)
sorted_MB = counts_MB_df.sort_values('MB', ascending=False)
counts_df = pd.concat([counts_1, counts_2, counts_3, counts_4, counts_5], axis=1)
counts_df['totals'] = counts_df.sum(axis=1, skipna=True)
sorted_df = counts_df.sort_values('totals', ascending=False)

with pd.option_context('display.max_rows', None,
                       'display.max_columns', None,
                       'display.precision', 3,
                       ):
    print(sorted_df)
    print(sorted_MB)

In [None]:
draw_list=(sorted_df.index.values)
draw_list_MB=(sorted_MB.index.values)
print(draw_list)
print(draw_list_MB)
type(draw_list)

In [None]:
draw_list = list(draw_list)
new_list1 = random.sample(draw_list, 5)
new_list1 = sorted(new_list1)
draw_list_MB = list(draw_list_MB)
new_list_MB = random.sample(draw_list_MB, 1)
#sorted([str(x) for x in new_list1])
print(new_list1 + new_list_MB)
type(new_list1[1])