In [3]:
import polars as pl
import polars.selectors as cs
import plotly.express as px
import os

source_data = 'Lottery_Powerball_Winning_Numbers__Beginning_2010.csv'

# Lazily scan the CSV, parse the draw date, derive calendar fields from it and
# spread the space-separated 'Winning Numbers' string into one UInt8 column per
# position (NUM_1 .. NUM_6). Nothing is read from disk until .collect().
df = (
    pl.scan_csv(source_data)
    .with_columns(
        pl.col('Draw Date').str.to_date(format='%m/%d/%Y').alias('DATE'),
        pl.col('Winning Numbers').str.split(' ').alias('SPLIT_NUMS'),
    )
    .select(
        pl.col('DATE'),
        pl.col('DATE').dt.year().cast(pl.UInt16).alias('YEAR'),
        pl.col('DATE').dt.strftime('%b').alias('MONTH'),        # abbreviated month name
        pl.col('DATE').dt.strftime('%d').cast(pl.UInt8).alias('DAY'),
        pl.col('DATE').dt.strftime('%a').alias('DAY_NAME'),     # abbreviated weekday name
        *[
            pl.col('SPLIT_NUMS')
            .list.get(slot, null_on_oob=True)   # null instead of an error on short rows
            .str.strip_chars()
            .cast(pl.UInt8)
            .alias(f'NUM_{slot + 1}')
            for slot in range(6)
        ],
    )
    .collect()
)
df.head()



DATE,YEAR,MONTH,DAY,DAY_NAME,NUM_1,NUM_2,NUM_3,NUM_4,NUM_5,NUM_6
date,u16,str,u8,str,u8,u8,u8,u8,u8,u8
2020-09-26,2020,"""Sep""",26,"""Sat""",11,21,27,36,62,24
2020-09-30,2020,"""Sep""",30,"""Wed""",14,18,36,49,67,18
2020-10-03,2020,"""Oct""",3,"""Sat""",18,31,36,43,47,20
2020-10-07,2020,"""Oct""",7,"""Wed""",6,24,30,53,56,19
2020-10-10,2020,"""Oct""",10,"""Sat""",5,18,23,40,50,18


In [None]:
# Quick look at ten randomly chosen rows, printed one column per line.
# The former filters on CREATED_DATE / RECEIVE_DATE / PICKUP_DURATION were
# dropped: df has none of those columns, so the cell raised ColumnNotFoundError.
(
    df
    .sample(10)
    .glimpse()
)

In [None]:
# Transposed view of five random rows: one output row per NUM_* column, one
# output column per sampled row (column_0 .. column_4), ordered by the values
# of the first sampled row.
# The previous select listed CREATED_DATE, DELIVERY_DATE, RECEIVE_DATE,
# WITHDRAW_DATE and EXPIRE_DATE, none of which exist in df.
(
    df
    .select(cs.starts_with('NUM_'))
    .sample(5)
    .transpose(include_header=True)   # original column names land in 'column'
    .sort('column_0')
)



# .str.strptime(pl.Datetime, fmt="%m/%d/%Y %H:%M")


In [None]:
# Frequency table for every column of df: each distinct value, how often it
# occurs ('count') and its share of all rows in percent ('PCT', Float32).
pct_of_total = (100 * pl.col('count') / pl.col('count').sum()).cast(pl.Float32)

for c in df.columns:
    counts = df[c].value_counts()
    print(c, counts.with_columns(PCT=pct_of_total))

In [None]:
# Column names of df, followed by one blank line.
print(df.columns, end='\n\n')

In [None]:
# Distinct weekday labels present in the data, alphabetically sorted.
# (Previously read df['BOROUGH'], which is not a column of df.)
print(sorted(set(df['DAY_NAME'])))

In [None]:
# Scatter of the sixth number of every draw over time, coloured by weekday.
# The earlier scatter_map call used LATITUDE / LONGITUDE / BOROUGH, which are
# not columns of df; this frame carries no coordinates to put on a map.
fig = px.scatter(
    df,
    x='DATE',
    y='NUM_6',
    color='DAY_NAME',
)
fig.show()

In [None]:
# Number of rows per (YEAR, DAY_NAME) combination, smallest groups first.
# The previous grouping keys (LOCKER_NAME, ADDRESS, LOCATION_TYPE, LATITUDE,
# LONGITUDE, BOROUGH) do not exist in df.
group_by_cols = ['YEAR', 'DAY_NAME']
df_group_by = (
    df
    .group_by(group_by_cols).len()
    .rename({'len':  'COUNT'})
    # group_by output order is not stable; tie-break on the keys so that
    # re-running the cell always shows the same row order.
    .sort('COUNT', *group_by_cols)
)
# Every input row must land in exactly one group.
assert df_group_by['COUNT'].sum() == df.height
df_group_by

In [None]:
# Show the column names of df as the cell's output.
df.columns