In [48]:
import polars as pl
import polars.selectors as cs
import plotly.express as px
import os
source_data = 'Food Waste data & research - PUBLIC.xlsx'
sheet_name = 'food waste by country'

In [46]:
df = (
    pl.read_excel(
        source_data,    
        sheet_name=sheet_name,
        has_header=False,
        read_options={ "skip_rows": 2}
    )
    .filter(~pl.col('column_1').is_in(['Country', 'AVERAGE']))
    .select(
        COUNTRY = pl.col('column_1'),
        HOUSEHOLD_PC =  pl.col('column_3').cast(pl.UInt8()),
        HOUSEHOLD_TOT = pl.col('column_4').cast(pl.UInt32()),
        RETAIL_PC =  pl.col('column_5').cast(pl.UInt8()),
        RETAIL_TOT = pl.col('column_6').cast(pl.UInt32()),
        FOOD_SERVICE_PC =  pl.col('column_7').cast(pl.UInt8()),
        FOOD_SERVICE_TOT = pl.col('column_8').cast(pl.UInt32())
    )
)
df

COUNTRY,HOUSEHOLD_PC,HOUSEHOLD_TOT,RETAIL_PC,RETAIL_TOT,FOOD_SERVICE_PC,FOOD_SERVICE_TOT
str,u8,u32,u8,u32,u8,u32
"""Afghanistan""",82,3109153,16,594982,28,1051783
"""Albania""",83,238492,16,45058,28,79651
"""Algeria""",91,3918529,16,673360,28,1190335
"""Andorra""",84,6497,13,988,26,1971
"""Angola""",100,3169523,16,497755,28,879908
…,…,…,…,…,…,…
"""Venezuela (Boliv. Rep. of)""",72,2065461,16,445994,28,788407
"""Viet Nam""",76,7346717,16,1508689,28,2666991
"""Yemen""",104,3026946,16,456099,28,806270
"""Zambia""",78,1391729,16,279350,28,493822


In [None]:
import polars as pl
import polars.selectors as cs
import plotly.express as px
import os

source_data = 'Lottery_Powerball_Winning_Numbers__Beginning_2010.csv'
df = (
    pl.scan_csv(source_data)
    .with_columns(
        DATE = pl.col('Draw Date').str.to_date(format='%m/%d/%Y'),
        SPLIT_NUMS = pl.col('Winning Numbers').str.split(' ')
    )
    .select(
        pl.col('DATE'),
        # YEAR = pl.col('DATE').dt.year().cast(pl.UInt16),
        # MONTH = pl.col('DATE').dt.strftime('%b'),
        # DAY = pl.col('DATE').dt.strftime('%d').cast(pl.UInt8),
        # DAY_NAME = pl.col('DATE').dt.strftime('%a'),
        LOWEST = pl.col('SPLIT_NUMS')
            .list.get(0, null_on_oob=True)
            .str.strip_chars().cast(pl.UInt8),
        SECOND_LOWEST = pl.col('SPLIT_NUMS')
            .list.get(1, null_on_oob=True)
            .str.strip_chars().cast(pl.UInt8),
        MEDIAN = pl.col('SPLIT_NUMS')
            .list.get(2, null_on_oob=True)
            .str.strip_chars().cast(pl.UInt8),
        SECOND_HIGHEST = pl.col('SPLIT_NUMS')
            .list.get(3, null_on_oob=True)
            .str.strip_chars().cast(pl.UInt8),
        HIGHEST = pl.col('SPLIT_NUMS')
            .list.get(4, null_on_oob=True)
            .str.strip_chars().cast(pl.UInt8),
        POWERBALL = pl.col('SPLIT_NUMS')
            .list.get(5, null_on_oob=True)
            .str.strip_chars().cast(pl.UInt8),
    )
    .sort('DATE', descending=True)
    .sort('LOWEST', descending=False)
    .collect()
)
df


DATE,LOWEST,SECOND_LOWEST,MEDIAN,SECOND_HIGHEST,HIGHEST,POWERBALL
date,u8,u8,u8,u8,u8,u8
2020-10-21,1,3,13,44,56,26
2021-01-06,1,20,22,60,66,3
2021-01-30,1,2,7,52,61,4
2021-02-06,1,16,48,49,65,8
2021-02-17,1,15,21,32,46,1
…,…,…,…,…,…,…
2015-10-17,48,49,57,62,69,19
2020-01-01,49,53,57,59,62,26
2017-07-19,50,51,59,61,63,4
2020-11-21,51,54,57,60,69,11


In [1]:
pick_list = ['LOWEST', 'SECOND_LOWEST', 'MEDIAN', 'SECOND_HIGHEST', 'HIGHEST']
pick_list + ['ALL']

['LOWEST', 'SECOND_LOWEST', 'MEDIAN', 'SECOND_HIGHEST', 'HIGHEST', 'ALL']

In [None]:
(
    df
    .filter(pl.col('CREATED_DATE') != pl.col('RECEIVE_DATE'))
    .filter(pl.col('PICKUP_DURATION').is_not_null())
    .sample(10)
    .glimpse()
)

In [None]:
(
    df
    .filter(pl.col('CREATED_DATE') != pl.col('RECEIVE_DATE'))
    .filter(pl.col('PICKUP_DURATION').is_not_null())
    .select('CREATED_DATE', 'DELIVERY_DATE', 'RECEIVE_DATE','WITHDRAW_DATE', 'EXPIRE_DATE' )
    .sample(5)
    .transpose(include_header=True)
    .sort('column_0')
)



# .str.strptime(pl.Datetime, fmt="%m/%d/%Y %H:%M")


In [None]:
for c in df.columns:
    print(
        c, 
        df[c]
        .value_counts()
        .with_columns(
            PCT = (100*pl.col('count')/pl.col('count').sum()).cast(pl.Float32)
        )
    )

In [None]:
print(list(df.columns))
print()

In [None]:
print(sorted(list(set(df['BOROUGH']))))

In [None]:
fig = px.scatter_map(
    df,
    lat = 'LATITUDE',
    lon = 'LONGITUDE',
    color='BOROUGH'
)
fig.show()

In [None]:
group_by_cols = ['LOCKER_NAME', 'ADDRESS', 'LOCATION_TYPE', 'LATITUDE', 'LONGITUDE', 'BOROUGH']
df_group_by = (
    df
    .group_by(group_by_cols).len()
    .rename({'len':  'COUNT'})
    .sort('COUNT')
)
df_group_by
# df_group_by['len'].sum()

In [None]:
df.columns