In [1]:
import polars as pl
import polars.selectors as cs
import plotly.express as px


# Translation table for CSV headers: blank -> underscore, parentheses removed.
_HEADER_CLEANUP = str.maketrans(' ', '_', '()')


def _normalize_column_name(name: str) -> str:
    """Return the header upper-cased, with blanks as underscores and no parentheses."""
    return name.upper().translate(_HEADER_CLEANUP)


# Columns retained for the analysis, named as they appear after normalization.
KEEP_COLUMNS = [
    'TYPE', 'DELIVERED', 'RECEIVED',
    'LOCKER_NAME', 'LOCKER_BOX_DOOR', 'LOCKER_SIZE',
    'PICKUP_DURATION', 'DELIVERY_DURATION',
    'LOCATION_TYPE', 'ADDRESS', 'LATITUDE', 'LONGITUDE',
    'BOROUGH',
    'RECEIVE_DATE', 'CREATED_DATE', 'DELIVERY_DATE', 'WITHDRAW_DATE', 'EXPIRE_DATE',
]

df = (
    pl.read_csv('LockerNYC_Reservations_20250903.csv')
    .rename(_normalize_column_name)
    .select(KEEP_COLUMNS)
    # Every *_DATE column is text in month/day/year hour:minute form.
    .with_columns(cs.ends_with('_DATE').str.to_datetime(format="%m/%d/%Y %H:%M"))
    # Both expressions read the untouched ADDRESS because they share one context:
    #   ZIP_CODE = last blank-separated token, ADDRESS = text before the first comma.
    .with_columns(
        ZIP_CODE=pl.col('ADDRESS').str.split(' ').list.last(),
        ADDRESS=pl.col('ADDRESS').str.split(',').list.first(),
    )
)
print(df.sample(3))


shape: (3, 19)
┌─────────┬───────────┬──────────┬────────────┬───┬────────────┬────────────┬───────────┬──────────┐
│ TYPE    ┆ DELIVERED ┆ RECEIVED ┆ LOCKER_NAM ┆ … ┆ DELIVERY_D ┆ WITHDRAW_D ┆ EXPIRE_DA ┆ ZIP_CODE │
│ ---     ┆ ---       ┆ ---      ┆ E          ┆   ┆ ATE        ┆ ATE        ┆ TE        ┆ ---      │
│ str     ┆ bool      ┆ bool     ┆ ---        ┆   ┆ ---        ┆ ---        ┆ ---       ┆ str      │
│         ┆           ┆          ┆ str        ┆   ┆ datetime[μ ┆ datetime[μ ┆ datetime[ ┆          │
│         ┆           ┆          ┆            ┆   ┆ s]         ┆ s]         ┆ μs]       ┆          │
╞═════════╪═══════════╪══════════╪════════════╪═══╪════════════╪════════════╪═══════════╪══════════╡
│ Receive ┆ true      ┆ true     ┆ 508-1      ┆ … ┆ 2024-12-09 ┆ 2024-12-10 ┆ 2024-12-1 ┆ 10009    │
│         ┆           ┆          ┆            ┆   ┆ 20:34:00   ┆ 07:50:00   ┆ 0         ┆          │
│         ┆           ┆          ┆            ┆   ┆            ┆            

In [2]:
# Glimpse ten random reservations whose CREATED_DATE differs from RECEIVE_DATE
# and that have a PICKUP_DURATION value.
created_differs = pl.col('CREATED_DATE') != pl.col('RECEIVE_DATE')
has_pickup_duration = pl.col('PICKUP_DURATION').is_not_null()

# Several predicates passed to one filter() are AND-ed together.
df.filter(created_differs, has_pickup_duration).sample(10).glimpse()

Rows: 10
Columns: 19
$ TYPE                       <str> 'Receive', 'Receive', 'Receive', 'Receive', 'Receive', 'Receive', 'Receive', 'Receive', 'Receive', 'Receive'
$ DELIVERED                 <bool> True, True, True, True, True, True, True, True, True, True
$ RECEIVED                  <bool> True, True, True, True, True, True, True, True, True, True
$ LOCKER_NAME                <str> '508-1', 'Westside Home Center', '508-1', 'GoLocker W 161 Street', 'GoLocker St.Marks', 'City Point - 2', '508-1', 'Bright Mart Laundry - Bushwick', 'Ideal Food Basket - Lafayette Ave', 'City Point - 2'
$ LOCKER_BOX_DOOR            <i64> 4, 27, 7, 14, 17, 27, 8, 17, 11, 16
$ LOCKER_SIZE                <str> 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S', 'S'
$ PICKUP_DURATION            <str> '00.02.15.17', '00.02.41.44', '00.00.05.53', '00.03.36.00', '00.16.45.09', '02.21.59.45', '00.20.23.33', '00.00.04.48', '00.22.32.57', '00.21.33.11'
$ DELIVERY_DURATION          <str> '00.04.08.36', '00.04.58.33', '00.0

In [3]:
# Timestamp columns to compare side by side.
date_columns = ['CREATED_DATE', 'DELIVERY_DATE', 'RECEIVE_DATE', 'WITHDRAW_DATE', 'EXPIRE_DATE']

(
    df
    .filter(
        (pl.col('CREATED_DATE') != pl.col('RECEIVE_DATE'))
        & pl.col('PICKUP_DURATION').is_not_null()
    )
    .select(date_columns)
    .sample(5)
    # One column per sampled reservation, one row per timestamp field ...
    .transpose(include_header=True)
    # ... ordered by the first sampled reservation's timestamps.
    .sort('column_0')
)



# .str.strptime(pl.Datetime, fmt="%m/%d/%Y %H:%M")


column,column_0,column_1,column_2,column_3,column_4
str,datetime[μs],datetime[μs],datetime[μs],datetime[μs],datetime[μs]
"""RECEIVE_DATE""",2025-02-19 13:39:00,2025-02-12 14:16:00,2025-01-03 15:20:00,2025-02-10 16:03:00,2025-02-11 16:16:00
"""CREATED_DATE""",2025-02-19 13:40:00,2025-02-12 14:17:00,2025-01-03 15:21:00,2025-02-10 16:04:00,2025-02-11 16:17:00
"""DELIVERY_DATE""",2025-02-19 20:31:00,2025-02-12 18:30:00,2025-01-03 19:09:00,2025-02-10 17:59:00,2025-02-11 19:40:00
"""WITHDRAW_DATE""",2025-02-21 09:38:00,2025-02-17 18:37:00,2025-01-04 10:44:00,2025-02-11 16:32:00,2025-02-19 15:32:00
"""EXPIRE_DATE""",2025-02-22 20:31:00,2025-02-13 18:30:00,2025-01-04 19:09:00,2025-02-13 17:59:00,2025-02-12 19:40:00


In [4]:
# Share of each distinct value as a percentage of all rows in the column.
pct_of_total = (100 * pl.col('count') / pl.col('count').sum()).cast(pl.Float32)

# Print a frequency table (count and percentage) for every column of df.
for col_name in df.columns:
    frequencies = df[col_name].value_counts().with_columns(PCT=pct_of_total)
    print(col_name, frequencies)

TYPE shape: (3, 3)
┌─────────┬───────┬───────────┐
│ TYPE    ┆ count ┆ PCT       │
│ ---     ┆ ---   ┆ ---       │
│ str     ┆ u32   ┆ f32       │
╞═════════╪═══════╪═══════════╡
│ Receive ┆ 11894 ┆ 96.888237 │
│ Direct  ┆ 163   ┆ 1.327794  │
│ Send    ┆ 219   ┆ 1.783969  │
└─────────┴───────┴───────────┘
DELIVERED shape: (2, 3)
┌───────────┬───────┬───────────┐
│ DELIVERED ┆ count ┆ PCT       │
│ ---       ┆ ---   ┆ ---       │
│ bool      ┆ u32   ┆ f32       │
╞═══════════╪═══════╪═══════════╡
│ false     ┆ 6     ┆ 0.048876  │
│ true      ┆ 12270 ┆ 99.951126 │
└───────────┴───────┴───────────┘
RECEIVED shape: (2, 3)
┌──────────┬───────┬───────────┐
│ RECEIVED ┆ count ┆ PCT       │
│ ---      ┆ ---   ┆ ---       │
│ bool     ┆ u32   ┆ f32       │
╞══════════╪═══════╪═══════════╡
│ true     ┆ 11995 ┆ 97.710983 │
│ false    ┆ 281   ┆ 2.289019  │
└──────────┴───────┴───────────┘
LOCKER_NAME shape: (34, 3)
┌─────────────────────────────────┬───────┬───────────┐
│ LOCKER_NAME              

In [5]:
# Column names on one line, followed by a blank line.
print(df.columns, end='\n\n')

['TYPE', 'DELIVERED', 'RECEIVED', 'LOCKER_NAME', 'LOCKER_BOX_DOOR', 'LOCKER_SIZE', 'PICKUP_DURATION', 'DELIVERY_DURATION', 'LOCATION_TYPE', 'ADDRESS', 'LATITUDE', 'LONGITUDE', 'BOROUGH', 'RECEIVE_DATE', 'CREATED_DATE', 'DELIVERY_DATE', 'WITHDRAW_DATE', 'EXPIRE_DATE', 'ZIP_CODE']



In [6]:
# Distinct boroughs in alphabetical order.
# De-duplicating and sorting inside polars avoids copying the whole column into
# a Python set, and does not raise if the column contains a null (Python's
# sorted() cannot compare None with str).
print(df['BOROUGH'].unique().sort().to_list())

['Brooklyn', 'Manhattan', 'Queens']


In [7]:
# Map every reservation at its locker's coordinates, coloured by borough.
fig = px.scatter_map(data_frame=df, lat='LATITUDE', lon='LONGITUDE', color='BOROUGH')
fig.show()

In [8]:
# Attributes that together identify one locker location.
group_by_cols = [
    'LOCKER_NAME',
    'ADDRESS',
    'LOCATION_TYPE',
    'LATITUDE',
    'LONGITUDE',
    'BOROUGH',
]

# Number of reservations per locker location, least-used lockers first.
df_group_by = (
    df
    .group_by(group_by_cols)
    .agg(pl.len().alias('COUNT'))
    .sort('COUNT')
)
df_group_by
# Sanity-check idea: df_group_by['COUNT'].sum() gives the total number of grouped rows.

LOCKER_NAME,ADDRESS,LOCATION_TYPE,LATITUDE,LONGITUDE,BOROUGH,COUNT
str,str,str,f64,f64,str,u32
"""GoLocker-Farm Country Supermar…","""75-55 31st Ave""","""Indoor""",40.75991,-73.89213,"""Queens""",2
"""GoLocker-Ocean Eats""","""865 4th Ave""","""Indoor""",40.65644,-74.00172,"""Brooklyn""",3
"""GoLocker-Astoria Central Parki…","""31-57 31st Street""","""Indoor""",40.76285,-73.92423,"""Queens""",3
"""GoLocker-MobiCompu Repair""","""279 Smith St""","""Indoor""",40.68243,-73.99327,"""Brooklyn""",3
"""GoLocker-Sherman Parking Ridge…","""1870 Troutman Street""","""Indoor""",40.71121,-73.91778,"""Queens""",5
…,…,…,…,…,…,…
"""City Point - 2""","""445 Albee Square W""","""Indoor""",40.691078,-73.982784,"""Brooklyn""",817
"""Fort Greene Food Market""","""186 DeKalb Ave""","""Indoor""",40.689589,-73.972213,"""Brooklyn""",916
"""Ideal Food Basket - Lafayette …","""830 Lafayette Ave""","""Outdoor""",40.691226,-73.939338,"""Brooklyn""",1383
"""508-1""","""508 East 12th St""","""Indoor""",40.728865,-73.980838,"""Manhattan""",1765


In [9]:
# Show the column names of df as the cell's output value.
df.columns

['TYPE',
 'DELIVERED',
 'RECEIVED',
 'LOCKER_NAME',
 'LOCKER_BOX_DOOR',
 'LOCKER_SIZE',
 'PICKUP_DURATION',
 'DELIVERY_DURATION',
 'LOCATION_TYPE',
 'ADDRESS',
 'LATITUDE',
 'LONGITUDE',
 'BOROUGH',
 'RECEIVE_DATE',
 'CREATED_DATE',
 'DELIVERY_DATE',
 'WITHDRAW_DATE',
 'EXPIRE_DATE',
 'ZIP_CODE']