In [1]:
import polars as pl
import requests
import json

In [2]:
Token = 'PawWhEunzcfyHfPtjouuRcCyeQXeMwhk'

Import Origination Data

In [3]:
originatingPeices1 = pl.read_csv('../../Data/Mail_v2/Originating Pieces pt.1 v2.csv', null_values='null')

In [4]:
originatingPeices2 = pl.read_csv('../../Data/Mail_v2/Originating Pieces pt.2 v2.csv', null_values='null')

In [5]:
originatingPeices = pl.concat([originatingPeices1, originatingPeices2], how = 'vertical')

In [6]:
originatingPeices.shape

(42174946, 10)

Remove Null Values

In [7]:
originatingPeices_v2 = originatingPeices.drop_nulls()

In [8]:
originatingPeices_v2.shape

(41160323, 10)

Remove Duplicate Values

In [9]:
originatingPeices_v3 = originatingPeices_v2.unique(subset=['UNIQUE_IDENTIFIER'], keep='none')

In [10]:
originatingPeices_v3.shape

(41160140, 10)

Remove Date Errors

In [11]:
originatingPeices_v4 = originatingPeices_v3.filter(pl.col('ACTUAL_DLVRY_DATE') > pl.col('START_THE_CLOCK_DATE'))

In [12]:
originatingPeices_v4.shape

(38832134, 10)

Filter for Mail Originating and Destinating in Music City

In [32]:
originatingPeices_v5 = originatingPeices_v4.filter((pl.col('ORIGIN_FACILITY') == 'MUSIC CITY ANNEX - 1532174') &
                                                   (pl.col('EXPECTED_DESTINATION_FACILITY') == 'MUSIC CITY ANNEX - 1532174'))

In [16]:
originatingPeices_v5.shape

(1821507, 10)

Find unique originating zips and dates

In [41]:
ozip = originatingPeices_v5.select(pl.col('OZIP3').alias('zip3')).unique(keep='first')

In [42]:
ozip

zip3
i64
372


In [67]:
oStartDate = originatingPeices_v5.select(pl.col('START_THE_CLOCK_DATE').alias('date')).unique(keep='first')
oStartDate

date
str
"""2024-01-20"""
"""2024-01-13"""
"""2024-01-10"""
"""2024-01-21"""
"""2024-01-17"""
…
"""2024-01-14"""
"""2024-01-18"""
"""2024-01-11"""
"""2024-01-08"""


Find Unique Destinating zips and dates

In [46]:
dzip = originatingPeices_v5.select(pl.col('DZIP3').alias('zip3')).unique(keep='first')
dzip

zip3
i64
307
385
370
373
421
374
371
422
384
372


In [68]:
dDate = originatingPeices_v5.select(pl.col('EXPECTED_DELIVERY_DATE').alias('date')).unique(keep='first')
dDate

date
str
"""2024-01-12"""
"""2024-01-16"""
"""2024-01-27"""
"""2024-01-22"""
"""2024-01-09"""
…
"""2024-01-25"""
"""2024-01-19"""
"""2024-01-23"""
"""2024-01-10"""


Find unique zips

In [64]:
zip3 = pl.concat([ozip, dzip]).unique(keep='first')['zip3'].to_list()

In [66]:
zip3

[385, 422, 374, 307, 371, 372, 384, 373, 421, 370]

Find unique dates

In [69]:
date = pl.concat([oStartDate, dDate]).unique()['date'].to_list()

In [70]:
date

['2024-01-14',
 '2024-01-17',
 '2024-01-23',
 '2024-01-24',
 '2024-01-16',
 '2024-01-19',
 '2024-01-26',
 '2024-01-08',
 '2024-01-10',
 '2024-01-18',
 '2024-01-20',
 '2024-01-09',
 '2024-01-22',
 '2024-01-27',
 '2024-01-25',
 '2024-01-12',
 '2024-01-11',
 '2024-01-13',
 '2024-01-21']

Import Zip Codes

In [54]:
upspZipcodes = pl.read_excel('../../Data/ZIP_Locale_Detail.xls')

In [55]:
upspZipcodes.head()

AREA NAME,AREA CODE,DISTRICT NAME,DISTRICT NO,DELIVERY ZIPCODE,LOCALE NAME,PHYSICAL DELV ADDR,PHYSICAL CITY,PHYSICAL STATE,PHYSICAL ZIP,PHYSICAL ZIP 4
str,str,str,str,str,str,str,str,str,str,str
"""SOUTHERN""","""4G""","""PUERTO RICO""","""006""","""00601""","""ADJUNTAS""","""37 CALLE MUNOZ RIVERA""","""ADJUNTAS""","""PR""","""00601""","""9998"""
"""SOUTHERN""","""4G""","""PUERTO RICO""","""006""","""00602""","""AGUADA""","""5 AVE NATIVO ALERS""","""AGUADA""","""PR""","""00602""","""9998"""
"""SOUTHERN""","""4G""","""PUERTO RICO""","""006""","""00603""","""AGUADILLA""","""50 CARR 459 STE 1""","""AGUADILLA""","""PR""","""00603""","""9998"""
"""SOUTHERN""","""4G""","""PUERTO RICO""","""006""","""00604""","""RAMEY""","""100 AVE BORINQUEN""","""AGUADILLA""","""PR""","""00603""","""9996"""
"""SOUTHERN""","""4G""","""PUERTO RICO""","""006""","""00605""","""AGUADILLA""","""50 CARR 459 STE 1""","""AGUADILLA""","""PR""","""00603""","""9998"""


In [58]:
upspZipcodes_cleaned = upspZipcodes.select(pl.col('DELIVERY ZIPCODE')).with_columns(pl.col('DELIVERY ZIPCODE').str.slice(0,3).alias('zip3').cast(pl.Int64))

In [59]:
upspZipcodes_cleaned.head()

DELIVERY ZIPCODE,zip3
str,i64
"""00601""",6
"""00602""",6
"""00603""",6
"""00604""",6
"""00605""",6


Weather Data Pull Function

In [82]:
def get_weather_data(api_key, zip_code, startdate, enddate):
    base_url = "https://www.ncdc.noaa.gov/cdo-web/api/v2/data"
    headers = {'token': api_key}
    params = {
        'datasetid': 'GHCND',
        'locationid': f'ZIP:{zip_code}',
        'startdate': startdate,
        'enddate': enddate,
        'datatypeid': ['TMIN', 'TMAX', 'PRCP', 'SNOW'],
        'units': 'standard',
        'limit': 1000
    }

    response = requests.get(base_url, headers=headers, params=params)

    if response.status_code == 200:
        data = response.json().get('results', [])
        return data

    else:
        emp_list = []
        return emp_list

In [78]:
zip3

[385, 422, 374, 307, 371, 372, 384, 373, 421, 370]

In [95]:
weatherdf = pl.DataFrame()

In [96]:
for i in zip3:
    zipcodes = upspZipcodes_cleaned.filter(pl.col('zip3') == i)
    for k in range(len(zipcodes)):
        z = zipcodes['DELIVERY ZIPCODE'][k]
        print('{2} - {0}/{1}'.format(k, (len(zipcodes)-1), i))
        d = get_weather_data(Token, z, '2024-01-08', '2024-01-27')
        if (len(d) > 0):
            raw = pl.DataFrame(d)
            pivoted = raw.pivot(index='date', values='value', columns='datatype', aggregate_function='max')
            pivoted = pivoted.with_columns(pl.lit(i).alias('zip3'))
            if len(weatherdf) == 0:
                weatherdf = pl.concat([weatherdf, pivoted])
            else:
                weatherdf = pl.concat([weatherdf, pivoted], how='align')

385 - 0/57
385 - 1/57
385 - 2/57
385 - 3/57
385 - 4/57
385 - 5/57
385 - 6/57
385 - 7/57
385 - 8/57
385 - 9/57
385 - 10/57
385 - 11/57
385 - 12/57
385 - 13/57
385 - 14/57
385 - 15/57
385 - 16/57
385 - 17/57
385 - 18/57
385 - 19/57
385 - 20/57
385 - 21/57
385 - 22/57
385 - 23/57
385 - 24/57
385 - 25/57
385 - 26/57
385 - 27/57
385 - 28/57
385 - 29/57
385 - 30/57
385 - 31/57
385 - 32/57
385 - 33/57
385 - 34/57
385 - 35/57
385 - 36/57
385 - 37/57
385 - 38/57
385 - 39/57
385 - 40/57
385 - 41/57
385 - 42/57
385 - 43/57
385 - 44/57
385 - 45/57
385 - 46/57
385 - 47/57
385 - 48/57
385 - 49/57
385 - 50/57
385 - 51/57
385 - 52/57
385 - 53/57
385 - 54/57
385 - 55/57
385 - 56/57
385 - 57/57
422 - 0/44
422 - 1/44
422 - 2/44
422 - 3/44
422 - 4/44
422 - 5/44
422 - 6/44
422 - 7/44
422 - 8/44
422 - 9/44
422 - 10/44
422 - 11/44
422 - 12/44
422 - 13/44
422 - 14/44
422 - 15/44
422 - 16/44
422 - 17/44
422 - 18/44
422 - 19/44
422 - 20/44
422 - 21/44
422 - 22/44
422 - 23/44
422 - 24/44
422 - 25/44
422 - 26/44


In [97]:
weatherdf.head()

date,PRCP,SNOW,TMAX,TMIN,zip3
str,f64,f64,f64,f64,i32
"""2024-01-08T00:00:00""",,0.0,45.0,27.0,422
"""2024-01-08T00:00:00""",,,51.0,28.0,307
"""2024-01-08T00:00:00""",0.0,,,,307
"""2024-01-08T00:00:00""",0.0,,,,307
"""2024-01-08T00:00:00""",0.0,0.0,,,370


In [98]:
weatherdf = weatherdf.group_by(['date', 'zip3']).agg([pl.max('PRCP'),
                                                      pl.max('SNOW'),
                                                      pl.max('TMAX'),
                                                      pl.min('TMIN')])

In [100]:
weatherdf.write_parquet('../../Data/weatherMailData.parquet.gzip', compression='gzip')

In [101]:
weatherdf = weatherdf.select(pl.col('date').str.to_date('%Y-%m-%dT%H:%M:%S'),
                             pl.col('zip3'),
                             pl.col('PRCP').alias('PRCP'),
                             pl.col('SNOW').alias('SNOW'),
                             pl.col('TMAX').alias('TMAX'),
                             pl.col('TMIN').alias('TMIN'))

In [102]:
weatherdf = weatherdf.select(pl.col('date'),
                             pl.col('zip3').cast(pl.Int64),
                             pl.col('PRCP'),
                             pl.col('SNOW'),
                             pl.col('TMAX'),
                             pl.col('TMIN'))

In [103]:
originatingPeices_v5.columns

['UNIQUE_IDENTIFIER',
 'START_THE_CLOCK_DATE',
 'ORIGIN_FACILITY',
 'OZIP3',
 'ACTUAL_DLVRY_DATE',
 'EXPECTED_DELIVERY_DATE',
 'EXPECTED_DESTINATION_FACILITY',
 'DZIP3',
 'MAIL_CLASS',
 'MAIL_SHAPE']

In [104]:
originatingPeices_v5 = originatingPeices_v5.select(pl.col('UNIQUE_IDENTIFIER'),
                                                   pl.col('START_THE_CLOCK_DATE').str.to_date('%Y-%m-%d'),
                                                   pl.col('ORIGIN_FACILITY'),
                                                   pl.col('OZIP3'),
                                                   pl.col('ACTUAL_DLVRY_DATE').str.to_date('%Y-%m-%d'),
                                                   pl.col('EXPECTED_DELIVERY_DATE').str.to_date('%Y-%m-%d'),
                                                   pl.col('EXPECTED_DESTINATION_FACILITY'),
                                                   pl.col('DZIP3'),
                                                   pl.col('MAIL_CLASS'),
                                                   pl.col('MAIL_SHAPE'))

In [105]:
weatherdf.columns

['date', 'zip3', 'PRCP', 'SNOW', 'TMAX', 'TMIN']

In [106]:
originatingPeices_v5.columns

['UNIQUE_IDENTIFIER',
 'START_THE_CLOCK_DATE',
 'ORIGIN_FACILITY',
 'OZIP3',
 'ACTUAL_DLVRY_DATE',
 'EXPECTED_DELIVERY_DATE',
 'EXPECTED_DESTINATION_FACILITY',
 'DZIP3',
 'MAIL_CLASS',
 'MAIL_SHAPE']

In [107]:
originatingPeices_v6 = originatingPeices_v5.join(weatherdf, 
                                                 left_on=['START_THE_CLOCK_DATE', 'OZIP3'], 
                                                 right_on=['date', 'zip3'])

In [108]:
originatingPeices_v6.columns

['UNIQUE_IDENTIFIER',
 'START_THE_CLOCK_DATE',
 'ORIGIN_FACILITY',
 'OZIP3',
 'ACTUAL_DLVRY_DATE',
 'EXPECTED_DELIVERY_DATE',
 'EXPECTED_DESTINATION_FACILITY',
 'DZIP3',
 'MAIL_CLASS',
 'MAIL_SHAPE',
 'PRCP',
 'SNOW',
 'TMAX',
 'TMIN']

In [109]:
originatingPeices_v6 = originatingPeices_v6.select(pl.col('UNIQUE_IDENTIFIER'),
                                                   pl.col('START_THE_CLOCK_DATE'),
                                                   pl.col('ORIGIN_FACILITY'),
                                                   pl.col('ACTUAL_DLVRY_DATE'),
                                                   pl.col('EXPECTED_DELIVERY_DATE'),
                                                   pl.col('EXPECTED_DESTINATION_FACILITY'),
                                                   pl.col('DZIP3'),
                                                   pl.col('PRCP').alias('o_PRCP'),
                                                   pl.col('SNOW').alias('o_SNOW'),
                                                   pl.col('TMAX').alias('o_TMAX'),
                                                   pl.col('TMIN').alias('o_TMIN'))

In [110]:
originatingPeices_v7 = originatingPeices_v6.join(weatherdf, 
                                                 left_on=['EXPECTED_DELIVERY_DATE', 'DZIP3'], 
                                                 right_on=['date', 'zip3'])

In [111]:
originatingPeices_v7.columns

['UNIQUE_IDENTIFIER',
 'START_THE_CLOCK_DATE',
 'ORIGIN_FACILITY',
 'ACTUAL_DLVRY_DATE',
 'EXPECTED_DELIVERY_DATE',
 'EXPECTED_DESTINATION_FACILITY',
 'DZIP3',
 'o_PRCP',
 'o_SNOW',
 'o_TMAX',
 'o_TMIN',
 'PRCP',
 'SNOW',
 'TMAX',
 'TMIN']

In [112]:
originatingPeices_v7 = originatingPeices_v7.select(pl.col('UNIQUE_IDENTIFIER'),
                                                   pl.col('START_THE_CLOCK_DATE'),
                                                   pl.col('ORIGIN_FACILITY'),
                                                   pl.col('ACTUAL_DLVRY_DATE'),
                                                   pl.col('EXPECTED_DELIVERY_DATE'),
                                                   pl.col('EXPECTED_DESTINATION_FACILITY'),
                                                   pl.col('DZIP3'),
                                                   pl.col('o_PRCP'),
                                                   pl.col('o_SNOW'),
                                                   pl.col('o_TMAX'),
                                                   pl.col('o_TMIN'),
                                                   pl.col('PRCP').alias('d_PRCP'),
                                                   pl.col('SNOW').alias('d_SNOW'),
                                                   pl.col('TMAX').alias('d_TMAX'),
                                                   pl.col('TMIN').alias('d_TMIN'))

In [113]:
originatingPeices_v7.write_parquet('../../Data/originatingMail_withWeather.parquet.gzip', compression='gzip')

In [114]:
originatingPeices_v7.shape

(1821507, 15)

In [115]:
test = pl.read_parquet('../../Data/originatingMail_withWeather.parquet.gzip')

In [117]:
test.shape

(1821507, 15)

In [118]:
test.head()

UNIQUE_IDENTIFIER,START_THE_CLOCK_DATE,ORIGIN_FACILITY,ACTUAL_DLVRY_DATE,EXPECTED_DELIVERY_DATE,EXPECTED_DESTINATION_FACILITY,DZIP3,o_PRCP,o_SNOW,o_TMAX,o_TMIN,d_PRCP,d_SNOW,d_TMAX,d_TMIN
i64,date,str,date,date,str,i64,f64,f64,f64,f64,f64,f64,f64,f64
286834011706370,2024-01-08,"""MUSIC CITY ANNEX - 1532174""",2024-01-10,2024-01-11,"""MUSIC CITY ANNEX - 1532174""",370,0.45,0.0,54.0,23.0,0.39,0.0,61.0,22.0
286834010262781,2024-01-08,"""MUSIC CITY ANNEX - 1532174""",2024-01-09,2024-01-11,"""MUSIC CITY ANNEX - 1532174""",370,0.45,0.0,54.0,23.0,0.39,0.0,61.0,22.0
286832830318347,2024-01-08,"""MUSIC CITY ANNEX - 1532174""",2024-01-10,2024-01-11,"""MUSIC CITY ANNEX - 1532174""",385,0.45,0.0,54.0,23.0,0.02,0.0,60.0,22.0
286832830321859,2024-01-08,"""MUSIC CITY ANNEX - 1532174""",2024-01-10,2024-01-11,"""MUSIC CITY ANNEX - 1532174""",372,0.45,0.0,54.0,23.0,0.0,0.0,59.0,30.0
286991153154247,2024-01-09,"""MUSIC CITY ANNEX - 1532174""",2024-01-11,2024-01-12,"""MUSIC CITY ANNEX - 1532174""",372,1.2,0.0,54.0,24.0,1.1,0.0,59.0,30.0
