## Sales/Discounts by country, segment
Make a timeline plot of sales activity.
Group by country or market segment.
Draw a line plot of cumulative sales by the selected country or market segment.


In [2]:
import polars as pl
import polars.selectors as cs
import os

In [3]:
# Fixed category sets used to cast the raw 'age', 'sex' and 'size' string
# columns to polars Enum dtypes when the CSV is loaded.
# NOTE(review): categories are kept in their existing order (AGE is listed
# alphabetically rather than by life stage) -- confirm before relying on
# Enum ordering for sorting or comparisons.
enum_AGE = pl.Enum(
    [
        'Adult',
        'Baby',
        'Senior',
        'Young',
    ]
)
enum_SEX = pl.Enum(
    [
        'Male',
        'Female',
        'Unknown',
    ]
)
enum_SIZE = pl.Enum(
    [
        'Small',
        'Medium',
        'Large',
        'Extra Large',
    ]
)

In [5]:
# Load the dog-description data set into `df`, preferring a cached parquet
# file over re-parsing the raw CSV. On a cache miss the CSV is cleaned,
# typed, filtered, and written back out as parquet for the next run.
root_file = 'allDogDescriptions'
parquet_path = root_file + '.parquet'
csv_path = root_file + '.csv'

if os.path.exists(parquet_path):
    # Cache hit: read the previously written parquet file as-is.
    print(f'{"*"*20} Reading {parquet_path}  {"*"*20}')
    df = pl.read_parquet(parquet_path)
else:
    # Cache miss: build the cleaned frame from the CSV, then cache it.
    print(f'{"*"*20} Reading {csv_path}  {"*"*20}')
    df = (
        # ignore_errors=True lets the reader continue past values it
        # cannot parse instead of aborting the whole load.
        pl.read_csv(csv_path, ignore_errors=True)
        .select(
            ID = pl.col('id').cast(pl.UInt32),
            ORG_ID = pl.col('org_id'),
            BREED_PRIMARY = pl.col('breed_primary'),
            BREED_MIXED = pl.col('breed_mixed'),
            # Categorical columns are cast to the Enum dtypes defined earlier.
            AGE = pl.col('age').cast(enum_AGE),
            SEX = pl.col('sex').cast(enum_SEX),
            SIZE = pl.col('size').cast(enum_SIZE),
            FIXED = pl.col('fixed'),
            HOUSE_TRAINED = pl.col('house_trained'),
            SHOTS_CURRENT = pl.col('shots_current'),
            # Trim surrounding whitespace and normalise capitalisation.
            NAME = pl.col('name').str.strip_chars().str.to_titlecase(),
            # 'posted' is split into separate DATE and TIME columns.
            DATE = pl.col('posted')   # regex for dates formatted as YYYY-MM-DD
                .str.extract(r'(\d{4}-\d{2}-\d{2})', group_index=1)
                .str.to_date('%Y-%m-%d'),
            TIME = pl.col('posted')   # regex for time formatted as HH:MM:SS
                .str.extract(r'(\d{2}:\d{2}:\d{2})', group_index=1)
                .str.to_time('%H:%M:%S'),
            CONTACT_CITY = pl.col('contact_city'),
            CONTACT_STATE = pl.col('contact_state'),
            # NOTE(review): an integer ZIP cannot keep leading zeros
            # (e.g. '02134') -- confirm that is acceptable downstream.
            CONTACT_ZIP = pl.col('contact_zip').cast(pl.UInt32),
        )
        .filter(  # regex to accept states comprised of 2 uppercase letters
            pl.col('CONTACT_STATE').str.contains(r'^[A-Z]{2}$')
        )
    )
    df.write_parquet(parquet_path)

# Quick sanity checks: overall shape and the column names in two halves.
print(df.shape)
print(df.columns[:8])
print(df.columns[8:])

# glimpse() prints the preview itself and returns None by default, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.sample(10).glimpse()

******************** Reading allDogDescriptions.parquet  ********************
(58147, 16)
['ID', 'ORG_ID', 'BREED_PRIMARY', 'BREED_MIXED', 'AGE', 'SEX', 'SIZE', 'FIXED']
['HOUSE_TRAINED', 'SHOTS_CURRENT', 'NAME', 'DATE', 'TIME', 'CONTACT_CITY', 'CONTACT_STATE', 'CONTACT_ZIP']
Rows: 10
Columns: 16
$ ID             <u32> 45931291, 45823525, 44361319, 45641425, 44518562, 39813087, 44349910, 45482007, 36709140, 45983715
$ ORG_ID         <str> 'NH74', 'GA66', 'TX920', 'NJ593', 'FL134', 'LA131', 'NC583', 'PA884', 'NM121', 'DC20'
$ BREED_PRIMARY  <str> 'Rottweiler', 'Mixed Breed', 'Great Pyrenees', 'Jack Russell Terrier', 'Dachshund', 'Labrador Retriever', 'Vizsla', 'Hound', 'Basset Hound', 'Shepherd'
$ BREED_MIXED   <bool> False, False, False, True, True, True, True, True, True, True
$ AGE           <enum> Adult, Young, Young, Adult, Young, Baby, Adult, Young, Senior, Baby
$ SEX           <enum> Female, Male, Female, Male, Male, Male, Male, Female, Female, Male
$ SIZE          <enum> Large, 