In [10]:
import polars as pl
pl.Config().set_table_rows(20)
import polars.selectors as cs
import os

if 'df.parquet' not in os.listdir():
    print('reading data from csv file')
    #----- SETUP ENUMERATED TYPES --------------------------------------------------
    product_enum = pl.Enum(
        ['Book', 'Headphones', 'Jeans', 'Laptop', 'Refrigerator', 'Running Shoes', 
        'Smartphone', 'Smartwatch', 'T-Shirt', 'Washing Machine']
    )
    category_enum =  pl.Enum(
        ['Books', 'Clothing', 'Electronics', 'Footwear', 'Home Appliances']
    )
    customer_enum = pl.Enum(
        ['Chris White', 'Daniel Harris', 'David Lee', 'Emily Johnson', 'Emma Clark', 
        'Jane Smith', 'John Doe', 'Michael Brown', 'Olivia Wilson', 'Sophia Miller']
    )
    location_enum = pl.Enum(
        ['Boston', 'Chicago', 'Dallas', 'Denver', 'Houston', 'Los Angeles', 'Miami', 
        'New York', 'San Francisco', 'Seattle']
    )
    payment_method_enum = pl.Enum(
        ['Amazon Pay', 'Credit Card', 'Debit Card', 'Gift Card', 'PayPal']
    )
    status_enum=pl.Enum(['Cancelled', 'Completed', 'Pending'])

    #----- LOAD AND CLEAN THE DATASET
    df  = (
        pl.read_csv(
            'amazon_sales_data.csv',
        )
        .rename(  # upper case all column names, replace spaces with underscores
            lambda c: 
                c.upper()            # column names to upper case
                .replace(' ', '_')   # blanks replaced with underscores
        )
        .with_columns(      # clean up DATE column and convert to pl.Date
            DATE = pl.col('DATE')
                .str.replace_all('-', '/')
                .str.replace_all('/25', '/2025')
                .str.to_date(format='%d/%m/%Y')
        )
        .rename({
            'ORDER_ID'           : 'ID',
            'CUSTOMER_NAME'      : 'CUSTOMER',
            'TOTAL_SALES'        : 'TOTAL',
            'CUSTOMER_LOCATION'  : 'LOCATION',
            'PAYMENT_METHOD'     : 'METHOD'
        })
        .with_columns(
            pl.col(['PRICE','TOTAL']).cast(pl.UInt16),
            pl.col('ID').str.slice(3).cast(pl.UInt8),
            pl.col('PRODUCT').cast(product_enum),
            pl.col('CATEGORY').cast(category_enum),       
            pl.col('CUSTOMER').cast(customer_enum),
            pl.col('LOCATION').cast(location_enum),
            pl.col('METHOD').cast(payment_method_enum),
            pl.col('STATUS').cast(status_enum),
            pl.col('QUANTITY').cast(pl.UInt8),
        )    
    )
    df.glimpse()
    df.write_parquet('df.parquet')
else:
    print('reading data from parquet file')
    df = pl.read_parquet('df.parquet')

print(df)

AttributeError: 'Config' object has no attribute 'set_table_rows'

In [6]:
for c in df.columns:
    # print(df[c].value_counts())
    print(c, df.unique(c).sort(c).get_column(c).to_list())

ID [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221,

In [12]:
df.unique(['CUSTOMER', 'LOCATION']) 

ID,DATE,PRODUCT,CATEGORY,PRICE,QUANTITY,TOTAL,CUSTOMER,LOCATION,METHOD,STATUS
u8,date,enum,enum,u16,u8,u16,enum,enum,enum,enum
44,2025-03-24,"""Smartwatch""","""Electronics""",150,1,150,"""Chris White""","""Houston""","""Debit Card""","""Pending"""
57,2025-03-15,"""Smartphone""","""Electronics""",500,1,500,"""Jane Smith""","""Los Angeles""","""Debit Card""","""Cancelled"""
11,2025-02-17,"""Book""","""Books""",15,2,30,"""David Lee""","""Boston""","""Amazon Pay""","""Pending"""
129,2025-02-26,"""Jeans""","""Clothing""",40,5,200,"""Emily Johnson""","""New York""","""Debit Card""","""Pending"""
119,2025-03-16,"""Smartphone""","""Electronics""",500,2,1000,"""Chris White""","""Chicago""","""PayPal""","""Pending"""
…,…,…,…,…,…,…,…,…,…,…
56,2025-03-19,"""Smartwatch""","""Electronics""",150,2,300,"""Emma Clark""","""Dallas""","""Credit Card""","""Completed"""
112,2025-03-06,"""Washing Machine""","""Home Appliances""",600,2,1200,"""David Lee""","""Dallas""","""Gift Card""","""Cancelled"""
25,2025-03-02,"""Book""","""Books""",15,5,75,"""Sophia Miller""","""Seattle""","""Amazon Pay""","""Completed"""
72,2025-03-07,"""Laptop""","""Electronics""",800,3,2400,"""Daniel Harris""","""Houston""","""Credit Card""","""Pending"""
