In [12]:
import polars as pl
import polars.selectors as cs
import os

In [23]:
# Data sources: a pre-cleaned parquet cache (preferred when present) and the
# raw CSV that is cleaned on the fly otherwise.
parquet_data_source = 'df.parquet'
csv_data_source = 'candy-data.csv'
date_fmt ='%m/%d/%Y'  # not referenced in this cell; kept in case later cells use it

if os.path.exists(parquet_data_source):  # use pre-cleaned parquet file
    print(f'Reading data from {parquet_data_source}')
    df = pl.read_parquet(parquet_data_source)
else:  # read data from csv and clean
    print(f'Reading data from {csv_data_source}')
    df = (
        pl.read_csv(csv_data_source)
        .rename(lambda c: c.upper())  # col names to upper case
        .rename({
            'SUGARPERCENT'     : 'SUGAR_PCT',
            'PRICEPERCENT'     : 'PRICE_PCT',
            'WINPERCENT'       : 'WIN_PCT',
            'PEANUTYALMONDY'   : 'PEANUT_ALMOND',
            'CRISPEDRICEWAFER' : 'CRISP_RICE_WAF',
        })
        # Downcast to the smallest dtypes used by this notebook.
        # NOTE(review): UInt8 assumes every integer column fits in 0-255
        # (the visible sample only shows 0/1 flags) - confirm for new data.
        .with_columns(cs.integer().cast(pl.UInt8))
        .with_columns(cs.float().cast(pl.Float32))
    )

# Uncomment to export the cleaned frame / (re)create the parquet cache:
# df.write_excel('df.xlsx')
# df.write_parquet(parquet_data_source)

# glimpse() prints its own summary and returns None by default, so it must not
# be wrapped in print() (that would emit a trailing "None").
df.glimpse()
print(df.columns)



Reading data from candy-data.csv
Rows: 85
Columns: 13
$ COMPETITORNAME <str> '100 Grand', '3 Musketeers', 'One dime', 'One quarter', 'Air Heads', 'Almond Joy', 'Baby Ruth', 'Boston Baked Beans', 'Candy Corn', 'Caramel Apple Pops'
$ CHOCOLATE       <u8> 1, 1, 0, 0, 0, 1, 1, 0, 0, 0
$ FRUITY          <u8> 0, 0, 0, 0, 1, 0, 0, 0, 0, 1
$ CARAMEL         <u8> 1, 0, 0, 0, 0, 0, 1, 0, 0, 1
$ PEANUT_ALMOND   <u8> 0, 0, 0, 0, 0, 1, 1, 1, 0, 0
$ NOUGAT          <u8> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0
$ CRISP_RICE_WAF  <u8> 1, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ HARD            <u8> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ BAR             <u8> 1, 1, 0, 0, 0, 1, 1, 0, 0, 0
$ PLURIBUS        <u8> 0, 0, 0, 0, 0, 0, 0, 1, 1, 0
$ SUGAR_PCT      <f32> 0.7319999933242798, 0.6039999723434448, 0.010999999940395355, 0.010999999940395355, 0.906000018119812, 0.4650000035762787, 0.6039999723434448, 0.31299999356269836, 0.906000018119812, 0.6039999723434448
$ PRICE_PCT      <f32> 0.8600000143051147, 0.5109999775886536, 0.115999996662

In [3]:
# Print the frequency table of every column, one Series at a time.
for series in df.get_columns():
    counts = series.value_counts()
    print(counts)

shape: (85, 2)
┌─────────────────────────────┬───────┐
│ competitorname              ┆ count │
│ ---                         ┆ ---   │
│ str                         ┆ u32   │
╞═════════════════════════════╪═══════╡
│ Fruit Chews                 ┆ 1     │
│ Tootsie Roll Midgies        ┆ 1     │
│ Skittles wildberry          ┆ 1     │
│ M&M's                       ┆ 1     │
│ Mr Good Bar                 ┆ 1     │
│ …                           ┆ …     │
│ Mike & Ike                  ┆ 1     │
│ Lifesavers big ring gummies ┆ 1     │
│ Snickers                    ┆ 1     │
│ Almond Joy                  ┆ 1     │
│ Peanut M&Ms                 ┆ 1     │
└─────────────────────────────┴───────┘
shape: (2, 2)
┌───────────┬───────┐
│ chocolate ┆ count │
│ ---       ┆ ---   │
│ i64       ┆ u32   │
╞═══════════╪═══════╡
│ 1         ┆ 37    │
│ 0         ┆ 48    │
└───────────┴───────┘
shape: (2, 2)
┌────────┬───────┐
│ fruity ┆ count │
│ ---    ┆ ---   │
│ i64    ┆ u32   │
╞════════╪═══════╡
│ 0   

In [5]:
# Peek at 10 randomly drawn rows (unseeded, so the rows differ on every run).
random_rows = df.sample(n=10)
random_rows.glimpse()

Rows: 10
Columns: 13
$ competitorname   <str> 'Sour Patch Tricksters', 'Caramel Apple Pops', 'Junior Mints', "Reese's Peanut Butter cup", 'Smarties candy', "Reese's Miniatures", "Werther's Original Caramel", 'Charleston Chew', 'Chewey Lemonhead Fruit Mix', 'Lifesavers big ring gummies'
$ chocolate        <i64> 0, 0, 1, 1, 0, 1, 0, 1, 0, 0
$ fruity           <i64> 1, 1, 0, 0, 1, 0, 0, 0, 1, 1
$ caramel          <i64> 0, 1, 0, 0, 0, 0, 1, 0, 0, 0
$ peanutyalmondy   <i64> 0, 0, 0, 1, 0, 1, 0, 0, 0, 0
$ nougat           <i64> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0
$ crispedricewafer <i64> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
$ hard             <i64> 0, 0, 0, 0, 1, 0, 1, 0, 0, 0
$ bar              <i64> 0, 0, 0, 0, 0, 0, 0, 1, 0, 0
$ pluribus         <i64> 1, 0, 1, 0, 1, 0, 0, 0, 1, 0
$ sugarpercent     <f64> 0.068999998, 0.60399997, 0.197, 0.72000003, 0.26699999, 0.034000002, 0.186, 0.60399997, 0.73199999, 0.26699999
$ pricepercent     <f64> 0.116, 0.32499999, 0.51099998, 0.65100002, 0.116, 0.27900001, 0.26

#### select, rename, cast columns

In [None]:
# Code-generation helper: for every column print one line of the form
#   NAME = pl.col('name'),  # .cast(pl.String)
# where NAME is the column name upper-cased with spaces replaced by
# underscores. The commented-out cast is a template to edit after pasting.
for col in df.columns:
    alias = col.upper().replace(' ', '_')
    select_string = f"{alias} = pl.col('{col}'),  # .cast(pl.String)"
    print(select_string)