In [21]:
import polars as pl
import os

In [None]:
# Load the cleaned electricity table into `df`.
#
# If a previously cleaned parquet cache exists it is read directly; otherwise
# the raw CSV is read, reduced to the renamed/cast columns below, filtered,
# and then written to the parquet cache so later runs can skip the cleaning.
parquet_data_source = 'df.parquet'
csv_data_source = 'europe_monthly_electricity.csv'

if os.path.exists(parquet_data_source):  # use pre-cleaned parquet file
    print(f'Reading data from {parquet_data_source}')
    df = pl.read_parquet(parquet_data_source)

else:  # read data from csv and clean
    print(f'Reading data from {csv_data_source}')
    df = (
        pl.read_csv(
            csv_data_source,
            try_parse_dates=True,  # lets 'Date' be used with the .dt namespace
        )
        .select(
            COUNTRY=pl.col('Area'),
            ISO_3_CODE=pl.col('ISO 3 code'),
            YEAR=pl.col('Date').dt.year(),  # only the year of each date is kept
            EU=pl.col('EU').cast(pl.Boolean),
            OECD=pl.col('OECD').cast(pl.Boolean),
            G20=pl.col('G20').cast(pl.Boolean),
            G7=pl.col('G7').cast(pl.Boolean),
            CAT=pl.col('Category').cast(pl.Categorical),
            SUBCAT=pl.col('Subcategory').cast(pl.Categorical),
            VARIABLE=pl.col('Variable').cast(pl.Categorical),
            UNIT=pl.col('Unit').cast(pl.Categorical),
            VALUE=pl.col('Value'),
        )
        # NOTE(review): presumably rows without an ISO code are aggregate
        # regions rather than single countries - confirm against the CSV.
        .drop_nulls(subset='ISO_3_CODE')
        .filter(pl.col('YEAR') > 2014)   # data is sparse prior to 2015
    )
    # Write the cache only when it was just rebuilt, and to the same path that
    # the existence check above looks at, so the two can never drift apart.
    df.write_parquet(parquet_data_source)

# The Excel export is refreshed on every run, whichever branch produced `df`.
df.write_excel('df.xlsx')



Reading data from df.parquet


In [23]:
# Print a frequency table (distinct value -> count) for each column of `df`,
# in column order, one table per column.
for column_name in df.columns:
    frequency_table = df.get_column(column_name).value_counts()
    print(frequency_table)

shape: (38, 2)
┌─────────────────┬───────┐
│ COUNTRY         ┆ count │
│ ---             ┆ ---   │
│ str             ┆ u32   │
╞═════════════════╪═══════╡
│ Malta           ┆ 4180  │
│ Spain           ┆ 7308  │
│ Estonia         ┆ 5418  │
│ Romania         ┆ 6174  │
│ Netherlands     ┆ 6930  │
│ …               ┆ …     │
│ Greece          ┆ 5418  │
│ North Macedonia ┆ 5390  │
│ Luxembourg      ┆ 3600  │
│ Cyprus          ┆ 5500  │
│ Kosovo          ┆ 1394  │
└─────────────────┴───────┘
shape: (38, 2)
┌────────────┬───────┐
│ ISO_3_CODE ┆ count │
│ ---        ┆ ---   │
│ str        ┆ u32   │
╞════════════╪═══════╡
│ NOR        ┆ 4641  │
│ IRL        ┆ 5704  │
│ BEL        ┆ 7308  │
│ FIN        ┆ 5304  │
│ XKX        ┆ 1394  │
│ …          ┆ …     │
│ CYP        ┆ 5500  │
│ MDA        ┆ 2576  │
│ GRC        ┆ 5418  │
│ SVN        ┆ 5586  │
│ BIH        ┆ 2979  │
└────────────┴───────┘
shape: (11, 2)
┌──────┬───────┐
│ YEAR ┆ count │
│ ---  ┆ ---   │
│ i32  ┆ u32   │
╞══════╪═══════╡
│ 2

#### select, rename, cast columns

In [24]:
# Scaffolding helper: for every column of `df`, print one line of the form
#   NAME = pl.col('name'), # .cast(pl.String)',
# where NAME is the column name upper-cased with spaces turned into
# underscores. The printed lines are meant to be pasted into a `.select(...)`
# call and then edited by hand.
# NOTE(review): the trailing `',` after `.cast(pl.String)` in the emitted text
# looks like a leftover typo; it lands inside a comment, so it is kept as-is
# to leave the printed output unchanged.
for col in df.columns:
    alias = col.upper().replace(' ', '_')
    select_string = (
        f"{alias} " +
        f"= pl.col('{col}'), # .cast(pl.String)',"
    )

    print(select_string)

COUNTRY = pl.col('COUNTRY'), # .cast(pl.String)',
ISO_3_CODE = pl.col('ISO_3_CODE'), # .cast(pl.String)',
YEAR = pl.col('YEAR'), # .cast(pl.String)',
EU = pl.col('EU'), # .cast(pl.String)',
OECD = pl.col('OECD'), # .cast(pl.String)',
G20 = pl.col('G20'), # .cast(pl.String)',
G7 = pl.col('G7'), # .cast(pl.String)',
CAT = pl.col('CAT'), # .cast(pl.String)',
SUBCAT = pl.col('SUBCAT'), # .cast(pl.String)',
VARIABLE = pl.col('VARIABLE'), # .cast(pl.String)',
UNIT = pl.col('UNIT'), # .cast(pl.String)',
VALUE = pl.col('VALUE'), # .cast(pl.String)',
