In [None]:
import polars as pl
import os

In [None]:
# Load the electricity data set into `df`.
#
# Fast path: read the cleaned frame from the parquet cache when it exists.
# Slow path: read the raw CSV, rename/cast the columns, filter the rows,
# and write the parquet cache for the next run.

# The cache path must be the same file that is written after cleaning,
# otherwise the fast path can never be taken.
parquet_data_source = 'df.parquet'
csv_data_source = 'europe_monthly_electricity.csv'
date_fmt = '%m/%d/%Y'

if os.path.exists(parquet_data_source):  # use pre-cleaned parquet file
    print(f'Reading data from {parquet_data_source}')
    df = pl.read_parquet(parquet_data_source)

else:  # read data from csv and clean
    print(f'Reading data from {csv_data_source}')

    # Parse the raw 'Date' strings once; all date-derived columns reuse
    # this single expression instead of repeating the parse.
    parsed_date = pl.col('Date').str.to_date(format=date_fmt)

    df = (
        pl.read_csv(csv_data_source)
        .select(
            COUNTRY=pl.col('Area'),
            ISO_3_CODE=pl.col('ISO 3 code'),
            YEAR=parsed_date.dt.year(),
            MONTH=parsed_date.dt.strftime("%b"),  # abbreviated month name
            MONTH_NUM=parsed_date.dt.month(),
            DATE=parsed_date,
            EU=pl.col('EU').cast(pl.Boolean),
            OECD=pl.col('OECD').cast(pl.Boolean),
            G20=pl.col('G20').cast(pl.Boolean),
            G7=pl.col('G7').cast(pl.Boolean),
            CAT=pl.col('Category').cast(pl.Categorical),
            SUBCAT=pl.col('Subcategory').cast(pl.Categorical),
            EMISSION=pl.col('Variable').cast(pl.Categorical),
            UNIT=pl.col('Unit').cast(pl.Categorical),
            VALUE=pl.col('Value'),
        )
        .drop_nulls(subset='ISO_3_CODE')  # drop rows with no ISO 3 code
        .filter(pl.col('YEAR') > 2014)  # data is sparse prior to 2015
    )

    # Persist the cleaned frame so the next run takes the fast path above.
    df.write_parquet(parquet_data_source)

df.write_excel('df.xlsx')
print(df.columns)



In [None]:
# Print the table of distinct values and their counts for every column of
# df, one column at a time.
for column_name in df.columns:
    frequency_table = df.get_column(column_name).value_counts()
    print(frequency_table)

#### select, rename, cast columns

In [None]:
# Scaffolding helper: print one `NEW_NAME = pl.col('old name'),` line per
# column of df, ready to paste into a `.select(...)` call. The new name is
# the column name upper-cased with spaces replaced by underscores. Each line
# ends with a commented-out `.cast(pl.String)` as a reminder that a cast can
# be added there.
for col in df.columns:
    new_name = col.upper().replace(' ', '_')
    select_string = f"{new_name} = pl.col('{col}'),  # .cast(pl.String)"
    print(select_string)