In [1]:
import polars as pl
pl.Config().set_tbl_rows(30)

polars.config.Config

In [41]:
df_budget = (
    pl.read_csv(
        'Budget_FY2023.csv',
        schema_overrides={  # top of these cols are integer-like
            'Div_Code': pl.String,
            'Unit_Code': pl.String,
            }
        )
    .select(  # dropped FY (all values are 2023)
        FUND_CODE = pl.col('Fund_Code').cast(pl.Int64),
        FUND_NAME = pl.col('Fund_Name'),
        DEPT_CODE = pl.col('Dept_Code'),
        DEPT = pl.col('Department'),
        DIV_CODE = pl.col('Div_Code'),
        DIV = pl.col('Division'),
        UNIT_CODE = pl.col('Unit_Code'),
        COST_CENTER = pl.col('Cost_Center'),
        TYPE_CODE = pl.col('Obj_Type_Code'),
        TYPE_NAME = pl.col('Obj_Type_Name'),
        CAT_CODE = pl.col('Obj_Cat_Code'),
        EXP_CAT = pl.col('Expenditure_Category'),
        CLASS_CODE = pl.col('Obj_Class_Code'),
        CLASS_NAME = pl.col('Obj_Class_Name'),
        CODE = pl.col('Obj_Code'),
        EXP_LINE_ITEM = pl.col('Expenditure_Line_Item'),
        REC_DATE = pl.col('Record_Date'),
        CHECK_NUM = pl.col('Check_Number'),
        DOC_ID = pl.col('Document_ID'),
        BUD_AMT = pl.col('Budgeted_Amount'),
        ACT_AMT = pl.col('Actual_Amount'),
    )
)
print(df_budget.columns)
print(df_budget.head(1))
print(df_budget.describe())

['FUND_CODE', 'FUND_NAME', 'DEPT_CODE', 'DEPT', 'DIV_CODE', 'DIV', 'UNIT_CODE', 'COST_CENTER', 'TYPE_CODE', 'TYPE_NAME', 'CAT_CODE', 'EXP_CAT', 'CLASS_CODE', 'CLASS_NAME', 'CODE', 'EXP_LINE_ITEM', 'REC_DATE', 'CHECK_NUM', 'DOC_ID', 'BUD_AMT', 'ACT_AMT']
shape: (1, 21)
┌───────────┬────────────┬───────────┬────────────┬───┬───────────┬────────────┬─────────┬─────────┐
│ FUND_CODE ┆ FUND_NAME  ┆ DEPT_CODE ┆ DEPT       ┆ … ┆ CHECK_NUM ┆ DOC_ID     ┆ BUD_AMT ┆ ACT_AMT │
│ ---       ┆ ---        ┆ ---       ┆ ---        ┆   ┆ ---       ┆ ---        ┆ ---     ┆ ---     │
│ i64       ┆ str        ┆ i64       ┆ str        ┆   ┆ str       ┆ str        ┆ i64     ┆ str     │
╞═══════════╪════════════╪═══════════╪════════════╪═══╪═══════════╪════════════╪═════════╪═════════╡
│ 1000      ┆ General    ┆ 10        ┆ Board Of   ┆ … ┆ null      ┆ WCBE,10,20 ┆ 308762  ┆ null    │
│           ┆ Fund       ┆           ┆ Commission ┆   ┆           ┆ 2306210000 ┆         ┆         │
│           ┆           

In [21]:
for col in df_budget.columns:
    print(f"{col.upper()} = pl.col('{col}'),")

FY = pl.col('FY'),
APD = pl.col('APD'),
FUND_CODE = pl.col('Fund_Code'),
FUND_NAME = pl.col('Fund_Name'),
DEPT_CODE = pl.col('Dept_Code'),
DEPARTMENT = pl.col('Department'),
DIV_CODE = pl.col('Div_Code'),
DIVISION = pl.col('Division'),
UNIT_CODE = pl.col('Unit_Code'),
COST_CENTER = pl.col('Cost_Center'),
OBJ_TYPE_CODE = pl.col('Obj_Type_Code'),
OBJ_TYPE_NAME = pl.col('Obj_Type_Name'),
OBJ_CAT_CODE = pl.col('Obj_Cat_Code'),
EXPENDITURE_CATEGORY = pl.col('Expenditure_Category'),
OBJ_CLASS_CODE = pl.col('Obj_Class_Code'),
OBJ_CLASS_NAME = pl.col('Obj_Class_Name'),
OBJ_CODE = pl.col('Obj_Code'),
EXPENDITURE_LINE_ITEM = pl.col('Expenditure_Line_Item'),
RECORD_DATE = pl.col('Record_Date'),
CHECK_NUMBER = pl.col('Check_Number'),
DOCUMENT_ID = pl.col('Document_ID'),
BUDGETED_AMOUNT = pl.col('Budgeted_Amount'),
ACTUAL_AMOUNT = pl.col('Actual_Amount'),


In [3]:
df_historical = (
    pl.read_csv('CPI-historical.csv')
    .select(
        COUNTRY = pl.col('Country / Territory'),
        ISO3 = pl.col('ISO3'), # no mods to ISO3 column
        YEAR = pl.col('Year'),
        REGION = pl.col('Region'),
        CPI_SCORE = pl.col('CPI score'),
        RANK= pl.col('Rank'),
        STD_ERR = pl.col('Standard error')
    )
    .with_columns(
        COUNTRY_MED = pl.col('CPI_SCORE').median().over('COUNTRY'),
        REGIONAL_MED = pl.col('CPI_SCORE').median().over('REGION'),
    )
)
print(df_historical.columns)
df_historical.filter(pl.col('COUNTRY') == 'Iran')

['COUNTRY', 'ISO3', 'YEAR', 'REGION', 'CPI_SCORE', 'RANK', 'STD_ERR', 'COUNTRY_MED', 'REGIONAL_MED']


COUNTRY,ISO3,YEAR,REGION,CPI_SCORE,RANK,STD_ERR,COUNTRY_MED,REGIONAL_MED
str,str,i64,str,i64,i64,f64,f64,f64
"""Iran""","""IRN""",2012,"""MENA""",28,133,4.6,26.0,39.0
"""Iran""","""IRN""",2013,"""MENA""",25,144,3.9,26.0,39.0
"""Iran""","""IRN""",2014,"""MENA""",27,136,4.72,26.0,39.0
"""Iran""","""IRN""",2015,"""MENA""",27,130,3.28,26.0,39.0
"""Iran""","""IRN""",2016,"""MENA""",29,131,2.47,26.0,39.0
"""Iran""","""IRN""",2017,"""MENA""",30,130,3.51,26.0,39.0
"""Iran""","""IRN""",2018,"""MENA""",28,138,2.89,26.0,39.0
"""Iran""","""IRN""",2019,"""MENA""",26,146,3.63,26.0,39.0
"""Iran""","""IRN""",2020,"""MENA""",25,149,2.56,26.0,39.0
"""Iran""","""IRN""",2021,"""MENA""",25,150,2.29,26.0,39.0


In [4]:
df_historical['REGION'].value_counts() #.sort('Year')

REGION,count
str,u32
"""MENA""",234
"""SSA""",630
"""AP""",388
"""AME""",410
"""ECA""",247
"""WE/EU""",403


In [14]:
(
    set(df_cpi['COUNTRY']) - 
    set(df_historical['COUNTRY'])
)

{'Korea, North', 'Korea, South', 'United States'}

In [15]:
(
    set(df_historical['COUNTRY']) - 
    set(df_cpi['COUNTRY'])
)

{'Brunei Darussalam',
 'North Korea',
 'Puerto Rico',
 'South Korea',
 'United States of America'}

In [6]:
[t for t in range(400, 1000, 100)]

[400, 500, 600, 700, 800, 900]

In [7]:
[t for t in range(1000, 11000, 1000)]

[1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000, 10000]

In [8]:
[t for t in range(400, 1000, 100)] +  [t for t in range(1000, 11000, 1000)]

[400,
 500,
 600,
 700,
 800,
 900,
 1000,
 2000,
 3000,
 4000,
 5000,
 6000,
 7000,
 8000,
 9000,
 10000]