In [None]:
import polars as pl
import polars.selectors as cs
import os


In [None]:
#----- FUNCTIONS ---------------------------------------------------------------
def _answers_as_list(question_tag):
    """Build an expression that merges one multi-select question into a list.

    Every column whose name starts with ``question_tag`` is concatenated into
    a single list column, and null entries are dropped from each list.
    """
    return pl.concat_list(cs.starts_with(question_tag)).list.drop_nulls()


def _inverse_count(count_name):
    """Build an expression for 1/count, yielding 0 where the count is 0."""
    count = pl.col(count_name)
    return pl.when(count > 0).then(1 / count).otherwise(pl.lit(0))


def read_and_clean_csv(csv_path='dataset.csv'):
    """Read the survey csv and return a cleaned, exploded polars DataFrame.

    Columns are picked by the bracketed question tag their name starts with
    (e.g. '[1]', '[12]') and renamed to short upper-case names. Multi-select
    questions (several columns sharing one tag) are merged into list columns;
    for each of them a ``*_COUNT`` column (list length) and a ``*_WT`` column
    (1/count, or 0 for an empty list) are added. Rows with a null GENDER are
    dropped, a 1-based INDEX is assigned per remaining row, and finally every
    list column is exploded in turn, so one input row becomes one output row
    per combination of its list values.

    Args:
        csv_path: Path of the csv file to read. Defaults to 'dataset.csv'.

    Returns:
        A collected ``pl.DataFrame`` with all string columns cast to
        Categorical.
    """
    print('reading and cleaning csv file')

    # Long survey answer text -> short label used in the cleaned data.
    education_labels = {
        'I’ve never completed any formal education': 'None',
        'Primary / elementary school': 'Elementary',
        (
            'Secondary school (e.g. American high school, '
            'German Realschule or Gymnasium, etc.)'
        ): 'High School',
        (
            'Some college or university study '
            'without earning a bachelor’s degree'
        ): 'Some College',
        'Bachelor’s degree (BA, BS, B.Eng., etc.)': 'Bachelors',
        'Master’s degree (MA, MS, M.Eng., MBA, etc.)': 'Masters',
        'Doctoral degree (Ph.D, Ed.D., etc.)': 'Doctorate',
        'Professional degree (JD, MD, etc.)': 'Professional',
    }

    # List-valued columns, in the order they are emitted and exploded.
    multi_answer_cols = [
        'JOB_ROLE', 'WORK_LANG', 'CS_LANGS', 'AI_ASST', 'AI_FEATURES',
    ]

    # Final column order: scalar columns first, then each list column
    # followed by its count and weight.
    output_cols = [
        'COUNTRY', 'GENDER', 'AGE_RANGE',
        'EDUCATION', 'JOB_LEVEL', 'YEARS_EXPERIENCE',
    ]
    for name in multi_answer_cols:
        output_cols += [name, f'{name}_COUNT', f'{name}_WT']

    lf = (
        pl.scan_csv(csv_path)
        .select(
            COUNTRY=cs.starts_with('[1]').cast(pl.Categorical()),
            GENDER=cs.starts_with('[2]').cast(pl.Categorical()),
            AGE_RANGE=cs.starts_with('[3]')
                .replace('60 or older', '60+')
                .cast(pl.Categorical()),
            WORK_LANG=_answers_as_list('[9]'),
            EDUCATION=cs.starts_with('[12]')
                .replace(education_labels)
                .cast(pl.Categorical()),
            JOB_ROLE=_answers_as_list('[23]'),
            JOB_LEVEL=cs.starts_with('[24]').cast(pl.Categorical()),
            YEARS_EXPERIENCE=cs.starts_with('[25]')
                .str.replace(' years', '')
                .str.replace(' year', '')
                .str.replace('Less than 1', '<1')
                # Accept a straight or a typographic apostrophe: the other
                # survey strings in this file use ’, so the raw data may too.
                # NOTE(review): confirm which form the csv actually contains.
                .str.replace(
                    "I don['’]t have any professional coding experience", '0'
                ),
            CS_LANGS=_answers_as_list('[44]'),
            AI_ASST=_answers_as_list('[62]'),
            AI_FEATURES=_answers_as_list('[63]'),
        )
        .with_columns(
            [
                pl.col(name).list.len().alias(f'{name}_COUNT')
                for name in multi_answer_cols
            ]
        )
        .with_columns(
            [
                _inverse_count(f'{name}_COUNT').alias(f'{name}_WT')
                for name in multi_answer_cols
            ]
        )
        .drop_nulls('GENDER')
        .select(output_cols)
        .with_columns(cs.float().cast(pl.Float32))
        # Only the *_COUNT columns are integers at this point; INDEX is added
        # afterwards so it is not narrowed to UInt8.
        # NOTE(review): UInt8 assumes no list has more than 255 entries.
        .with_columns(cs.integer().cast(pl.UInt8))
        .with_row_index(name='INDEX', offset=1)
        # NOTE(review): UInt16 assumes at most 65535 rows remain — confirm.
        .with_columns(pl.col('INDEX').cast(pl.UInt16))
    )

    # Explode one list column after another (row count multiplies).
    for name in multi_answer_cols:
        lf = lf.explode(name)

    return (
        lf
        .with_columns(cs.string().cast(pl.Categorical()))
        .collect()  # convert Lazy Frame to Data Frame
    )

In [None]:
#----- GATHER AND CLEAN DATA ---------------------------------------------------

# Cached copy of the cleaned frame; reading it skips the csv cleaning step.
PARQUET_PATH = 'df.parquet'
# Set to True to ignore an existing cache and rebuild it from the csv,
# e.g. after editing read_and_clean_csv().
FORCE_REBUILD = False

if os.path.exists(PARQUET_PATH) and not FORCE_REBUILD:
    # read parquet file if it exists
    print('reading data from parquet file')
    df = pl.read_parquet(PARQUET_PATH)
else:
    # read csv file, clean, save df as parquet
    df = read_and_clean_csv()
    print(df)
    df.write_parquet(PARQUET_PATH)

In [None]:
# Quick structural overview of the loaded frame.
print(f'{len(df.columns) = }')
print(f'{df.columns = }')
print(f'{df.shape = }')

# Preview a random sample of rows. The seed keeps the output identical across
# runs, and the size is capped so a frame with fewer than 10 rows still works.
SAMPLE_SEED = 42
df.sample(min(10, df.height), seed=SAMPLE_SEED).glimpse()

In [None]:
# How many rows of df carry each GENDER value.
df.get_column('GENDER').value_counts()

In [None]:
# Show a compact per-column preview of the whole frame.
df.glimpse()