In [2]:
!pip install polars

Collecting polars
  Obtaining dependency information for polars from https://files.pythonhosted.org/packages/f6/c7/412912cc735bec03de751e506c3380ae393032f2e786e2a93d160acbf1dd/polars-0.20.6-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata
  Using cached polars-0.20.6-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Using cached polars-0.20.6-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (26.8 MB)
Installing collected packages: polars
Successfully installed polars-0.20.6


In [3]:
import polars as pl

def feature_engineering(df: "pl.DataFrame") -> "pl.DataFrame":
    """Clean the raw loan columns and append derived ratio features.

    The input frame must contain the columns read below:
    ``DisbursementDate``, ``DisbursementGross``, ``GrAppv``, ``SBA_Appv``
    (all strings), ``Sector``, ``RevLineCr``, ``LowDoc``, ``ApprovalFY``,
    ``State``, ``BankState``, ``Term``, ``NoEmp``, ``CreateJob`` and
    ``RetainedJob``.

    Steps, in order:

    1. Parse ``DisbursementDate`` (``%d-%b-%y``) and derive
       ``DisbursementYear`` / ``DisbursementMonth``.
    2. Remap selected ``Sector`` codes, null out selected ``RevLineCr`` /
       ``LowDoc`` values, convert the currency strings to ``Float64``, and add
       ``FY_Diff``, ``State_is_BankState`` and ``NullCount``.
    3. Add ``MonthlyRepayment`` and the ``Gross/...`` ratio features, then
       drop ``DisbursementDate``.

    Args:
        df: Frame holding the raw columns listed above.

    Returns:
        A new frame with the cleaned columns and the added features, without
        ``DisbursementDate``.
    """
    code_dict = {32: 31, 33: 31, 45: 44, 49: 48}
    revline_dict = {'0': None, 'T': None}
    lowdoc_dict = {'C': None, '0': None, 'S': None, 'A': None}
    money_cols = ["DisbursementGross", "GrAppv", "SBA_Appv"]

    # Build the date-parsing expression once and reuse it for all three outputs.
    disbursed = pl.col('DisbursementDate').str.strptime(pl.Date, '%d-%b-%y')
    df = df.with_columns([
        disbursed.alias('DisbursementDate'),
        disbursed.dt.year().alias('DisbursementYear'),
        disbursed.dt.month().alias('DisbursementMonth'),
    ])

    df = df.with_columns([
        pl.col("Sector").replace(code_dict),
        pl.col("RevLineCr").replace(revline_dict),
        pl.col("LowDoc").replace(lowdoc_dict),
        *[
            # Strip "$" and "," before casting the currency strings to floats.
            pl.col(col).str.replace_all('[$,]', '').str.strip_chars().cast(pl.Float64).alias(col)
            for col in money_cols
        ],
        # The subtraction must be parenthesised before aliasing: otherwise the
        # alias binds to the right operand only, the result keeps the name
        # "ApprovalFY", and that column is silently overwritten.
        (
            pl.col("ApprovalFY").cast(pl.Int64) - pl.col("DisbursementYear").cast(pl.Int64)
        ).alias("FY_Diff"),
        (pl.col("State") == pl.col("BankState")).cast(pl.UInt8).alias("State_is_BankState"),
        # Per-row count of null cells. All expressions in one with_columns call
        # are evaluated against the frame as it enters this step, so nulls
        # introduced by the RevLineCr / LowDoc replacements above are not
        # included in this count.
        sum([pl.col(col).is_null().cast(pl.Int32) for col in df.columns]).alias("NullCount"),
    ])

    gross = pl.col('DisbursementGross')
    gr_appv = pl.col('GrAppv')
    sba_appv = pl.col('SBA_Appv')
    # Treat a zero Term as 1 in every Term denominator so the ratios stay
    # finite (the same guard MonthlyRepayment has always used).
    safe_term = pl.when(pl.col("Term") == 0).then(1).otherwise(pl.col("Term"))

    # The "1 +" in each head-count denominator avoids division by zero.
    gross_per_emp = gross / (1 + pl.col('NoEmp'))
    gross_per_emp_created = gross / (1 + pl.col('NoEmp') + pl.col('CreateJob'))
    gross_per_retained_created = gross / (1 + pl.col('RetainedJob') + pl.col('CreateJob'))

    df = df.with_columns([
        (gr_appv / safe_term).alias("MonthlyRepayment"),
        gross_per_emp.alias('Gross/Emp'),
        (gross / safe_term).alias('Gross/Term'),
        (gross_per_emp / safe_term).alias('Gross/Emp/Term'),
        gross_per_emp_created.alias('Gross/(Emp&CreateJob)'),
        (gross_per_emp_created / safe_term).alias('Gross/(Emp&CreateJob)/Term'),
        gross_per_retained_created.alias('Gross/(RetainedJob&CreateJob)'),
        (gross_per_retained_created / safe_term).alias('Gross/(RetainedJob&CreateJob)/Term'),
        (gross / gr_appv).alias('Gross/GrAppv'),
        (gross / sba_appv).alias('Gross/SBA_Appv'),
        ((gr_appv / sba_appv) - 1).alias('GrAppv/SBA_Appv'),
    ]).drop('DisbursementDate')

    return df