## Basic Check

In [19]:
import polars as pl

# Open the parquet file as a LazyFrame; queries below only run on .collect().
lf = pl.scan_parquet("arcos_filtered_2006_2015.parquet")

# Total row count. pl.len() replaces pl.count(), which has been deprecated
# since polars 0.20.5 and emitted a DeprecationWarning in this cell.
n_rows = lf.select(pl.len()).collect().item()
print("Rows:", n_rows)

# Year range check
lf.select([
    pl.col("year").min().alias("min_year"),
    pl.col("year").max().alias("max_year"),
]).collect()


(Deprecated in version 0.20.5)
  n_rows = lf.select(pl.count()).collect()[0, 0]


Rows: 218477461


min_year,max_year
i32,i32
2006,2015


In [6]:
import polars as pl

# Read a bounded slice of the file for quick, in-memory checks.
# n_rows stops reading after that many rows, so this is the head of the
# file rather than a random sample.
sample = pl.read_parquet("arcos_filtered_2006_2015.parquet", n_rows=10_000_000)

# Show the (rows, columns) shape together with the first five rows.
(sample.shape, sample.head())

((10000000, 11),
 shape: (5, 11)
 ┌────────────┬────────────┬────────────┬────────────┬───┬────────────┬───────────┬──────────┬──────┐
 │ BUYER_BUS_ ┆ BUYER_STAT ┆ BUYER_COUN ┆ DRUG_NAME  ┆ … ┆ CALC_BASE_ ┆ DOSAGE_UN ┆ MME      ┆ year │
 │ ACT        ┆ E          ┆ TY         ┆ ---        ┆   ┆ WT_IN_GM   ┆ IT        ┆ ---      ┆ ---  │
 │ ---        ┆ ---        ┆ ---        ┆ str        ┆   ┆ ---        ┆ ---       ┆ f64      ┆ i32  │
 │ str        ┆ str        ┆ str        ┆            ┆   ┆ f64        ┆ f64       ┆          ┆      │
 ╞════════════╪════════════╪════════════╪════════════╪═══╪════════════╪═══════════╪══════════╪══════╡
 │ ANALYTICAL ┆ CA         ┆ ALAMEDA    ┆ METHADONE  ┆ … ┆ 0.1529424  ┆ 0.0       ┆ 458.8272 ┆ 2015 │
 │ LAB        ┆            ┆            ┆            ┆   ┆            ┆           ┆          ┆      │
 │ ANALYTICAL ┆ CA         ┆ ALAMEDA    ┆ MEPERIDINE ┆ … ┆ 0.392175   ┆ 0.0       ┆ 39.2175  ┆ 2015 │
 │ LAB        ┆            ┆            ┆        

In [7]:
# Type check: show each column's name and dtype in the loaded sample.
# NOTE(review): the output below lists TRANSACTION_DATE as String — confirm
# whether it should be parsed to a date type before date-based analysis.
sample.schema

Schema([('BUYER_BUS_ACT', String),
        ('BUYER_STATE', String),
        ('BUYER_COUNTY', String),
        ('DRUG_NAME', String),
        ('MME_Conversion_Factor', Float64),
        ('TRANSACTION_DATE', String),
        ('Reporter_family', String),
        ('CALC_BASE_WT_IN_GM', Float64),
        ('DOSAGE_UNIT', Float64),
        ('MME', Float64),
        ('year', Int32)])

In [8]:
# Missing-value check: number of nulls in each column.
# This runs on `sample` (the rows loaded above), not on the full file.
sample.null_count()

BUYER_BUS_ACT,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,Reporter_family,CALC_BASE_WT_IN_GM,DOSAGE_UNIT,MME,year
u32,u32,u32,u32,u32,u32,u32,u32,u32,u32,u32
0,0,0,0,0,0,0,0,0,0,0


In [10]:
# Unreasonable rows check: flag rows whose MME or DOSAGE_UNIT is zero or negative.
nonpositive_mme = pl.col("MME") <= 0
nonpositive_dosage = pl.col("DOSAGE_UNIT") <= 0

# Keep only the first 10 offending rows for inspection.
bad_rows = sample.filter(nonpositive_mme | nonpositive_dosage).head(10)

bad_rows

BUYER_BUS_ACT,BUYER_STATE,BUYER_COUNTY,DRUG_NAME,MME_Conversion_Factor,TRANSACTION_DATE,Reporter_family,CALC_BASE_WT_IN_GM,DOSAGE_UNIT,MME,year
str,str,str,str,f64,str,str,f64,f64,f64,i32
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""METHADONE""",3.0,"""2015-06-02""","""Exp Pharmaceutical Services Co…",0.1529424,0.0,458.8272,2015
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""MEPERIDINE""",0.1,"""2015-06-02""","""Exp Pharmaceutical Services Co…",0.392175,0.0,39.2175,2015
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""FENTANYL""",100.0,"""2015-06-02""","""Exp Pharmaceutical Services Co…",0.0001,0.0,10.0,2015
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""HYDROMORPHONE""",4.0,"""2015-06-02""","""Exp Pharmaceutical Services Co…",0.8866,0.0,3546.4,2015
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""HYDROMORPHONE""",4.0,"""2015-06-02""","""Exp Pharmaceutical Services Co…",0.0053196,0.0,21.2784,2015
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""FENTANYL""",100.0,"""2015-06-02""","""Exp Pharmaceutical Services Co…",6e-05,0.0,6.0,2015
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""FENTANYL""",100.0,"""2015-06-02""","""Exp Pharmaceutical Services Co…",4e-05,0.0,4.0,2015
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""FENTANYL""",100.0,"""2015-06-02""","""Exp Pharmaceutical Services Co…",0.001,0.0,100.0,2015
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""FENTANYL""",100.0,"""2015-06-02""","""Exp Pharmaceutical Services Co…",0.0025,0.0,250.0,2015
"""ANALYTICAL LAB""","""CA""","""ALAMEDA""","""MORPHINE""",1.0,"""2015-06-02""","""Exp Pharmaceutical Services Co…",0.3384,0.0,338.4,2015
