In [3]:
# Import libraries
from typing import Any
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
from whylogs.api.pyspark.experimental import collect_column_profile_views
from whylogs.api.pyspark.experimental import collect_dataset_profile_view
from whylogs.core.metrics.condition_count_metric import Condition
from whylogs.core.relations import Predicate
from whylogs.core.schema import DeclarativeSchema
from whylogs.core.resolvers import STANDARD_RESOLVER
from whylogs.core.specialized_resolvers import ConditionCountMetricSpec
from whylogs.core.constraints.factories import condition_meets
from whylogs.core.constraints import ConstraintsBuilder
from whylogs.core.constraints.factories import no_missing_values
from whylogs.core.constraints.factories import greater_than_number
from whylogs.viz import NotebookProfileVisualizer
import pandas as pd
import datetime

In [4]:
# Start a Spark session named "whylogs", or reuse one that is already running.
spark = (
    SparkSession.builder
    .appName("whylogs")
    .getOrCreate()
)
# Turn on the Arrow-based execution setting for PySpark on this session.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")

In [5]:
# Load the patient CSV: the first row is the header and column types are inferred.
df = spark.read.csv("patient_data.csv", header=True, inferSchema=True)
# Show the inferred column names and types.
df.printSchema()

root
 |-- patient_id: integer (nullable = true)
 |-- patient_name: string (nullable = true)
 |-- height: integer (nullable = true)
 |-- weight: integer (nullable = true)
 |-- visit_date: string (nullable = true)



In [6]:
# Preview the first two records, one field per line.
# NOTE(review): the output includes patient names — confirm the data is synthetic before sharing.
df.show(2, vertical=True)

-RECORD 0---------------------
 patient_id   | 8286975       
 patient_name | Jane Davis    
 height       | 170           
 weight       | 97            
 visit_date   | 2023-04-19    
-RECORD 1---------------------
 patient_id   | 2130375       
 patient_name | Michael Brown 
 height       | 150           
 weight       | 62            
 visit_date   | 2021-11-30    
only showing top 2 rows



In [8]:
# Profile the Spark DataFrame with whylogs; the result is keyed by column name
# and can be indexed as df_profile["<column>"].
df_profile = collect_column_profile_views(input_df=df)
print(df_profile)

{'height': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fba3560c890>, 'patient_id': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fba355f7450>, 'patient_name': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fb9d0bf9950>, 'visit_date': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fba355fd490>, 'weight': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fba3560e4d0>}


In [12]:
# Average height as tracked by the whylogs "distribution" metric.
height_distribution = df_profile["height"].get_metric("distribution")
height_distribution.mean.value

174.98855

In [13]:
# Average height computed directly by Spark, to compare with the whylogs profile value.
df.agg(F.mean("height")).show()

+-----------+
|avg(height)|
+-----------+
|  174.98855|
+-----------+



In [14]:
# Build one dataset-level profile view for the whole DataFrame.
df_profile_view = collect_dataset_profile_view(df)
# Convert the view to a pandas summary and show its first rows.
profile_summary = df_profile_view.to_pandas()
profile_summary.head()

Unnamed: 0_level_0,cardinality/est,cardinality/lower_1,cardinality/upper_1,counts/inf,counts/n,counts/nan,counts/null,counts/true,distribution/max,distribution/mean,...,frequent_items/frequent_strings,ints/max,ints/min,type,types/boolean,types/fractional,types/integral,types/object,types/string,types/tensor
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
height,51.000006,51.0,51.002553,0,100000,0,0,0,200.0,174.9886,...,"[FrequentItem(value='174', est=2203, upper=220...",200.0,150.0,SummaryType.COLUMN,0,0,100000,0,0,0
patient_id,9624.479972,9471.462071,9782.831357,0,100000,0,0,0,9998201.0,5441664.0,...,[],9998201.0,1000595.0,SummaryType.COLUMN,0,0,100000,0,0,0
patient_name,100.000025,100.0,100.005018,0,100000,0,0,0,,0.0,...,"[FrequentItem(value='Robert Williams', est=125...",,,SummaryType.COLUMN,0,0,0,0,100000,0
visit_date,2230.288911,2194.829943,2266.983812,0,100000,0,0,0,,0.0,...,[],,,SummaryType.COLUMN,0,0,0,0,100000,0
weight,56.000008,56.0,56.002804,0,100000,0,0,0,102.0,73.60038,...,"[FrequentItem(value='55', est=2107, upper=2107...",102.0,0.0,SummaryType.COLUMN,0,0,100000,0,0,0


In [15]:
# Persist the profile summary as a CSV file.
# reset_index() turns the "column" index into a regular field. Passing
# index=False stops pandas from also writing the leftover 0..N-1 row index
# as an extra unnamed first column.
df_profile_view.to_pandas().reset_index().to_csv("patient_profile.csv", index=False)