In [1]:
from pyspark.sql import SparkSession

In [2]:
# Create (or reuse) the Spark session for this notebook, then switch on
# Arrow-based data exchange between Spark and pandas.
# NOTE(review): Arrow is presumably enabled to speed up the pandas-based
# whylogs profiling further down — confirm it is actually required.
arrow_config_key = "spark.sql.execution.arrow.pyspark.enabled"
spark = (
    SparkSession.builder
    .appName("whylogs-testing")
    .getOrCreate()
)
spark.conf.set(arrow_config_key, "true")

In [4]:
from pyspark import SparkFiles

# Remote location of the red-wine quality dataset (semicolon-delimited CSV).
data_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality/winequality-red.csv"
# Register the file with the Spark context; the next cell resolves the local
# copy by its file name via SparkFiles.get("winequality-red.csv").
spark.sparkContext.addFile(data_url)

In [5]:
# Read the CSV registered with addFile into a Spark DataFrame: fields are
# separated by ';', the first row supplies the column names, and column types
# are inferred from the data rather than defaulting to strings.
spark_dataframe = spark.read.csv(
    SparkFiles.get("winequality-red.csv"),
    sep=";",
    header=True,
    inferSchema=True,
)

In [8]:
# Sanity check: print only the first record, one field per line, so all twelve
# columns are readable without horizontal truncation.
spark_dataframe.show(n=1, vertical=True)

-RECORD 0----------------------
 fixed acidity        | 7.4    
 volatile acidity     | 0.7    
 citric acid          | 0.0    
 residual sugar       | 1.9    
 chlorides            | 0.076  
 free sulfur dioxide  | 11.0   
 total sulfur dioxide | 34.0   
 density              | 0.9978 
 pH                   | 3.51   
 sulphates            | 0.56   
 alcohol              | 9.4    
 quality              | 5      
only showing top 1 row



In [9]:
# Confirm the inferred column types (the saved output shows eleven double
# columns and an integer `quality` column).
spark_dataframe.printSchema()

root
 |-- fixed acidity: double (nullable = true)
 |-- volatile acidity: double (nullable = true)
 |-- citric acid: double (nullable = true)
 |-- residual sugar: double (nullable = true)
 |-- chlorides: double (nullable = true)
 |-- free sulfur dioxide: double (nullable = true)
 |-- total sulfur dioxide: double (nullable = true)
 |-- density: double (nullable = true)
 |-- pH: double (nullable = true)
 |-- sulphates: double (nullable = true)
 |-- alcohol: double (nullable = true)
 |-- quality: integer (nullable = true)



In [10]:
# Profile every column of the Spark DataFrame with whylogs. The result is a
# dict keyed by column name whose values are ColumnProfileView objects (see the
# printed dict in the next cell).
from whylogs.api.pyspark.experimental import collect_column_profile_views
column_views_dict = collect_column_profile_views(spark_dataframe)

In [11]:
# Dump the raw mapping of column name -> ColumnProfileView. The values print as
# default object reprs, so this is mainly useful for seeing which columns were
# profiled.
print(column_views_dict)

{'alcohol': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91c92d0>, 'chlorides': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91c8550>, 'citric acid': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91ca810>, 'density': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91ec650>, 'fixed acidity': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91ed890>, 'free sulfur dioxide': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91eec10>, 'pH': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91ee610>, 'quality': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91ca690>, 'residual sugar': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91f4950>, 'sulphates': <whylogs.core.view.column_profile_view.ColumnProfileView object at 0x7fa6b91f7a10>, 'total sulfur dioxide

In [12]:
# Cross-check: the row count recorded in the whylogs "counts" metric for the
# `density` column, shown next to the row count Spark reports for the frame.
(
    column_views_dict["density"].get_metric("counts").n.value,
    spark_dataframe.count(),
)

(1599, 1599)

In [13]:
# Mean of the `density` column as tracked by the whylogs "distribution" metric;
# compared against Spark's own aggregate in the following cell.
column_views_dict["density"].get_metric("distribution").mean.value

0.9967466791744841

In [15]:
# Compute the same mean directly in Spark as a reference value. In the saved
# outputs it differs from the whylogs figure only in the last two digits.
from pyspark.sql.functions import mean
spark_dataframe.select(mean("density")).show()

+------------------+
|      avg(density)|
+------------------+
|0.9967466791744831|
+------------------+



In [16]:
# Build a single dataset-level profile view covering all columns of the Spark
# DataFrame; it is inspected via to_pandas() and persisted via write() below.
from whylogs.api.pyspark.experimental import collect_dataset_profile_view
dataset_profile_view = collect_dataset_profile_view(input_df=spark_dataframe)

In [17]:
# Show the first rows of the profile as a pandas DataFrame: one row per
# profiled column, one column per metric component.
# NOTE(review): `pd` is not referenced in this cell — confirm whether a later
# cell needs the import.
import pandas as pd
dataset_profile_view.to_pandas().head()

Unnamed: 0_level_0,cardinality/est,cardinality/lower_1,cardinality/upper_1,counts/inf,counts/n,counts/nan,counts/null,counts/true,distribution/max,distribution/mean,...,type,types/boolean,types/fractional,types/integral,types/object,types/string,types/tensor,frequent_items/frequent_strings,ints/max,ints/min
column,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alcohol,65.00001,65.0,65.003256,0,1599,0,0,0,14.9,10.422983,...,SummaryType.COLUMN,0,1599,0,0,0,0,,,
chlorides,153.000058,153.0,153.007697,0,1599,0,0,0,0.611,0.087467,...,SummaryType.COLUMN,0,1599,0,0,0,0,,,
citric acid,80.000016,80.0,80.00401,0,1599,0,0,0,1.0,0.270976,...,SummaryType.COLUMN,0,1599,0,0,0,0,,,
density,439.557368,433.943761,445.310933,0,1599,0,0,0,1.00369,0.996747,...,SummaryType.COLUMN,0,1599,0,0,0,0,,,
fixed acidity,96.000023,96.0,96.004816,0,1599,0,0,0,15.9,8.319637,...,SummaryType.COLUMN,0,1599,0,0,0,0,,,


In [19]:
# Persist the profile to a local file; the saved output shows the call
# returning a (success flag, path) tuple.
# NOTE(review): the file is presumably whylogs' serialized profile format, not
# real CSV, so the ".csv" extension may mislead — confirm before renaming.
# NOTE(review): the source line echoed in the saved output looks like the tail
# of a warning; check whether write() is deprecated in the installed version.
dataset_profile_view.write(path="wine_profile.csv")

  dataset_profile_view.write(path="wine_profile.csv")


(True, 'wine_profile.csv')