In [4]:
# notebook parameters

import os

# --- Spark session settings (consumed when the SparkSession is built) ---
spark_master = "local[*]"    # passed to SparkSession.builder.master()
app_name = "data-summary"    # passed to SparkSession.builder.appName()
driver_memory = "8g"         # becomes the spark.driver.memory setting
executor_memory = "8g"       # becomes the spark.executor.memory setting

# --- Input / output locations ---
# The input path is assembled as "<input_prefix><input_file>.<input_kind>".
input_prefix = ""
input_file = "churn-etl"
input_kind = "parquet"
output_prefix = ""           # not referenced by the cells visible in this section

In [2]:
import pyspark

# Build (or reuse) a SparkSession configured from the notebook parameters.
session = (
    pyspark.sql.SparkSession.builder
    .master(spark_master)
    .appName(app_name)
    .config("spark.driver.memory", driver_memory)
    .config("spark.executor.memory", executor_memory)
    .getOrCreate()
)
session  # last expression: render the session as the cell output

In [5]:
# Load the dataset found at "<input_prefix><input_file>.<input_kind>".
# Note: input_kind only supplies the file extension; the reader is always Parquet.
df = session.read.parquet(
    "%s%s.%s" % (input_prefix, input_file, input_kind)
)

In [6]:
# Display the column names of the loaded DataFrame.
df.columns

['customerID',
 'gender',
 'SeniorCitizen',
 'Partner',
 'Dependents',
 'tenure',
 'PhoneService',
 'MultipleLines',
 'InternetService',
 'OnlineSecurity',
 'OnlineBackup',
 'DeviceProtection',
 'TechSupport',
 'StreamingTV',
 'StreamingMovies',
 'Contract',
 'PaperlessBilling',
 'PaymentMethod',
 'MonthlyCharges',
 'TotalCharges',
 'Churn']

In [20]:
import pyspark.sql.types as T
import pyspark.sql.functions as F

# Buckets of column names, one per broad category of Spark data type.
# Each name gets its own, independent empty list.
string_columns, boolean_columns, numeric_columns, other_columns = [], [], [], []

def isnumeric(data_type):
    """Return True when ``data_type`` is an instance of a Spark numeric type.

    The types checked are byte, short, integer, long, float, double and decimal.
    """
    numeric_types = (
        T.ByteType, T.ShortType, T.IntegerType, T.LongType,
        T.FloatType, T.DoubleType, T.DecimalType,
    )
    # isinstance accepts a tuple of classes and is true if any one matches.
    return isinstance(data_type, numeric_types)


# Sort every schema field's name into exactly one bucket based on its data type.
for field in df.schema.fields:
    dtype = field.dataType
    if isinstance(dtype, T.StringType):
        bucket = string_columns
    elif isinstance(dtype, T.BooleanType):
        bucket = boolean_columns
    elif isnumeric(dtype):
        bucket = numeric_columns
    else:
        # Anything that is not string, boolean or numeric.
        bucket = other_columns
    bucket.append(field.name)

In [54]:
def approx_cardinalities(df, cols):
    """Estimate the number of distinct values in each of ``cols``.

    Args:
        df: Spark DataFrame to profile.
        cols: iterable of column names present in ``df``.

    Returns:
        dict mapping ``"total"`` to the row count of ``df`` and each name in
        ``cols`` to that column's approximate distinct count.
    """
    # A global (ungrouped) aggregate always yields exactly one row, even when
    # ``df`` is empty, so "total" is always present in the result. Grouping by
    # a literal instead would yield zero rows for an empty DataFrame.
    summary = df.agg(
        F.count('*').alias("total"),
        *[F.approx_count_distinct(F.col(c)).alias(c) for c in cols]
    ).first()

    # The single Row already holds one value per alias; no union/collect needed.
    return summary.asDict()

def likely_unique(counts, tolerance=0.15):
    """Return the fields whose approximate distinct count is close to the row count.

    Args:
        counts: dict with a ``"total"`` entry (the row count) plus one entry
            per field holding that field's approximate distinct count.
        tolerance: a field qualifies when its count differs from ``total`` by
            strictly less than ``total * tolerance``. Defaults to 0.15.

    Returns:
        list of qualifying field names, in the iteration order of ``counts``.

    Raises:
        KeyError: if ``counts`` has no ``"total"`` entry.
    """
    total = counts["total"]
    # Compute the threshold once rather than for every field.
    max_gap = total * tolerance
    return [
        field
        for field, approx in counts.items()
        # abs() because an approximate count may overshoot the true row count.
        if field != "total" and abs(total - approx) < max_gap
    ]

In [52]:
# Approximate distinct counts for every string-typed column, plus the total row count.
cardinalities = approx_cardinalities(df, string_columns)

In [55]:
# List the string columns whose approximate distinct count is within 15% of the row count.
likely_unique(cardinalities)

['customerID']

In [24]:
# Show the columns whose type is neither string, boolean nor numeric.
other_columns

[]

In [48]:
# Compute Spark's summary statistics for df and convert the result to pandas for display.
df.summary().toPandas()

Unnamed: 0,summary,customerID,gender,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges
0,count,703200,703200,703200,703200,703200.0,703200,703200,703200,703200,703200,703200,703200,703200,703200,703200,703200,703200,703200.0,703200.0
1,mean,,,,,32.421786120591584,,,,,,,,,,,,,64.798208,2283.302416
2,stddev,,,,,24.543531844004946,,,,,,,,,,,,,30.083855979833203,2266.6133663155583
3,min,0002-ORFBO-0Fx71N4elIvH,Female,No,No,1.0,No,No,DSL,No,No,No,No,No,No,Month-to-month,No,Bank transfer (automatic),18.25,18.8
4,25%,,,,,9.0,,,,,,,,,,,,,35.55,401.28
5,50%,,,,,29.0,,,,,,,,,,,,,70.35,1397.28
6,75%,,,,,55.0,,,,,,,,,,,,,89.85,3794.4
7,max,9995-HOTOH-TS79j2LZbS4N,Male,Yes,Yes,72.0,Yes,Yes,No,Yes,Yes,Yes,Yes,Yes,Yes,Two year,Yes,Mailed check,118.75,8684.64


[{},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {},
 {}]