In [1]:
import findspark, os, pandas as pd
# Tell findspark where Spark lives. An already-exported SPARK_HOME takes
# precedence; the developer-specific path below is only a fallback so the
# notebook still runs on machines that configure Spark through the environment.
os.environ.setdefault("SPARK_HOME", "/Users/ankitkansal/spark/spark-2.4.4-bin-without-hadoop")
findspark.init()

In [2]:
import pyspark
import datetime
import re
from functools import reduce
import pandas as pd
from pyspark import SparkContext, sql
from pyspark.sql import functions as F, SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import *
from pyspark.sql.functions import array, col, explode, lit, struct
from pyspark.sql import DataFrame
from typing import Iterable

In [7]:
# Reuse the active SparkSession, or start one (no extra builder options are set here).
# NOTE(review): this cell's execution count (7) is out of sequence with the cells
# around it -- confirm the notebook still works under Restart & Run All.
spark = SparkSession.builder.getOrCreate()

In [3]:
def melt(df: DataFrame,
         id_vars: Iterable[str], value_vars: Iterable[str],
         var_name: str = "variable", value_name: str = "value") -> DataFrame:
    """Convert :class:`DataFrame` from wide to long format.

    Args:
        df: Wide DataFrame to unpivot.
        id_vars: Columns carried through unchanged onto every output row.
            Any iterable is accepted (list, tuple, generator, ...); may be empty.
        value_vars: Columns to unpivot. Each one contributes one output row
            per input row.
            NOTE(review): the columns are packed into a single Spark array, so
            they presumably need a common type -- confirm before melting
            mixed-type columns.
        var_name: Name of the output column holding the source column's name.
        value_name: Name of the output column holding the source column's value.

    Returns:
        DataFrame whose columns are ``id_vars`` followed by ``var_name`` and
        ``value_name``.
    """
    # Materialise first: the signature promises Iterable, but list concatenation
    # below would raise TypeError for a tuple or generator.
    id_cols = list(id_vars)
    # Create array<struct<variable: str, value: ...>>
    _vars_and_vals = array(*(
        struct(lit(c).alias(var_name), col(c).alias(value_name))
        for c in value_vars))
    # Add to the DataFrame and explode
    _tmp = df.withColumn("_vars_and_vals", explode(_vars_and_vals))
    cols = id_cols + [
        col("_vars_and_vals")[x].alias(x) for x in [var_name, value_name]]
    return _tmp.select(*cols)

In [4]:
def _build_qc_df_from_func(agg_function, df, label, column_dtype=None):
    """
    Apply one aggregate to a subset of ``df``'s columns and return the result in long format.

    Args:
        agg_function: callable taking a column name and returning the aggregate
            Column expression to compute for that column.
        df: DataFrame to aggregate.
        label: name given to the value column of the result.
        column_dtype: optional category filter. A falsy value selects every
            column. Otherwise a list (or a single string) naming any of
            'string', 'numeric', 'date', 'bool', 'qty'; a column is selected
            when it matches ANY requested category, so ['numeric', 'date']
            covers both numeric and date/timestamp columns. 'qty' selects by
            name (column name contains 'qty') rather than by type.
    Returns:
        DataFrame with columns ``field`` and ``label``, one row per selected
        column. When no column is selected, an empty DataFrame with both
        columns typed as string.
    Raises:
        ValueError: ``column_dtype`` is non-empty but names no supported category.
    """
    # Match on DataType.typeName() rather than str(dataType): the textual form of
    # a DataType is not guaranteed to be the same across PySpark releases.
    numeric_type_names = ('decimal', 'double', 'float', 'integer', 'long', 'short')
    selectors = {
        'string': lambda f: f.dataType.typeName() == 'string',
        'numeric': lambda f: f.dataType.typeName() in numeric_type_names,
        'date': lambda f: f.dataType.typeName() in ('date', 'timestamp'),
        'bool': lambda f: f.dataType.typeName() == 'boolean',
        'qty': lambda f: 'qty' in f.name,
    }
    if not column_dtype:
        columns = df.columns
    else:
        # `in` keeps both accepted spellings working: membership for a list,
        # substring for a plain string.
        active = [matches for kind, matches in selectors.items() if kind in column_dtype]
        if not active:
            raise ValueError('unsupported column_dtype argument: {}'.format(column_dtype))
        # Union of all requested categories, in schema order, without duplicates.
        columns = [f.name for f in df.schema.fields if any(matches(f) for matches in active)]
    if len(columns) == 0:
        # Nothing to aggregate: hand back an empty frame with the expected column names
        # so callers can still join on 'field'.
        ss = SparkSession.builder.getOrCreate()
        sc = SparkContext.getOrCreate()
        output = ss.createDataFrame(sc.emptyRDD(),
                                    StructType(
                                        [StructField('field', StringType()), StructField(label, StringType())]))
    else:
        # Aggregate in batches of columns rather than in one very wide agg() call.
        batch_size = 10
        col_batch_list = [columns[x:x + batch_size] for x in range(0, len(columns), batch_size)]
        df_list = [df.agg(*[agg_function(x).alias(x) for x in column_batch]) for
                   column_batch in col_batch_list]
        # Each ungrouped agg() yields a single row, so cross-joining the batches
        # produces one wide row holding every aggregate.
        wrking_df = reduce(lambda x, y: x.crossJoin(y), df_list)
        # Melt with no id columns: avoids adding a helper column whose name could
        # collide with (and overwrite) an aggregated column of the same name.
        output = melt(wrking_df, [], columns, var_name='field', value_name=label)
    return output

In [13]:
def _generate_qc_summary_table(wrk_df: sql.DataFrame, table_name: str) -> sql.DataFrame:
    """
    Build a per-column data-quality summary of ``wrk_df``.

    Args:
        wrk_df: DataFrame to profile.
        table_name: value written verbatim into the ``table_name`` column of
            every output row.
    Returns:
        DataFrame with one row per column of ``wrk_df`` and the columns
        ``field``, ``tot_rows``, ``distinct_vals``, ``sum``, ``mean``, ``max``,
        ``min``, ``tot_missing`` (null count + blank-string count),
        ``perc_missing`` (``tot_missing`` as a percentage of ``tot_rows``,
        rounded to 2 decimals) and ``table_name``. Statistics that were not
        computed for a column are null.
    """
    ss = SparkSession.builder.getOrCreate()
    # One long-format (field, <label>) DataFrame per statistic. Only statistics
    # that feed the final select are computed; each one costs a pass over wrk_df.
    aggregate_stats = [
        _build_qc_df_from_func(lambda x: F.countDistinct(F.col(x)), df=wrk_df, label='n_distinct'),
        _build_qc_df_from_func(lambda x: F.sum(F.when(F.col(x).isNull(), 1).otherwise(0)), df=wrk_df,
                               label='is_null_cnt'),
        # Numeric/date statistics are cast to string so differently typed
        # columns can share one result column.
        _build_qc_df_from_func(lambda x: F.sum(F.col(x)).cast('string'), df=wrk_df, label='sum',
                               column_dtype=['numeric']),
        _build_qc_df_from_func(lambda x: F.avg(F.col(x)).cast('string'), df=wrk_df, label='mean_val',
                               column_dtype=['numeric']),
        _build_qc_df_from_func(lambda x: F.max(F.col(x)).cast('string'), df=wrk_df, label='max_val',
                               column_dtype=['numeric', 'date']),
        _build_qc_df_from_func(lambda x: F.min(F.col(x)).cast('string'), df=wrk_df, label='min_val',
                               column_dtype=['numeric', 'date']),
        _build_qc_df_from_func(lambda x: F.sum((F.col(x) == F.lit('')).cast('integer')), df=wrk_df,
                               label='is_blank_count',
                               column_dtype=['string'])
    ]
    total_rows = wrk_df.count()
    schema = [(x.name, str(x.dataType)) for x in wrk_df.schema.fields]
    # Used as the left side of the join so every column of wrk_df gets a row.
    dtypes_df = ss.createDataFrame(schema, ['field', 'type'])
    aggregation_results = reduce(lambda x, y: x.join(y, 'field', 'outer'), aggregate_stats)
    reduced_df = dtypes_df.join(aggregation_results, 'field', 'left')
    missing_data_cols = ['is_null_cnt', 'is_blank_count']
    # Cast each count to long before summing: when a statistic had no matching
    # columns its column is an all-null string column, which would otherwise
    # turn the total into a floating-point value.
    missing_total = reduce(
        lambda acc, nxt: acc + nxt,
        [F.coalesce(F.col(c).cast('long'), F.lit(0)) for c in missing_data_cols])
    results_df = reduced_df \
        .withColumn('overall_missing_values', missing_total) \
        .withColumn('total_rows', F.lit(total_rows)) \
        .withColumn('overall_missing_pct', F.round((F.col('overall_missing_values') / F.col('total_rows')) * 100, 2))
    results_df = results_df.select("field",
                                   F.col("total_rows").alias("tot_rows"),
                                   F.col("n_distinct").alias("distinct_vals"),
                                   "sum",
                                   F.col("mean_val").alias("mean"),
                                   F.col("max_val").alias("max"),
                                   F.col("min_val").alias("min"),
                                   F.col("overall_missing_values").alias("tot_missing"),
                                   F.col("overall_missing_pct").alias("perc_missing"),
                                   F.lit(table_name).alias("table_name")
                                  )
    return results_df

In [14]:
# Load the table to profile from parquet.
# NOTE(review): absolute, machine-specific path -- consider a configurable data directory.
df = spark.read.parquet("/Users/ankitkansal/IdeaProjects/project-samudra/data/L0/USAGE/usage_ru_a_gprs_cbs_usage_daily")

In [15]:
# Build the per-column QC summary; the second argument is written into the
# `table_name` column of every row.
report = _generate_qc_summary_table(df, 'usage_ru_a_gprs_cbs_usage_daily')

In [16]:
# Collect the summary (one row per profiled column) as a pandas DataFrame for display.
report.toPandas()

Unnamed: 0,field,tot_rows,distinct_vals,sum,mean,max,min,tot_missing,perc_missing,table_name
0,call_free_charge_volume,250,115,7714806784.0,30859227.136,2053974016.0,0.0,0,0.0,usage_ru_a_gprs_cbs_usage_daily
1,package_id,250,4,,,,,0,0.0,usage_ru_a_gprs_cbs_usage_daily
2,call_roaming_operator_cd,250,3,,,,,247,98.8,usage_ru_a_gprs_cbs_usage_daily
3,roaming_flag_cd,250,4,,,,,0,0.0,usage_ru_a_gprs_cbs_usage_daily
4,speed_type,250,3,,,,,0,0.0,usage_ru_a_gprs_cbs_usage_daily
5,usage_service_class_cd,250,1,,,,,0,0.0,usage_ru_a_gprs_cbs_usage_daily
6,call_free_charge_time,250,1,0.0,0.0,0.0,0.0,0,0.0,usage_ru_a_gprs_cbs_usage_daily
7,charge_measure_type_cd,250,2,,,,,0,0.0,usage_ru_a_gprs_cbs_usage_daily
8,post_net_revenue_amt,250,1,0.0,0.0,0.0,0.0,0,0.0,usage_ru_a_gprs_cbs_usage_daily
9,gprs_usage_type,250,2,,,,,0,0.0,usage_ru_a_gprs_cbs_usage_daily


In [55]:
# NOTE(review): `week` is not defined in any cell visible here, so this cell
# relies on state created elsewhere -- confirm it survives Restart & Run All.
week.toPandas()

Unnamed: 0,id,start_of_week,category_week
0,1,2019-01-01 00:00:00,"[movie, music, movie, music, movie, movie, movie]"
1,1,2019-01-08 00:00:00,"[music, music, music, music, music, music, movie]"


Unnamed: 0,id,start_of_week,category_week,week
0,1,2019-01-01 00:00:00,"[movie, music, movie, music, movie, movie, movie]","[[movie, music, movie, music, movie, movie, movie], [music, music, music, music, music, music, movie]]"
1,1,2019-01-08 00:00:00,"[music, music, music, music, music, music, movie]","[[movie, music, movie, music, movie, movie, movie], [music, music, music, music, music, music, movie]]"
