In [0]:
from pyspark.sql.types import NumericType, StringType

# Load the full source table that the rest of the notebook profiles.
df = spark.sql('select * from default.gold_fnb_sales')

# Numeric side: every column whose Spark type derives from NumericType.
numeric_cols = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, NumericType)
]
df_numeric = df.select(*numeric_cols)

# Categorical side: every column typed as a Spark string.
categorical_cols = [
    field.name
    for field in df.schema.fields
    if isinstance(field.dataType, StringType)
]
df_categorical = df.select(*categorical_cols)

In [0]:
# Preview the loaded source table (same display helper the later cells use).
display(df)

In [0]:
from pyspark.sql.functions import col, sum as _sum, lit
from pyspark.sql.types import NumericType

# Names of the numeric-typed columns present in df_numeric.
numeric_cols = [
    field.name
    for field in df_numeric.schema.fields
    if isinstance(field.dataType, NumericType)
]

# One aggregate per column: the 0/1 "equals zero" flag summed over all rows.
zero_count_exprs = [
    _sum((col(name) == 0).cast("int")).alias(name)
    for name in numeric_cols
]

# Single-row frame of zero counts, stringified and tagged with
# summary='zeros' so it can be unioned by name with the other statistics.
df_zeros = (
    df_numeric
    .agg(*zero_count_exprs)
    .select(*[col(name).cast("string").alias(name) for name in numeric_cols])
    .withColumn('summary', lit('zeros'))
)

In [0]:
from pyspark.sql import functions as F

# Number of rows in the numeric frame; denominator for the percentages below.
total_count = df_numeric.count()

# Single-row frame holding, per column, the share of NULL values in percent.
# Values are stringified and the row is tagged summary='null_%' so it can be
# unioned by name with the other statistics.
null_stats = df_numeric.select(
    *[
        (F.sum(F.col(name).isNull().cast("int")) / total_count * 100)
        .cast("string")
        .alias(name)
        for name in df_numeric.columns
    ]
).withColumn('summary', F.lit('null_%'))

In [0]:
# Summary statistics of the numeric columns. Later cells read the rows of this
# frame through its `summary` column (count, mean, stddev, min, max) and union
# it by name with the null-percentage and zero-count rows.
df_describe = df_numeric.describe()

In [0]:
# Stack the describe() statistics, the null-percentage row and the zero-count
# row into one frame; columns are matched by name, not by position.
df_union = (
    df_describe
    .unionByName(null_stats)
    .unionByName(df_zeros)
)
display(df_union)

In [0]:
from pyspark.sql import functions as F

from pyspark.sql.types import StringType, StructField, StructType

# Transpose df_union (one row per statistic, one column per source column)
# into df_transposed (one row per source column, one column per statistic).

# Step 1: Bring the statistic rows to the driver (one row per statistic).
rows = df_union.collect()

# Step 2: Extract summary labels (like count, mean, etc.)
summary_types = [row['summary'] for row in rows]

# Step 3: Build one record per source column, keyed by statistic label.
# The loop variable is deliberately NOT called `col`: that name would rebind
# the `col` function imported from pyspark.sql.functions and break any cell
# that calls col(...) afterwards.
data = []
for column_name in df_union.columns:
    if column_name == 'summary':
        continue
    record = {'column_name': column_name}
    for row in rows:
        record[row['summary']] = row[column_name]
    data.append(record)

# Step 4: Create the transposed DataFrame.
# Every statistic in df_union is a string, so the schema is declared explicitly
# instead of inferred; inference fails when there are no records or when a
# statistic is NULL for every column.
# NOTE(review): the 'zeros' row is part of df_union but is not carried into
# this output - confirm whether that omission is intended.
output_columns = ['column_name', 'null_%', 'mean', 'stddev', 'min', 'max', 'count']
transposed_schema = StructType(
    [StructField(name, StringType(), True) for name in output_columns]
)
df_transposed = spark.createDataFrame(
    [tuple(record.get(name) for name in output_columns) for record in data],
    schema=transposed_schema,
)

display(df_transposed)


In [0]:
from pyspark.sql import functions as F

from pyspark.sql.types import DoubleType, LongType, StringType, StructField, StructType

# Profile every string column of df_categorical: distinct count, share of
# NULL/empty values, most frequent value and how often it occurs.

# Select only string/categorical columns
categorical_cols = [f.name for f in df_categorical.schema.fields if f.dataType.simpleString() == 'string']

# Get total number of rows
total_count = df_categorical.count()

# One tuple per profiled column, in the order of profile_schema below.
result = []

for col_name in categorical_cols:
    # distinct count (a NULL value, if present, counts as one distinct value)
    distinct_count = df_categorical.select(col_name).distinct().count()

    # null percentage; empty strings are counted as missing too
    null_count = df_categorical.filter(F.col(col_name).isNull() | (F.col(col_name) == "")).count()
    # Guard the division: an empty table would otherwise raise ZeroDivisionError.
    null_pct = round((null_count / total_count) * 100, 2) if total_count else 0.0

    # top value and its frequency; the secondary sort key makes the choice
    # deterministic when several values share the highest frequency
    top_row = (
        df_categorical.groupBy(col_name)
          .count()
          .orderBy(F.desc("count"), F.col(col_name).asc_nulls_last())
          .first()
    )

    if top_row:
        top_value = top_row[0]
        freq_top = top_row[1]
    else:
        # No rows at all: there is no top value.
        top_value = None
        freq_top = 0

    # append result ("unique" mirrors distinct_count, as in the original output)
    result.append((col_name, distinct_count, null_pct, distinct_count, top_value, freq_top))

# create final dataframe
# The schema is explicit because inference fails when there are no string
# columns (empty result) or when every "top" value is NULL.
profile_schema = StructType([
    StructField("column_name", StringType(), True),
    StructField("distinct_count", LongType(), True),
    StructField("null%", DoubleType(), True),
    StructField("unique", LongType(), True),
    StructField("top", StringType(), True),
    StructField("frequen_top", LongType(), True),
])
profile_df = spark.createDataFrame(result, schema=profile_schema)

display(profile_df)


In [0]:

from datetime import datetime
import uuid, os

# Persist the categorical profile as a Delta table.
# NOTE: this rebinds the notebook-level `df` to the profile frame.
df = profile_df
CAT, SCH, TBL = "workspace", "default", "profile_distribution"
FULL = f"{CAT}.{SCH}.{TBL}"

# Persistence switch. The write calls were disabled in this notebook, yet the
# cell still reported success; keep writing off by default but report what
# actually happened. Set to True to really write.
WRITE_ENABLED = False

if not WRITE_ENABLED:
    print(f"ℹ️ Dry run: nothing written to {FULL} (set WRITE_ENABLED = True to persist).")
else:
    try:
        df.write.format("delta").mode("overwrite").saveAsTable(FULL)
        print(f"✅ Successfully wrote table: {FULL}")
    except Exception as e:
        # First line of the error only; fall back to repr() for empty messages
        # so the handler itself cannot raise IndexError.
        first_line = next(iter(str(e).splitlines()), repr(e))
        print(f"⚠️ Write to {FULL} failed: {first_line}")
        # Fallback: save under the current user's folder
        # (assumed to be writable - TODO confirm for this workspace).
        try:
            user = spark.sql("SELECT current_user() as u").collect()[0]["u"]
        except Exception:
            user = os.environ.get("USER") or "unknown_user"
        safe_user = user.replace("@", "_at_").replace(" ", "_")
        # Random suffix so repeated fallbacks never collide.
        path = f"/Users/{safe_user}/do_tool/{TBL}_{uuid.uuid4().hex}"
        df.write.format("delta").mode("overwrite").save(path)
        print(f"✅ Saved Delta files to: {path}")
        print(f"\nIf you want it registered as a shared table, ask an admin to run:\n"
              f"CREATE TABLE {FULL} USING DELTA LOCATION '{path}';")


In [0]:

from datetime import datetime
import uuid, os

# Persist the transposed numeric statistics as a Delta table.
# NOTE: this rebinds the notebook-level `df` to the transposed frame.
df = df_transposed
CAT, SCH, TBL = "workspace", "default", "numerical_distribution"
FULL = f"{CAT}.{SCH}.{TBL}"

# Persistence switch. The write calls were disabled in this notebook, yet the
# cell still reported success; keep writing off by default but report what
# actually happened. Set to True to really write.
WRITE_ENABLED = False

if not WRITE_ENABLED:
    print(f"ℹ️ Dry run: nothing written to {FULL} (set WRITE_ENABLED = True to persist).")
else:
    try:
        df.write.format("delta").mode("overwrite").saveAsTable(FULL)
        print(f"✅ Successfully wrote table: {FULL}")
    except Exception as e:
        # First line of the error only; fall back to repr() for empty messages
        # so the handler itself cannot raise IndexError.
        first_line = next(iter(str(e).splitlines()), repr(e))
        print(f"⚠️ Write to {FULL} failed: {first_line}")
        # Fallback: save under the current user's folder
        # (assumed to be writable - TODO confirm for this workspace).
        try:
            user = spark.sql("SELECT current_user() as u").collect()[0]["u"]
        except Exception:
            user = os.environ.get("USER") or "unknown_user"
        safe_user = user.replace("@", "_at_").replace(" ", "_")
        # Random suffix so repeated fallbacks never collide.
        path = f"/Users/{safe_user}/do_tool/{TBL}_{uuid.uuid4().hex}"
        df.write.format("delta").mode("overwrite").save(path)
        print(f"✅ Saved Delta files to: {path}")
        print(f"\nIf you want it registered as a shared table, ask an admin to run:\n"
              f"CREATE TABLE {FULL} USING DELTA LOCATION '{path}';")
