In [8]:
import os
import sys
import spark_utils as sut
import pandas as pd

# Point both PySpark interpreter variables at the Python executable that is
# running this kernel, so both settings always name the same interpreter.
os.environ.update(
    PYSPARK_PYTHON=sys.executable,
    PYSPARK_DRIVER_PYTHON=sys.executable,
)

## Demo dataframe

In [9]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Start (or reuse) the Spark session used by every cell in this demo
spark = (
    SparkSession.builder
    .appName("SparkFunctionDemo")
    .getOrCreate()
)

# Column layout of the demo table: two ID columns plus three attributes,
# every field declared nullable
schema = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("ID1", IntegerType()),
        ("ID2", StringType()),
        ("Name", StringType()),
        ("Age", IntegerType()),
        ("City", StringType()),
    )
])

# Demo rows. The first and last rows share ID1=101 / ID2='A' and differ only
# in City, so the (ID1, ID2) pair is deliberately not unique.
data = [
    (101, 'A', 'Alice', 25, 'New York'),
    (102, 'B', 'Bob', 30, 'Los Angeles'),
    (103, 'A', 'Alice', 26, 'San Francisco'),
    (104, 'C', 'Charlie', 35, 'Chicago'),
    (105, 'B', 'Bob', 30, 'Los Angeles'),
    (101, 'A', 'Alice', 25, 'NY'),
]

# Build the Spark DataFrame straight from the Python rows and display it
df = spark.createDataFrame(data, schema=schema)
df.show()


+---+---+-------+---+-------------+
|ID1|ID2|   Name|Age|         City|
+---+---+-------+---+-------------+
|101|  A|  Alice| 25|     New York|
|102|  B|    Bob| 30|  Los Angeles|
|103|  A|  Alice| 26|San Francisco|
|104|  C|Charlie| 35|      Chicago|
|105|  B|    Bob| 30|  Los Angeles|
|101|  A|  Alice| 25|           NY|
+---+---+-------+---+-------------+



# Custom PySpark functions

## shape

In [11]:
# Report the DataFrame's dimensions; the recorded run below prints the
# number of rows and the number of columns.
sut.shape(df)

Number of rows: 6
Number of columns: 5


## print schema alphabetically

In [12]:
# Print the schema; in the recorded output below the fields are listed in
# alphabetical order by column name rather than in declaration order.
sut.print_schema_alphabetically(df)

root
 |-- Age: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- ID1: integer (nullable = true)
 |-- ID2: string (nullable = true)
 |-- Name: string (nullable = true)



## verify primary key

In [13]:
# Candidate key columns; `id_cols` is reused by the duplicate-analysis cells below.
id_cols = ['ID1', 'ID2']
# Check whether the candidate columns form a primary key. In the recorded run
# this prints total vs. unique row counts and evaluates to False.
sut.is_primary_key(df, id_cols)

Total row count after filtering out missings: 6
Unique row count after filtering out missings: 5
The column(s) ID1, ID2 does not form a primary key.


False

## find duplicates

In [14]:
# Only look for duplicates when the candidate columns fail the key check.
# `not ...` replaces the original `== False` comparison, which PEP 8 advises
# against for boolean values.
if not sut.is_primary_key(df, id_cols):
    # In the recorded run the result holds the rows that share an
    # (ID1, ID2) combination, with a leading `count` column.
    dups = sut.find_duplicates(df, id_cols)
    dups.show()

Total row count after filtering out missings: 6
Unique row count after filtering out missings: 5
The column(s) ID1, ID2 does not form a primary key.
+-----+---+---+-----+---+--------+
|count|ID1|ID2| Name|Age|    City|
+-----+---+---+-----+---+--------+
|    2|101|  A|Alice| 25|New York|
|    2|101|  A|Alice| 25|      NY|
+-----+---+---+-----+---+--------+



## identify columns responsible for dups

With our simple dummy table, we can easily tell that the dups are due to the `City` column. But when you have a very wide table and many dups, this task becomes much trickier. The `cols_responsible_for_id_dups` function comes to the rescue by summarizing the `difference_counts` for each column based on the primary key(s) provided.

In [7]:
# Only analyse duplicate causes when the candidate columns fail the key check.
# `not ...` replaces the original `== False` comparison, which PEP 8 advises
# against for boolean values.
if not sut.is_primary_key(df, id_cols):
    # In the recorded run the result has one row per non-key column with a
    # `difference_counts` value; only `City` is non-zero for this table.
    dup_cols = sut.cols_responsible_for_id_dups(df, id_cols)
    dup_cols.show()

Total row count after filtering out missings: 6
Unique row count after filtering out missings: 5
The column(s) ID1, ID2 does not form a primary key.
+--------+-----------------+
|col_name|difference_counts|
+--------+-----------------+
|    City|                1|
|    Name|                0|
|     Age|                0|
+--------+-----------------+

