In [28]:
import os
import sys
import spark_utils as sut
import pandas as pd

# Point both PySpark interpreter settings at the Python executable running
# this notebook, so Spark does not pick up a different Python installation.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})

## Demo dataframe

In [29]:
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType

# Obtain the Spark session for this demo (an already-running session is reused)
spark = (
    SparkSession.builder
    .appName("SparkFunctionDemo")
    .getOrCreate()
)

# Column layout of the demo DataFrame: (name, Spark type) pairs, all nullable
schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("ID1", IntegerType()),
        ("ID2", StringType()),
        ("Name", StringType()),
        ("Age", IntegerType()),
        ("City", StringType()),
    )
])

# Demo rows; the (ID1, ID2) pair (101, 'A') appears twice on purpose so the
# key check further down has a duplicate to detect
data = [
    (101, 'A', 'Alice', 25, 'New York'),
    (102, 'B', 'Bob', 30, 'Los Angeles'),
    (103, 'A', 'Alice', 25, 'New York'),
    (104, 'C', 'Charlie', 35, 'Chicago'),
    (105, 'B', 'Bob', 30, 'Los Angeles'),
    (101, 'A', 'Alice', 25, 'New York'),  # repeats the first row's (ID1, ID2)
]

# Build the Spark DataFrame straight from the Python rows and explicit schema
df = spark.createDataFrame(data, schema=schema)

# Render the DataFrame as a text table
df.show()


+---+---+-------+---+-----------+
|ID1|ID2|   Name|Age|       City|
+---+---+-------+---+-----------+
|101|  A|  Alice| 25|   New York|
|102|  B|    Bob| 30|Los Angeles|
|103|  A|  Alice| 25|   New York|
|104|  C|Charlie| 35|    Chicago|
|105|  B|    Bob| 30|Los Angeles|
|101|  A|  Alice| 25|   New York|
+---+---+-------+---+-----------+



# Custom PySpark functions

## shape

In [25]:
# Report the row and column counts of the demo DataFrame
# (prints "Number of rows" / "Number of columns", as shown in the output below)
sut.shape(df)

Number of rows: 6
Number of columns: 5


## print schema alphabetically

In [26]:
# Print the DataFrame schema with the columns listed in alphabetical order
# rather than in their declared order (see the output below)
sut.print_schema_alphabetically(df)

root
 |-- Age: integer (nullable = true)
 |-- City: string (nullable = true)
 |-- ID1: integer (nullable = true)
 |-- ID2: string (nullable = true)
 |-- Name: string (nullable = true)



## verify primary key

In [27]:
# Candidate key: check whether the (ID1, ID2) pair uniquely identifies each row
id_cols = ['ID1', 'ID2']
# Prints total vs. unique row counts (after filtering out missings, per its
# output) and evaluates to a boolean; False here because (101, 'A') is duplicated
sut.is_primary_key(df, id_cols)

Total row count after filtering out missings: 6
Unique row count after filtering out missings: 5
The column(s) ID1, ID2 does not form a primary key.


False