In [2]:
import findspark
findspark.init()
print("Findspark initialized successfully!")

Findspark initialized successfully!


In [6]:
from pyspark.sql import SparkSession

# Create a SparkSession
spark = SparkSession.builder.appName("JupyterPySpark").getOrCreate()

# Test PySpark with sample data
data = [("Alice", 25), ("Bob", 30), ("Charlie", 35)]
columns = ["Name", "Age"]
df = spark.createDataFrame(data, columns)

df.show()


+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 25|
|    Bob| 30|
|Charlie| 35|
+-------+---+



In [14]:
# Read CSV file
df = spark.read.csv("final_data.csv", header=True, inferSchema=True)

                                                                                

In [15]:
# Show first 5 rows
df.show(5, truncate=False)

25/02/25 22:58:48 WARN DAGScheduler: Broadcasting large task binary with size 1202.6 KiB


+----------------------------------------------------------------+------+------------------+------------------+------------------+------------------+--------------------+------------------+------------------+--------------------+--------------------+------------------+------------------+-------------------+-------------------+------------------+------------------+------------------+--------------------+--------------------+------------------+--------------------+-------------------+------------------+------------------+------------------+---------------------+--------------------+------------------+-------------------+--------------------+------------------+------------------+--------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+---------------------+------------------+------------------+--------------------+-------------------+------------------+------------------+--

In [16]:
df.select("B_40_min_6", "D_113_mean_6").show(10, truncate=False)

+------------------+--------------------+
|B_40_min_6        |D_113_mean_6        |
+------------------+--------------------+
|0.0221063390124515|0.404841161563172   |
|0.774218644591683 |0.13842388722617657 |
|0.0057719559685709|0.005657557867750467|
|0.0056033681498543|1.2049715256496951  |
|0.0180078684964037|0.20352258212473573 |
|0.0027860110440613|0.005211994725869616|
|0.0611510807445201|0.004537847033662345|
|0.0057837432005547|0.006966972451600234|
|0.0191186187090507|0.4727665794754161  |
|0.1032646613926255|0.003394571088105433|
+------------------+--------------------+
only showing top 10 rows



In [17]:
df.printSchema()

root
 |-- customer_ID: string (nullable = true)
 |-- target: double (nullable = true)
 |-- P_2_mean_3: double (nullable = true)
 |-- P_2_min_3: double (nullable = true)
 |-- P_2_max_3: double (nullable = true)
 |-- P_2_sum_3: double (nullable = true)
 |-- D_39_mean_3: double (nullable = true)
 |-- D_39_min_3: double (nullable = true)
 |-- D_39_max_3: double (nullable = true)
 |-- D_39_sum_3: double (nullable = true)
 |-- B_1_mean_3: double (nullable = true)
 |-- B_1_min_3: double (nullable = true)
 |-- B_1_max_3: double (nullable = true)
 |-- B_1_sum_3: double (nullable = true)
 |-- B_2_mean_3: double (nullable = true)
 |-- B_2_min_3: double (nullable = true)
 |-- B_2_max_3: double (nullable = true)
 |-- B_2_sum_3: double (nullable = true)
 |-- R_1_mean_3: double (nullable = true)
 |-- R_1_min_3: double (nullable = true)
 |-- R_1_max_3: double (nullable = true)
 |-- R_1_sum_3: double (nullable = true)
 |-- S_3_mean_3: double (nullable = true)
 |-- S_3_min_3: double (nullable = true)
 |

In [20]:
num_columns = len(df.columns)
print(f"Total number of columns: {num_columns}")

Total number of columns: 3194


In [21]:
num_rows = df.count()
print(f"Total number of rows: {num_rows}")



Total number of rows: 91783




In [1]:
import os
from pyspark.sql import SparkSession

# Set the path where the Iceberg JAR is located
iceberg_jar_path = os.path.expanduser("~/spark_jars/iceberg-spark-runtime-3.5_2.12-1.4.2.jar")

# Start SparkSession with Iceberg support
spark = SparkSession.builder \
    .config("spark.jars", iceberg_jar_path) \
    .config("spark.sql.catalog.my_catalog", "org.apache.iceberg.spark.SparkCatalog") \
    .config("spark.sql.catalog.my_catalog.type", "hadoop") \
    .config("spark.sql.catalog.my_catalog.warehouse", "file:///tmp/iceberg_warehouse") \
    .appName("JupyterIceberg") \
    .getOrCreate()

# Verify SparkSession
print("Spark Version:", spark.version)


25/02/25 23:21:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


Spark Version: 3.5.4


In [6]:
# Create Iceberg Table
spark.sql("""
    CREATE TABLE my_catalog.db.testTable1 (
        id INT,
        name STRING
    ) USING DELTA
""")

# Insert Sample Data
spark.sql("INSERT INTO my_catalog.db.testTable VALUES (1,'Neel'), (2,'Vinoth'), (3,'Niral'), (4,'Dhruv'), (5,'Raunit')")

# Query Table
df = spark.sql("SELECT * FROM my_catalog.db.testTable")
df.show()


+---+------+
| id|  name|
+---+------+
|  1|  Neel|
|  2|Vinoth|
|  3| Niral|
|  4| Dhruv|
|  5|Raunit|
+---+------+



In [7]:
print(spark.conf.get("spark.jars"))


/Users/neelkalavadiya/spark_jars/iceberg-spark-runtime-3.5_2.12-1.4.2.jar


In [1]:
from pyspark.sql import SparkSession

# Create Spark Session
spark = SparkSession.builder \
    .appName("FlattenJSON") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "4g") \
    .getOrCreate()  # Ensure Spark session is instantiated

# Now read the JSON file
df = spark.read.option("multiLine", "true").json("Data/TX/BaptistMedicalCenter_Sanantonio.json")

# Repartition to avoid memory issues
df = df.repartition(10)

# Show schema to inspect data structure
df.printSchema()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/03/04 14:38:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

root
 |-- affirmation: struct (nullable = true)
 |    |-- affirmation: string (nullable = true)
 |    |-- confirm_affirmation: boolean (nullable = true)
 |-- hospital_address: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_location: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_name: string (nullable = true)
 |-- last_updated_on: string (nullable = true)
 |-- license_information: struct (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- pId: string (nullable = true)
 |-- standard_charge_information: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- code_information: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- code: string (nullable = true)
 |    |    |    |    |-- modifiers: string (nullable = true)
 |    |    |    |    |-- type: string (nullable = true

In [7]:
from pyspark.sql.functions import col

df_flat = df.select(
    col("hospital_name"),
    col("last_updated_on"),
    col("pId"),
    col("version"),
    col("affirmation.affirmation").alias("affirmation_text"),
    col("affirmation.confirm_affirmation").alias("affirmation_confirmed"),
    col("license_information.license").alias("license"),
    col("license_information.state").alias("license_state")
)

In [9]:
from pyspark.sql.functions import explode

# Explode hospital_address if it's an array
df_flat = df.withColumn("hospital_address_exploded", explode(col("hospital_address"))).drop("hospital_address")

# Show results
df_flat.select("hospital_name", "hospital_address_exploded").show(truncate=False)

                                                                                

+----------------------+------------------------------------+
|hospital_name         |hospital_address_exploded           |
+----------------------+------------------------------------+
|Baptist Medical Center|111 Dallas St, San Antonio, TX 78205|
+----------------------+------------------------------------+



In [10]:
df_flat = df_flat.withColumn("charge_info", explode(col("standard_charge_information")))

df_flat = df_flat.select(
    "*",
    col("charge_info.description").alias("procedure_description"),
    explode(col("charge_info.standard_charges")).alias("charge_details")  # Exploding standard_charges
).drop("charge_info")


In [11]:
df_flat = df_flat.select(
    "*",
    col("charge_details.additional_generic_notes").alias("additional_notes"),
    col("charge_details.discounted_cash").alias("discounted_cash"),
    col("charge_details.gross_charge").alias("gross_charge"),
    col("charge_details.maximum").alias("max_charge"),
    col("charge_details.minimum").alias("min_charge"),
    col("charge_details.setting").alias("charge_setting")
).drop("charge_details")


In [13]:
from pyspark.sql.functions import explode, col

# Explode standard_charge_information
df_flat = df.withColumn("charge_info", explode(col("standard_charge_information")))

# Explode standard_charges
df_flat = df_flat.withColumn("charge_details", explode(col("charge_info.standard_charges"))).drop("charge_info")

# ✅ Now explode payers_information
df_flat = df_flat.withColumn("payer_info", explode(col("charge_details.payers_information")))

# Extract relevant payer fields
df_flat = df_flat.select(
    "*",
    col("payer_info.payer_name").alias("payer_name"),
    col("payer_info.plan_name").alias("plan_name"),
    col("payer_info.standard_charge_dollar").alias("charge_dollar"),
    col("payer_info.standard_charge_percentage").alias("charge_percentage")
).drop("payer_info")  # Drop intermediary column after extraction


In [14]:
df_flat.printSchema()
df_flat.show(truncate=False)

root
 |-- affirmation: struct (nullable = true)
 |    |-- affirmation: string (nullable = true)
 |    |-- confirm_affirmation: boolean (nullable = true)
 |-- hospital_address: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_location: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- hospital_name: string (nullable = true)
 |-- last_updated_on: string (nullable = true)
 |-- license_information: struct (nullable = true)
 |    |-- license: string (nullable = true)
 |    |-- state: string (nullable = true)
 |-- pId: string (nullable = true)
 |-- standard_charge_information: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- code_information: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- code: string (nullable = true)
 |    |    |    |    |-- modifiers: string (nullable = true)
 |    |    |    |    |-- type: string (nullable = true

25/03/04 14:36:34 ERROR Executor: Exception in task 4.0 in stage 15.0 (TID 19)5]
java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.unsafe.UTF8StringBuilder.grow(UTF8StringBuilder.java:60)
	at org.apache.spark.unsafe.UTF8StringBuilder.append(UTF8StringBuilder.java:76)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.generate_doConsume_2$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.generate_doConsume_1$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.generate_doConsume_0$(Unknown Source)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluato

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/anaconda3/lib/python3.9/socket.py", line 704, in readinto
    return self._sock.recv_into(b)
ConnectionResetError: [Errno 54] Connection reset by peer

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/anaconda3/lib/python3.9/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactives

ConnectionRefusedError: [Errno 61] Connection refused

In [7]:
from pyspark.sql.functions import col, explode
from pyspark.sql.types import StructType, ArrayType

def flatten_schema(df):
    """
    Recursively flattens a nested DataFrame schema.
    """
    flat_cols = []  # List to store column names for selection
    for field in df.schema.fields:
        col_name = field.name
        if isinstance(field.dataType, StructType):
            # Expand struct fields (dot notation)
            for sub_field in field.dataType.fields:
                flat_cols.append(col(col_name + "." + sub_field.name).alias(col_name + "_" + sub_field.name))
        elif isinstance(field.dataType, ArrayType) and isinstance(field.dataType.elementType, StructType):
            # If it's an array of structs, explode and flatten
            df = df.withColumn(col_name, explode(col(col_name)))
            for sub_field in field.dataType.elementType.fields:
                flat_cols.append(col(col_name + "." + sub_field.name).alias(col_name + "_" + sub_field.name))
        else:
            # Keep simple columns
            flat_cols.append(col(col_name))

    return df.select(flat_cols)

In [8]:
# Apply flattening recursively
while any(isinstance(field.dataType, StructType) or 
          (isinstance(field.dataType, ArrayType) and isinstance(field.dataType.elementType, StructType)) 
          for field in df.schema.fields):
    df = flatten_schema(df)

# Show result
df.show(truncate=False)

25/03/04 14:22:28 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
Exception in thread "refresh progress" java.lang.OutOfMemoryError: Java heap space
	at scala.Option.map(Option.scala:230)
	at org.apache.spark.status.AppStatusStore.activeStages(AppStatusStore.scala:170)
	at org.apache.spark.ui.ConsoleProgressBar.org$apache$spark$ui$ConsoleProgressBar$$refresh(ConsoleProgressBar.scala:64)
	at org.apache.spark.ui.ConsoleProgressBar$$anon$1.run(ConsoleProgressBar.scala:52)
	at java.base/java.util.TimerThread.mainLoop(Timer.java:566)
	at java.base/java.util.TimerThread.run(Timer.java:516)
25/03/04 14:22:35 ERROR Executor: Exception in task 0.0 in stage 1.0 (TID 1)
java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.sql.catalyst.expressions.codegen.BufferHolder.grow(BufferHolder.java:80)
	at org.apache.spark.sql.catalyst.expressions.codegen.UnsafeWriter.grow

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.9/site-packages/IPython/core/interactiveshell.py", line 3505, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "/var/folders/zh/dswmqvz111g5pwvplk3b9zz80000gn/T/ipykernel_52429/2663756128.py", line 8, in <module>
    df.show(truncate=False)
  File "/opt/anaconda3/lib/python3.9/site-packages/pyspark/sql/dataframe.py", line 947, in show
    print(self._show_string(n, truncate, vertical))
  File "/opt/anaconda3/lib/python3.9/site-packages/pyspark/sql/dataframe.py", line 978, in _show_string
    return self._jdf.showString(n, int_truncate, vertical)
  File "/opt/anaconda3/lib/python3.9/site-packages/py4j/java_gateway.py", line 1322, in __call__
    return_value = get_return_value(
  File "/opt/anaconda3/lib/python3.9/site-packages/pyspark/errors/exceptions/captured.py", line 179, in deco
    return f(*a, **kw)
  File "/opt/anaconda3/lib/python3.9/site-pa

ConnectionRefusedError: [Errno 61] Connection refused