In [2]:
import findspark
findspark.init()  # Initialize findspark

from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("PySpark in Jupyter") \
    .getOrCreate()

# Verify SparkSession is created
print(spark)

<pyspark.sql.session.SparkSession object at 0x79b3d05172e0>


In [25]:
from pyspark.sql.types import *



In [27]:
schema = StructType([
    StructField("id" , dataType=IntegerType(),nullable=True),
    StructField("name",dataType=StringType(),nullable=True),
    StructField("dept",dataType=StringType(),nullable=True),
    StructField("salary",dataType=FloatType(),nullable=True)
])

In [28]:
df = spark.read.json(path ="/home/hdoop/Docs/spark_files/json_files/emp.json",schema = schema,multiLine=True)

In [29]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- salary: float (nullable = true)



In [43]:
df.show()

+---+------+-------+-------+
| id|  name|   dept| salary|
+---+------+-------+-------+
|  1|Mayank|     IT|65000.0|
|  2| SAttu|     IT|67000.0|
|  3|   May|     HR|75000.0|
|  4| Mayan|Finance|55000.0|
+---+------+-------+-------+



In [45]:
num_partitions = df.rdd.getNumPartitions()
print(num_partitions)
df.repartition
df.show()
num_partitions = df.rdd.getNumPartitions()
print(num_partitions)

1
+---+------+-------+-------+
| id|  name|   dept| salary|
+---+------+-------+-------+
|  1|Mayank|     IT|65000.0|
|  2| SAttu|     IT|67000.0|
|  3|   May|     HR|75000.0|
|  4| Mayan|Finance|55000.0|
+---+------+-------+-------+

1


In [40]:
df.write.json(path='/home/hdoop/Docs/spark_files/json_files/writer_11Mar2025/json_write/emp_file_2',mode = 'overwrite')

In [52]:
df.show(n=2,truncate = 2)

+---+----+----+------+
| id|name|dept|salary|
+---+----+----+------+
|  1|  Ma|  IT|    65|
|  2|  SA|  IT|    67|
+---+----+----+------+
only showing top 2 rows



### withColumn and wih ColumnRenamed

In [80]:
data = [(1,"xyz",50000.0),(2,"abc",55000.0),(3,"mnc",65000.0)]

schema =  StructType(
    [
        StructField(name= "id",dataType=IntegerType(),nullable=True),
        StructField(name="name",dataType=StringType(),nullable=True),
        StructField(name= "Salary",dataType=FloatType(),nullable=True)
    ]
)

In [81]:
df= spark.createDataFrame(data =data,schema=schema)

In [82]:
df.show()

+---+----+-------+
| id|name| Salary|
+---+----+-------+
|  1| xyz|50000.0|
|  2| abc|55000.0|
|  3| mnc|65000.0|
+---+----+-------+



In [83]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- Salary: float (nullable = true)



In [96]:
from pyspark.sql.functions import col , expr ,lit

In [85]:
df1 =df.withColumn(colName="salary",col = col('salary').cast('String'))

In [86]:
df1.show()

+---+----+-------+
| id|name| salary|
+---+----+-------+
|  1| xyz|50000.0|
|  2| abc|55000.0|
|  3| mnc|65000.0|
+---+----+-------+



In [87]:
df1.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: string (nullable = true)



In [88]:
df2 = df1.withColumn('salary',expr("salary * 2")) 

In [89]:
df2.show()

+---+----+--------+
| id|name|  salary|
+---+----+--------+
|  1| xyz|100000.0|
|  2| abc|110000.0|
|  3| mnc|130000.0|
+---+----+--------+



In [90]:
df3 = df1.withColumn("bonus",expr("salary * 0.2"))

In [91]:
df3.show()

+---+----+-------+-------+
| id|name| salary|  bonus|
+---+----+-------+-------+
|  1| xyz|50000.0|10000.0|
|  2| abc|55000.0|11000.0|
|  3| mnc|65000.0|13000.0|
+---+----+-------+-------+



In [97]:
df4  = df1.withColumn("country",lit("India"))

In [98]:
df4.show()

+---+----+-------+-------+
| id|name| salary|country|
+---+----+-------+-------+
|  1| xyz|50000.0|  India|
|  2| abc|55000.0|  India|
|  3| mnc|65000.0|  India|
+---+----+-------+-------+



In [103]:
display(df3.withColumnRenamed("bonus","fined").show()) #new dataframe created with the bonus column renamed as the fine column

+---+----+-------+-------+
| id|name| salary|  fined|
+---+----+-------+-------+
|  1| xyz|50000.0|10000.0|
|  2| abc|55000.0|11000.0|
|  3| mnc|65000.0|13000.0|
+---+----+-------+-------+



None

In [104]:
df3.show()

+---+----+-------+-------+
| id|name| salary|  bonus|
+---+----+-------+-------+
|  1| xyz|50000.0|10000.0|
|  2| abc|55000.0|11000.0|
|  3| mnc|65000.0|13000.0|
+---+----+-------+-------+



### videos ## 12


In [108]:
data = [(1,"xyz",50000.0),(2,"abc",55000.0),(3,"mnc",65000.0)]

schema=StructType([\
                    StructField(name="id",dataType=IntegerType(),nullable=True),\
                    StructField(name="name",dataType=StringType(),nullable = True),\
                    StructField(name="salary",dataType=FloatType(),nullable= True)
])

In [109]:
df = spark.createDataFrame(data =data,schema =schema)

In [110]:
df.show()

+---+----+-------+
| id|name| salary|
+---+----+-------+
|  1| xyz|50000.0|
|  2| abc|55000.0|
|  3| mnc|65000.0|
+---+----+-------+



In [111]:
df.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: string (nullable = true)
 |-- salary: float (nullable = true)



#### for the complex datatypes


In [113]:
data = [(1,("xyz","XYZZ"),50000.0),(2,("abc","ABCD"),55000.0),(3,("mnc","MNCV"),65000.0)]
structname =  StructType([\
    StructField(name= "firstname",dataType = StringType(),nullable =True),\
    StructField(name = "lastname",dataType= StringType(),nullable =True)
])
schema=StructType([\
                    StructField(name="id",dataType=IntegerType(),nullable=True),\
                    StructField(name="name",dataType=structname,nullable = True),\
                    StructField(name="salary",dataType=FloatType(),nullable= True)
])

In [115]:
df2 = spark.createDataFrame(data =data , schema =schema)

In [116]:
df2.show()

+---+-----------+-------+
| id|       name| salary|
+---+-----------+-------+
|  1|{xyz, XYZZ}|50000.0|
|  2|{abc, ABCD}|55000.0|
|  3|{mnc, MNCV}|65000.0|
+---+-----------+-------+



In [117]:
df2.printSchema()

root
 |-- id: integer (nullable = true)
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- salary: float (nullable = true)



In [118]:
display(df)

DataFrame[id: int, name: string, salary: float]

25/03/11 16:28:43 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 2369742 ms exceeds timeout 120000 ms
25/03/11 16:28:43 WARN SparkContext: Killing executors is not supported by current scheduler.
25/03/11 16:28:45 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$

In [119]:
spark.stop()

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.p

ConnectionRefusedError: [Errno 111] Connection refused