In [29]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, StringType

sc = SparkSession.builder\
        .master("local[8]") \
        .appName("Dataframe")\
        .getOrCreate()


data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"]
df = sc.createDataFrame(data=data, schema = columns)

In [54]:
data1 = [("Alice", 30), ("Bob", 25), ("Charlie", 35)]
df1 = sc.createDataFrame(data1, ["Name", "Age"])

# Select specific columns using 'select'
df_select = df1.select("Name", "Age")

# Select columns using SQL-like expressions with 'selectExpr'
df_select_expr = df1.selectExpr("Name", "Age + 2 as Age_Plus_2")

df_select.show()
df_select_expr.show()

+-------+---+
|   Name|Age|
+-------+---+
|  Alice| 30|
|    Bob| 25|
|Charlie| 35|
+-------+---+

+-------+----------+
|   Name|Age_Plus_2|
+-------+----------+
|  Alice|        32|
|    Bob|        27|
|Charlie|        37|
+-------+----------+



In [11]:
df.limit(10).toPandas()
# df.show()

Unnamed: 0,firstname,middlename,lastname,dob,gender,salary
0,James,,Smith,1991-04-01,M,3000
1,Michael,Rose,,2000-05-19,M,4000
2,Robert,,Williams,1978-09-05,M,4000
3,Maria,Anne,Jones,1967-12-01,F,4000
4,Jen,Mary,Brown,1980-02-17,F,-1


In [6]:
cases = sc.read.load("/Users/kulyashdahiya/Study/DataEngineering/PysparkLearning/Basics/data/Case.csv", format="csv", sep=",", inferSchema="true", header="true")

regions = sc.read.load("/Users/kulyashdahiya/Study/DataEngineering/PysparkLearning/Basics/data/Region.csv" \
    , format="csv", sep=",", inferSchema= "true" , header="true")

# cases.show()
cases.limit(10).toPandas()

Unnamed: 0,case_id,province,city,group,infection_case,confirmed,latitude,longitude
0,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,72,37.538621,126.992652
1,1000002,Seoul,Guro-gu,True,Guro-gu Call Center,98,37.508163,126.884387
2,1000003,Seoul,Dongdaemun-gu,True,Dongan Church,20,37.592888,127.056766
3,1000004,Seoul,Guro-gu,True,Manmin Central Church,41,37.481059,126.894343
4,1000005,Seoul,Eunpyeong-gu,True,Eunpyeong St. Mary's Hospital,14,37.63369,126.9165
5,1000006,Seoul,Seongdong-gu,True,Seongdong-gu APT,13,37.55713,127.0403
6,1000007,Seoul,Jongno-gu,True,Jongno Community Center,10,37.57681,127.006
7,1000008,Seoul,Jung-gu,True,Jung-gu Fashion Company,7,37.562405,126.984377
8,1000009,Seoul,from other city,True,Shincheonji Church,8,-,-
9,1000010,Seoul,-,False,overseas inflow,321,-,-


In [13]:
cases = cases.withColumnRenamed("infection_case", "infection_source")
cases.limit(10).toPandas()

Unnamed: 0,case_id,province,city,group,infection_source,confirmed,latitude,longitude
0,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,72,37.538621,126.992652
1,1000002,Seoul,Guro-gu,True,Guro-gu Call Center,98,37.508163,126.884387
2,1000003,Seoul,Dongdaemun-gu,True,Dongan Church,20,37.592888,127.056766
3,1000004,Seoul,Guro-gu,True,Manmin Central Church,41,37.481059,126.894343
4,1000005,Seoul,Eunpyeong-gu,True,Eunpyeong St. Mary's Hospital,14,37.63369,126.9165
5,1000006,Seoul,Seongdong-gu,True,Seongdong-gu APT,13,37.55713,127.0403
6,1000007,Seoul,Jongno-gu,True,Jongno Community Center,10,37.57681,127.006
7,1000008,Seoul,Jung-gu,True,Jung-gu Fashion Company,7,37.562405,126.984377
8,1000009,Seoul,from other city,True,Shincheonji Church,8,-,-
9,1000010,Seoul,-,False,overseas inflow,321,-,-


In [17]:
cases = cases.select('province','city','infection_source','confirmed')
cases.limit(10).show()

+--------+---------------+--------------------+---------+
|province|           city|    infection_source|confirmed|
+--------+---------------+--------------------+---------+
|   Seoul|     Yongsan-gu|       Itaewon Clubs|       72|
|   Seoul|        Guro-gu| Guro-gu Call Center|       98|
|   Seoul|  Dongdaemun-gu|       Dongan Church|       20|
|   Seoul|        Guro-gu|Manmin Central Ch...|       41|
|   Seoul|   Eunpyeong-gu|Eunpyeong St. Mar...|       14|
|   Seoul|   Seongdong-gu|    Seongdong-gu APT|       13|
|   Seoul|      Jongno-gu|Jongno Community ...|       10|
|   Seoul|        Jung-gu|Jung-gu Fashion C...|        7|
|   Seoul|from other city|  Shincheonji Church|        8|
|   Seoul|              -|     overseas inflow|      321|
+--------+---------------+--------------------+---------+



In [24]:
# Sort and Descending Sort

# cases.sort("confirmed").limit(10).toPandas()

cases.sort(F.desc("confirmed")).limit(10).toPandas()

Unnamed: 0,province,city,infection_source,confirmed
0,Daegu,Nam-gu,Shincheonji Church,4510
1,Daegu,-,contact with patient,929
2,Daegu,-,etc,724
3,Gyeongsangbuk-do,from other city,Shincheonji Church,566
4,Seoul,-,overseas inflow,321
5,Gyeonggi-do,-,overseas inflow,225
6,Daegu,Dalseong-gun,Second Mi-Ju Hospital,196
7,Gyeongsangbuk-do,-,contact with patient,192
8,Gyeongsangbuk-do,-,etc,134
9,Daegu,Seo-gu,Hansarang Convalescent Hospital,128


In [34]:
# Cast

cases = cases.withColumn("confirmed", F.col("confirmed").cast(IntegerType()))

In [33]:
display(cases)

DataFrame[province: string, city: string, infection_source: string, confirmed: string]

In [37]:
# Filter

cases.filter( (cases.confirmed>10) & (cases.province == 'Daegu')  ).limit(10).toPandas()

Unnamed: 0,province,city,infection_source,confirmed
0,Daegu,Nam-gu,Shincheonji Church,4510
1,Daegu,Dalseong-gun,Second Mi-Ju Hospital,196
2,Daegu,Seo-gu,Hansarang Convalescent Hospital,128
3,Daegu,Dalseong-gun,Daesil Convalescent Hospital,100
4,Daegu,Dong-gu,Fatima Hospital,37
5,Daegu,-,overseas inflow,24
6,Daegu,-,contact with patient,929
7,Daegu,-,etc,724


In [39]:
#Group By

cases.groupBy(["province", "city"]).agg(F.sum("confirmed"), F.max("confirmed")).limit(15).toPandas()

Unnamed: 0,province,city,sum(confirmed),max(confirmed)
0,Gyeongsangnam-do,Jinju-si,10,10
1,Seoul,Guro-gu,139,98
2,Daejeon,-,27,10
3,Jeollabuk-do,from other city,1,1
4,Gyeongsangnam-do,Changnyeong-gun,7,7
5,Seoul,-,363,321
6,Jeju-do,from other city,1,1
7,Gyeongsangbuk-do,-,336,192
8,Gyeongsangnam-do,Geochang-gun,18,10
9,Incheon,from other city,22,20


In [46]:
# Group By (Alias)

cases.groupBy(["province", "city"]) \
    .agg(F.sum("confirmed").alias("Total_Confimed"), F.max("confirmed").alias("Max_Confirmed")) \
    .limit(10).toPandas()

Unnamed: 0,province,city,Total_Confimed,Max_Confirmed
0,Gyeongsangnam-do,Jinju-si,10,10
1,Seoul,Guro-gu,139,98
2,Daejeon,-,27,10
3,Jeollabuk-do,from other city,1,1
4,Gyeongsangnam-do,Changnyeong-gun,7,7
5,Seoul,-,363,321
6,Jeju-do,from other city,1,1
7,Gyeongsangbuk-do,-,336,192
8,Gyeongsangnam-do,Geochang-gun,18,10
9,Incheon,from other city,22,20


In [47]:
regions.limit(5).toPandas()

Unnamed: 0,code,province,city,latitude,longitude,elementary_school_count,kindergarten_count,university_count,academy_ratio,elderly_population_ratio,elderly_alone_ratio,nursing_home_count
0,10000,Seoul,Seoul,37.566953,126.977977,607,830,48,1.44,15.38,5.8,22739
1,10010,Seoul,Gangnam-gu,37.518421,127.047222,33,38,0,4.18,13.17,4.3,3088
2,10020,Seoul,Gangdong-gu,37.530492,127.123837,27,32,0,1.54,14.55,5.4,1023
3,10030,Seoul,Gangbuk-gu,37.639938,127.025508,14,21,0,0.67,19.49,8.5,628
4,10040,Seoul,Gangseo-gu,37.551166,126.849506,36,56,1,1.17,14.39,5.7,1080


In [48]:
# Joins

cases = cases.join(regions, ["province", "city"], how = 'left')

cases.limit(10).toPandas()

Unnamed: 0,province,city,case_id,group,infection_case,confirmed,latitude,longitude,code,latitude.1,longitude.1,elementary_school_count,kindergarten_count,university_count,academy_ratio,elderly_population_ratio,elderly_alone_ratio,nursing_home_count
0,Seoul,Yongsan-gu,1000001,True,Itaewon Clubs,72,37.538621,126.992652,10210.0,37.532768,126.990021,15.0,13.0,1.0,0.68,16.87,6.5,435.0
1,Seoul,Guro-gu,1000002,True,Guro-gu Call Center,98,37.508163,126.884387,10070.0,37.495632,126.88765,26.0,34.0,3.0,1.0,16.21,5.7,741.0
2,Seoul,Dongdaemun-gu,1000003,True,Dongan Church,20,37.592888,127.056766,10110.0,37.574552,127.039721,21.0,31.0,4.0,1.06,17.26,6.7,832.0
3,Seoul,Guro-gu,1000004,True,Manmin Central Church,41,37.481059,126.894343,10070.0,37.495632,126.88765,26.0,34.0,3.0,1.0,16.21,5.7,741.0
4,Seoul,Eunpyeong-gu,1000005,True,Eunpyeong St. Mary's Hospital,14,37.63369,126.9165,10220.0,37.603481,126.929173,31.0,44.0,1.0,1.09,17.0,6.5,874.0
5,Seoul,Seongdong-gu,1000006,True,Seongdong-gu APT,13,37.55713,127.0403,10160.0,37.563277,127.036647,21.0,30.0,2.0,0.97,14.76,5.3,593.0
6,Seoul,Jongno-gu,1000007,True,Jongno Community Center,10,37.57681,127.006,10230.0,37.572999,126.979189,13.0,17.0,3.0,1.71,18.27,6.8,668.0
7,Seoul,Jung-gu,1000008,True,Jung-gu Fashion Company,7,37.562405,126.984377,10240.0,37.563988,126.99753,12.0,14.0,2.0,0.94,18.42,7.4,728.0
8,Seoul,from other city,1000009,True,Shincheonji Church,8,-,-,,,,,,,,,,
9,Seoul,-,1000010,False,overseas inflow,321,-,-,,,,,,,,,,


In [50]:
# Broadcast Joins
from pyspark.sql.functions import broadcast

cases = cases.join(broadcast(regions), ['province','city'],how='left')
cases.limit(10).toPandas()

24/09/02 09:44:29 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.


Unnamed: 0,province,city,case_id,group,infection_case,confirmed,latitude,longitude,code,latitude.1,...,code.1,latitude.2,longitude.1,elementary_school_count,kindergarten_count,university_count,academy_ratio,elderly_population_ratio,elderly_alone_ratio,nursing_home_count
0,Seoul,Yongsan-gu,1000001,True,Itaewon Clubs,72,37.538621,126.992652,10210.0,37.532768,...,10210.0,37.532768,126.990021,15.0,13.0,1.0,0.68,16.87,6.5,435.0
1,Seoul,Guro-gu,1000002,True,Guro-gu Call Center,98,37.508163,126.884387,10070.0,37.495632,...,10070.0,37.495632,126.88765,26.0,34.0,3.0,1.0,16.21,5.7,741.0
2,Seoul,Dongdaemun-gu,1000003,True,Dongan Church,20,37.592888,127.056766,10110.0,37.574552,...,10110.0,37.574552,127.039721,21.0,31.0,4.0,1.06,17.26,6.7,832.0
3,Seoul,Guro-gu,1000004,True,Manmin Central Church,41,37.481059,126.894343,10070.0,37.495632,...,10070.0,37.495632,126.88765,26.0,34.0,3.0,1.0,16.21,5.7,741.0
4,Seoul,Eunpyeong-gu,1000005,True,Eunpyeong St. Mary's Hospital,14,37.63369,126.9165,10220.0,37.603481,...,10220.0,37.603481,126.929173,31.0,44.0,1.0,1.09,17.0,6.5,874.0
5,Seoul,Seongdong-gu,1000006,True,Seongdong-gu APT,13,37.55713,127.0403,10160.0,37.563277,...,10160.0,37.563277,127.036647,21.0,30.0,2.0,0.97,14.76,5.3,593.0
6,Seoul,Jongno-gu,1000007,True,Jongno Community Center,10,37.57681,127.006,10230.0,37.572999,...,10230.0,37.572999,126.979189,13.0,17.0,3.0,1.71,18.27,6.8,668.0
7,Seoul,Jung-gu,1000008,True,Jung-gu Fashion Company,7,37.562405,126.984377,10240.0,37.563988,...,10240.0,37.563988,126.99753,12.0,14.0,2.0,0.94,18.42,7.4,728.0
8,Seoul,from other city,1000009,True,Shincheonji Church,8,-,-,,,...,,,,,,,,,,
9,Seoul,-,1000010,False,overseas inflow,321,-,-,,,...,,,,,,,,,,


In [78]:
sc.stop()

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType, StringType, DoubleType, BooleanType, StructType, StructField

sc = SparkSession.builder\
    .master("local[12]")\
    .appName("Dataframe")\
    .getOrCreate()

In [3]:
timeprovince = sc.read.load("/Users/kulyashdahiya/Study/DataEngineering/PysparkLearning/Basics/data/TimeProvince.csv",format="csv", sep=",", inferSchema="true", header="true")

timeprovince.limit(10).toPandas()

Unnamed: 0,date,time,province,confirmed,released,deceased
0,2020-01-20,16,Seoul,0,0,0
1,2020-01-20,16,Busan,0,0,0
2,2020-01-20,16,Daegu,0,0,0
3,2020-01-20,16,Incheon,1,0,0
4,2020-01-20,16,Gwangju,0,0,0
5,2020-01-20,16,Daejeon,0,0,0
6,2020-01-20,16,Ulsan,0,0,0
7,2020-01-20,16,Sejong,0,0,0
8,2020-01-20,16,Gyeonggi-do,0,0,0
9,2020-01-20,16,Gangwon-do,0,0,0


In [7]:
# Window Function
from pyspark.sql.window import Window

windowSpec = Window().partitionBy(['province']).orderBy(F.desc('confirmed'))
cases.withColumn("rank",F.rank().over(windowSpec)).limit(5).toPandas()

Unnamed: 0,case_id,province,city,group,infection_case,confirmed,latitude,longitude,rank
0,1100001,Busan,Dongnae-gu,True,Onchun Church,39,35.21628,129.0771,1
1,1100009,Busan,-,False,etc,29,-,-,2
2,1100007,Busan,-,False,overseas inflow,25,-,-,3
3,1100008,Busan,-,False,contact with patient,18,-,-,4
4,1100002,Busan,from other city,True,Shincheonji Church,12,-,-,5


In [5]:
# Lag Variable

windowSpec = Window().partitionBy(['province']).orderBy('date')
timeprovinceWithLag = timeprovince.withColumn("lag_7",F.lag("confirmed", 1, 0).over(windowSpec))

timeprovinceWithLag.filter(timeprovinceWithLag.date>'2020-03-10').limit(10).toPandas()

Unnamed: 0,date,time,province,confirmed,released,deceased,lag_7
0,2020-03-11,0,Busan,98,21,0,96
1,2020-03-12,0,Busan,99,29,0,98
2,2020-03-13,0,Busan,100,36,0,99
3,2020-03-14,0,Busan,103,40,0,100
4,2020-03-15,0,Busan,106,52,1,103
5,2020-03-16,0,Busan,107,53,1,106
6,2020-03-17,0,Busan,107,54,1,107
7,2020-03-18,0,Busan,107,58,1,107
8,2020-03-19,0,Busan,107,58,1,107
9,2020-03-20,0,Busan,108,60,1,107


24/09/03 16:17:13 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [6]:
## Explode etc.

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("pyspark-by-examples")\
        .master("local[8]")\
        .getOrCreate()

columns = ['name', 'knownLanguages', 'properties']
arrayData = [
        ('James',['Java','Scala'],{'hair':'black','eye':'brown'}),
        ('Michael',['Spark','Java',None],{'hair':'brown','eye':None}),
        ('Robert',['CSharp',''],{'hair':'red','eye':''}),
        ('Washington',None,None),
        ('Jefferson',['1','2'],{})
    ]

df = spark.createDataFrame(data = arrayData, schema = columns)
df.printSchema()
df.toPandas()

root
 |-- name: string (nullable = true)
 |-- knownLanguages: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



Unnamed: 0,name,knownLanguages,properties
0,James,"[Java, Scala]","{'eye': 'brown', 'hair': 'black'}"
1,Michael,"[Spark, Java, None]","{'eye': None, 'hair': 'brown'}"
2,Robert,"[CSharp, ]","{'eye': '', 'hair': 'red'}"
3,Washington,,
4,Jefferson,"[1, 2]",{}


In [8]:
#1.1 explode – array column example

from pyspark.sql import functions as F

df2 = df.select(df.name, F.explode(df.knownLanguages))
df2.printSchema()
df2.toPandas()

root
 |-- name: string (nullable = true)
 |-- col: string (nullable = true)



Unnamed: 0,name,col
0,James,Java
1,James,Scala
2,Michael,Spark
3,Michael,Java
4,Michael,
5,Robert,CSharp
6,Robert,
7,Jefferson,1
8,Jefferson,2


In [9]:
#1.2 explode – map column example
# explode() on map column

df3 = df.select(df.name,F.explode(df.properties))
df3.printSchema()
df3.toPandas()

root
 |-- name: string (nullable = true)
 |-- key: string (nullable = false)
 |-- value: string (nullable = true)



Unnamed: 0,name,key,value
0,James,eye,brown
1,James,hair,black
2,Michael,eye,
3,Michael,hair,brown
4,Robert,eye,
5,Robert,hair,red


24/09/04 01:09:45 WARN HeartbeatReceiver: Removing executor driver with no recent heartbeats: 907841 ms exceeds timeout 120000 ms
24/09/04 01:09:45 WARN SparkContext: Killing executors is not supported by current scheduler.
24/09/04 01:09:52 ERROR Inbox: Ignoring error
org.apache.spark.SparkException: Exception thrown in awaitResult: 
	at org.apache.spark.util.SparkThreadUtils$.awaitResult(SparkThreadUtils.scala:56)
	at org.apache.spark.util.ThreadUtils$.awaitResult(ThreadUtils.scala:310)
	at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRefByURI(RpcEnv.scala:102)
	at org.apache.spark.rpc.RpcEnv.setupEndpointRef(RpcEnv.scala:110)
	at org.apache.spark.util.RpcUtils$.makeDriverRef(RpcUtils.scala:36)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.driverEndpoint$lzycompute(BlockManagerMasterEndpoint.scala:124)
	at org.apache.spark.storage.BlockManagerMasterEndpoint.org$apache$spark$storage$BlockManagerMasterEndpoint$$