In [29]:
import os

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import DoubleType, IntegerType, StringType

# Create (or reuse) a local Spark session. Despite the name, `sc` holds a
# SparkSession, not a SparkContext; "local[8]" runs Spark in-process with
# eight worker threads.
sc = (
    SparkSession.builder
    .master("local[8]")
    .appName("Dataframe")
    .getOrCreate()
)


# Small in-memory sample of people records. Each tuple follows the order of
# `columns` below; an empty string stands in for a missing name part.
data = [
    ("James", "", "Smith", "1991-04-01", "M", 3000),
    ("Michael", "Rose", "", "2000-05-19", "M", 4000),
    ("Robert", "", "Williams", "1978-09-05", "M", 4000),
    ("Maria", "Anne", "Jones", "1967-12-01", "F", 4000),
    ("Jen", "Mary", "Brown", "1980-02-17", "F", -1),
]

# Column names, positionally matched to the tuple fields above.
columns = [
    "firstname",
    "middlename",
    "lastname",
    "dob",
    "gender",
    "salary",
]
# Build a Spark DataFrame from the sample rows, using `columns` as the schema
# (column names only).
df = sc.createDataFrame(data, schema=columns)

In [11]:
# Preview at most ten rows of the sample frame as a pandas table
# (df.show() would print a plain-text table instead).
(
    df
    .limit(10)
    .toPandas()
)

Unnamed: 0,firstname,middlename,lastname,dob,gender,salary
0,James,,Smith,1991-04-01,M,3000
1,Michael,Rose,,2000-05-19,M,4000
2,Robert,,Williams,1978-09-05,M,4000
3,Maria,Anne,Jones,1967-12-01,F,4000
4,Jen,Mary,Brown,1980-02-17,F,-1


In [12]:
# Location of the Case.csv extract. The default is the path this notebook was
# originally written against; set the CASE_CSV_PATH environment variable to
# read the file from elsewhere without editing the notebook.
CASE_CSV_PATH = os.environ.get(
    "CASE_CSV_PATH",
    "/Users/kulyashdahiya/Study/DataEngineering/PysparkLearning/Basics/data/Case.csv",
)

# Read the comma-separated file, treating the first line as the header and
# letting Spark infer the column types.
cases = sc.read.load(CASE_CSV_PATH, format="csv", sep=",", inferSchema="true", header="true")

# Preview at most 20 rows as a pandas table (cases.show() would print a
# plain-text table instead).
cases.limit(20).toPandas()

Unnamed: 0,case_id,province,city,group,infection_case,confirmed,latitude,longitude
0,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,72,37.538621,126.992652
1,1000002,Seoul,Guro-gu,True,Guro-gu Call Center,98,37.508163,126.884387
2,1000003,Seoul,Dongdaemun-gu,True,Dongan Church,20,37.592888,127.056766
3,1000004,Seoul,Guro-gu,True,Manmin Central Church,41,37.481059,126.894343
4,1000005,Seoul,Eunpyeong-gu,True,Eunpyeong St. Mary's Hospital,14,37.63369,126.9165
5,1000006,Seoul,Seongdong-gu,True,Seongdong-gu APT,13,37.55713,127.0403
6,1000007,Seoul,Jongno-gu,True,Jongno Community Center,10,37.57681,127.006
7,1000008,Seoul,Jung-gu,True,Jung-gu Fashion Company,7,37.562405,126.984377
8,1000009,Seoul,from other city,True,Shincheonji Church,8,-,-
9,1000010,Seoul,-,False,overseas inflow,321,-,-


In [13]:
# Rename the "infection_case" column to "infection_source"; all other columns
# are carried over unchanged.
cases = cases.withColumnRenamed(
    "infection_case",
    "infection_source",
)

# Preview at most ten rows as a pandas table.
(
    cases
    .limit(10)
    .toPandas()
)

Unnamed: 0,case_id,province,city,group,infection_source,confirmed,latitude,longitude
0,1000001,Seoul,Yongsan-gu,True,Itaewon Clubs,72,37.538621,126.992652
1,1000002,Seoul,Guro-gu,True,Guro-gu Call Center,98,37.508163,126.884387
2,1000003,Seoul,Dongdaemun-gu,True,Dongan Church,20,37.592888,127.056766
3,1000004,Seoul,Guro-gu,True,Manmin Central Church,41,37.481059,126.894343
4,1000005,Seoul,Eunpyeong-gu,True,Eunpyeong St. Mary's Hospital,14,37.63369,126.9165
5,1000006,Seoul,Seongdong-gu,True,Seongdong-gu APT,13,37.55713,127.0403
6,1000007,Seoul,Jongno-gu,True,Jongno Community Center,10,37.57681,127.006
7,1000008,Seoul,Jung-gu,True,Jung-gu Fashion Company,7,37.562405,126.984377
8,1000009,Seoul,from other city,True,Shincheonji Church,8,-,-
9,1000010,Seoul,-,False,overseas inflow,321,-,-


In [17]:
# Keep only the four columns used by the rest of the notebook. `cases` is
# rebound, so the dropped columns are no longer reachable through this name.
cases = cases.select(
    "province",
    "city",
    "infection_source",
    "confirmed",
)

# Print at most ten rows as a plain-text table.
cases.limit(10).show()

+--------+---------------+--------------------+---------+
|province|           city|    infection_source|confirmed|
+--------+---------------+--------------------+---------+
|   Seoul|     Yongsan-gu|       Itaewon Clubs|       72|
|   Seoul|        Guro-gu| Guro-gu Call Center|       98|
|   Seoul|  Dongdaemun-gu|       Dongan Church|       20|
|   Seoul|        Guro-gu|Manmin Central Ch...|       41|
|   Seoul|   Eunpyeong-gu|Eunpyeong St. Mar...|       14|
|   Seoul|   Seongdong-gu|    Seongdong-gu APT|       13|
|   Seoul|      Jongno-gu|Jongno Community ...|       10|
|   Seoul|        Jung-gu|Jung-gu Fashion C...|        7|
|   Seoul|from other city|  Shincheonji Church|        8|
|   Seoul|              -|     overseas inflow|      321|
+--------+---------------+--------------------+---------+



In [24]:
# Sorting: ascending and descending
#
# Ascending order would be: cases.sort("confirmed")
# Below: rows with the highest "confirmed" first, at most ten of them, shown
# as a pandas table.
(
    cases
    .orderBy(F.col("confirmed").desc())
    .limit(10)
    .toPandas()
)

Unnamed: 0,province,city,infection_source,confirmed
0,Daegu,Nam-gu,Shincheonji Church,4510
1,Daegu,-,contact with patient,929
2,Daegu,-,etc,724
3,Gyeongsangbuk-do,from other city,Shincheonji Church,566
4,Seoul,-,overseas inflow,321
5,Gyeonggi-do,-,overseas inflow,225
6,Daegu,Dalseong-gun,Second Mi-Ju Hospital,196
7,Gyeongsangbuk-do,-,contact with patient,192
8,Gyeongsangbuk-do,-,etc,134
9,Daegu,Seo-gu,Hansarang Convalescent Hospital,128


In [34]:
# Cast: replace "confirmed" with an integer-typed version of itself.
confirmed_as_int = F.col("confirmed").cast(IntegerType())
cases = cases.withColumn("confirmed", confirmed_as_int)

In [33]:
# Show the DataFrame's repr (column names and types) without running a job.
# NOTE(review): the recorded output still lists `confirmed: string`. The
# execution counts show this cell ran (In [33]) before the cast cell (In [34]),
# so re-run the notebook top to bottom to refresh the output.
display(cases)

DataFrame[province: string, city: string, infection_source: string, confirmed: string]

In [37]:
# Filter: keep rows from the Daegu province with more than 10 confirmed
# cases, then preview at most ten of them as a pandas table.
over_ten_confirmed = F.col("confirmed") > 10
in_daegu = F.col("province") == 'Daegu'

(
    cases
    .where(over_ten_confirmed & in_daegu)
    .limit(10)
    .toPandas()
)

Unnamed: 0,province,city,infection_source,confirmed
0,Daegu,Nam-gu,Shincheonji Church,4510
1,Daegu,Dalseong-gun,Second Mi-Ju Hospital,196
2,Daegu,Seo-gu,Hansarang Convalescent Hospital,128
3,Daegu,Dalseong-gun,Daesil Convalescent Hospital,100
4,Daegu,Dong-gu,Fatima Hospital,37
5,Daegu,-,overseas inflow,24
6,Daegu,-,contact with patient,929
7,Daegu,-,etc,724


In [39]:
# Per (province, city) pair: total and largest "confirmed" value.
# Preview at most 15 groups as a pandas table.
(
    cases
    .groupBy("province", "city")
    .agg(
        F.sum("confirmed"),
        F.max("confirmed"),
    )
    .limit(15)
    .toPandas()
)

Unnamed: 0,province,city,sum(confirmed),max(confirmed)
0,Gyeongsangnam-do,Jinju-si,10,10
1,Seoul,Guro-gu,139,98
2,Daejeon,-,27,10
3,Jeollabuk-do,from other city,1,1
4,Gyeongsangnam-do,Changnyeong-gun,7,7
5,Seoul,-,363,321
6,Jeju-do,from other city,1,1
7,Gyeongsangbuk-do,-,336,192
8,Gyeongsangnam-do,Geochang-gun,18,10
9,Incheon,from other city,22,20
