In [1]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.getOrCreate()

# Create DataFrame

In [2]:
columns = ["language","users_count"]
data = [("Java", "20000"), ("Python", "100000"), ("Scala", "3000")]
           
# create a rdd 
rdd = spark.sparkContext.parallelize(data)
rdd.collect()

[('Java', '20000'), ('Python', '100000'), ('Scala', '3000')]

In [3]:
dfFromRDD1 = rdd.toDF()
dfFromRDD1.show()

+------+------+
|    _1|    _2|
+------+------+
|  Java| 20000|
|Python|100000|
| Scala|  3000|
+------+------+



In [4]:
dfFromRDD1.printSchema()

root
 |-- _1: string (nullable = true)
 |-- _2: string (nullable = true)



In [5]:
dfFromRDD1=rdd.toDF(columns)
dfFromRDD1.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



# createDataFrame from SparkSession

In [6]:
dfFromRDD2 = spark.createDataFrame(data).toDF(*columns)

dfFromRDD2.show()

+--------+-----------+
|language|users_count|
+--------+-----------+
|    Java|      20000|
|  Python|     100000|
|   Scala|       3000|
+--------+-----------+



# Create DataFrame with schema

In [7]:
from pyspark.sql.types import StructType,StructField,StringType,IntegerType

data2 = [("James","","Smith","36636","M",3000),
    ("Michael","Rose","","40288","M",4000),
    ("Robert","","Williams","42114","M",4000),
    ("Maria","Anne","Jones","39192","F",4000),
    ("Jen","Mary","Brown","","F",-1)
  ]

schema = StructType([
    StructField("firstname",StringType(),True),
    StructField("middlename",StringType(),True),
    StructField("lastname",StringType(),True),
    StructField("id",StringType(),True),
    StructField("gender",StringType(),True),
    StructField("salary",StringType(),True)
])

df = spark.createDataFrame(data=data2,schema=schema)

In [8]:
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- id: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: string (nullable = true)



In [9]:
df.show()

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|   id|gender|salary|
+---------+----------+--------+-----+------+------+
|    James|          |   Smith|36636|     M|  3000|
|  Michael|      Rose|        |40288|     M|  4000|
|   Robert|          |Williams|42114|     M|  4000|
|    Maria|      Anne|   Jones|39192|     F|  4000|
|      Jen|      Mary|   Brown|     |     F|    -1|
+---------+----------+--------+-----+------+------+



In [10]:
df.show(truncate=False)

+---------+----------+--------+-----+------+------+
|firstname|middlename|lastname|id   |gender|salary|
+---------+----------+--------+-----+------+------+
|James    |          |Smith   |36636|M     |3000  |
|Michael  |Rose      |        |40288|M     |4000  |
|Robert   |          |Williams|42114|M     |4000  |
|Maria    |Anne      |Jones   |39192|F     |4000  |
|Jen      |Mary      |Brown   |     |F     |-1    |
+---------+----------+--------+-----+------+------+



# DataFrame from Data sources

In [11]:
path="D:/PySpark/pyspark-examples-master/resources/"

df2=spark.read.option("header",True).csv(path+"zipcodes.csv")

df2.show(5)

+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-----+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|  Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-----+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96|-66.22| 0.38|-0.87|  0.3|         NA|     US|     Parc Parque, PR|NA-US-PR-PARC PARQUE|        FALSE|           null|               null|      null| null|
|           2|    704|   STANDARD|PASEO COSTA DEL SUR|  

In [12]:
df2=spark.read.option("header",True).format("csv").load(path+"zipcodes.csv")

df2.show(5)

+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-----+
|RecordNumber|Zipcode|ZipCodeType|               City|State|  LocationType|  Lat|  Long|Xaxis|Yaxis|Zaxis|WorldRegion|Country|        LocationText|            Location|Decommisioned|TaxReturnsFiled|EstimatedPopulation|TotalWages|Notes|
+------------+-------+-----------+-------------------+-----+--------------+-----+------+-----+-----+-----+-----------+-------+--------------------+--------------------+-------------+---------------+-------------------+----------+-----+
|           1|    704|   STANDARD|        PARC PARQUE|   PR|NOT ACCEPTABLE|17.96|-66.22| 0.38|-0.87|  0.3|         NA|     US|     Parc Parque, PR|NA-US-PR-PARC PARQUE|        FALSE|           null|               null|      null| null|
|           2|    704|   STANDARD|PASEO COSTA DEL SUR|  

# Create an Empty DataFrame & RDD

In [13]:
# empty rdd
emptRDD = spark.sparkContext.emptyRDD()

emptRDD.collect()

[]

In [14]:
emptyRDD = spark.sparkContext.parallelize([])

emptyRDD.collect()

[]

In [15]:
# empty dataframe
schema = StructType([
    StructField('firstname',StringType(),True),
    StructField('middlename',StringType(),True),
    StructField('lastname',StringType(),True)
])

df = spark.createDataFrame(emptyRDD,schema)
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)



In [16]:
df.show(5)

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
+---------+----------+--------+



In [17]:
#Convert empty RDD to Dataframe
df1 = emptyRDD.toDF(schema)
df1.show()

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
+---------+----------+--------+



In [18]:
#Create empty DataFrame directly.
df2 = spark.createDataFrame([],schema)

df2.show()

+---------+----------+--------+
|firstname|middlename|lastname|
+---------+----------+--------+
+---------+----------+--------+



# Convert PySpark DataFrame to Pandas

In [19]:
data = [("James","","Smith","36636","M",60000),
        ("Michael","Rose","","40288","M",70000),
        ("Robert","","Williams","42114","",400000),
        ("Maria","Anne","Jones","39192","F",500000),
        ("Jen","Mary","Brown","","F",0)]

columns = ["first_name","middle_name","last_name","dob","gender","salary"]

pySparkDF = spark.createDataFrame(data=data,schema=columns)

pySparkDF.printSchema()

root
 |-- first_name: string (nullable = true)
 |-- middle_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [20]:
pySparkDF.show()

+----------+-----------+---------+-----+------+------+
|first_name|middle_name|last_name|  dob|gender|salary|
+----------+-----------+---------+-----+------+------+
|     James|           |    Smith|36636|     M| 60000|
|   Michael|       Rose|         |40288|     M| 70000|
|    Robert|           | Williams|42114|      |400000|
|     Maria|       Anne|    Jones|39192|     F|500000|
|       Jen|       Mary|    Brown|     |     F|     0|
+----------+-----------+---------+-----+------+------+



In [21]:
pandasDF = pySparkDF.toPandas()

pandasDF.head()

Unnamed: 0,first_name,middle_name,last_name,dob,gender,salary
0,James,,Smith,36636.0,M,60000
1,Michael,Rose,,40288.0,M,70000
2,Robert,,Williams,42114.0,,400000
3,Maria,Anne,Jones,39192.0,F,500000
4,Jen,Mary,Brown,,F,0


In [22]:
pySparkDF.show(vertical=True)

-RECORD 0---------------
 first_name  | James    
 middle_name |          
 last_name   | Smith    
 dob         | 36636    
 gender      | M        
 salary      | 60000    
-RECORD 1---------------
 first_name  | Michael  
 middle_name | Rose     
 last_name   |          
 dob         | 40288    
 gender      | M        
 salary      | 70000    
-RECORD 2---------------
 first_name  | Robert   
 middle_name |          
 last_name   | Williams 
 dob         | 42114    
 gender      |          
 salary      | 400000   
-RECORD 3---------------
 first_name  | Maria    
 middle_name | Anne     
 last_name   | Jones    
 dob         | 39192    
 gender      | F        
 salary      | 500000   
-RECORD 4---------------
 first_name  | Jen      
 middle_name | Mary     
 last_name   | Brown    
 dob         |          
 gender      | F        
 salary      | 0        



# Nested StructType object struct

In [23]:
structureData = [
    (("James","","Smith"),"36636","M",3100),
    (("Michael","Rose",""),"40288","M",4300),
    (("Robert","","Williams"),"42114","M",1400),
    (("Maria","Anne","Jones"),"39192","F",5500),
    (("Jen","Mary","Brown"),"","F",-1)
  ]

structureSchema = StructType([
        StructField('name', StructType([
             StructField('firstname', StringType(), True),
             StructField('middlename', StringType(), True),
             StructField('lastname', StringType(), True)
             ])),
         StructField('id', StringType(), True),
         StructField('gender', StringType(), True),
         StructField('salary', IntegerType(), True)
         ])

df2=spark.createDataFrame(data=structureData,schema=structureSchema)

In [24]:
df2.show()

+--------------------+-----+------+------+
|                name|   id|gender|salary|
+--------------------+-----+------+------+
|    [James, , Smith]|36636|     M|  3100|
|   [Michael, Rose, ]|40288|     M|  4300|
|[Robert, , Williams]|42114|     M|  1400|
|[Maria, Anne, Jones]|39192|     F|  5500|
|  [Jen, Mary, Brown]|     |     F|    -1|
+--------------------+-----+------+------+



# Adding & Changing struct of the DataFrame

In [25]:
from pyspark.sql.functions import col,struct,when

updatedDF= df2.withColumn("OtherInfo",
                         struct(col("id").alias("identifier"),
                                col("gender").alias("Sex"),
                                col("salary").alias("Salary"),
                                when(col("salary").cast(IntegerType())<2000,"Low")\
                               .when(col("salary").cast(IntegerType())<4000,"Medium")\
                               .otherwise("High").alias("Salary_Grade")
                               )).drop("id","gender","salary")

updatedDF.printSchema()

root
 |-- name: struct (nullable = true)
 |    |-- firstname: string (nullable = true)
 |    |-- middlename: string (nullable = true)
 |    |-- lastname: string (nullable = true)
 |-- OtherInfo: struct (nullable = false)
 |    |-- identifier: string (nullable = true)
 |    |-- Sex: string (nullable = true)
 |    |-- Salary: integer (nullable = true)
 |    |-- Salary_Grade: string (nullable = false)



In [26]:
updatedDF.show(truncate=False)

+--------------------+------------------------+
|name                |OtherInfo               |
+--------------------+------------------------+
|[James, , Smith]    |[36636, M, 3100, Medium]|
|[Michael, Rose, ]   |[40288, M, 4300, High]  |
|[Robert, , Williams]|[42114, M, 1400, Low]   |
|[Maria, Anne, Jones]|[39192, F, 5500, High]  |
|[Jen, Mary, Brown]  |[, F, -1, Low]          |
+--------------------+------------------------+



In [27]:
print(df2.schema.json())

{"fields":[{"metadata":{},"name":"name","nullable":true,"type":{"fields":[{"metadata":{},"name":"firstname","nullable":true,"type":"string"},{"metadata":{},"name":"middlename","nullable":true,"type":"string"},{"metadata":{},"name":"lastname","nullable":true,"type":"string"}],"type":"struct"}},{"metadata":{},"name":"id","nullable":true,"type":"string"},{"metadata":{},"name":"gender","nullable":true,"type":"string"},{"metadata":{},"name":"salary","nullable":true,"type":"integer"}],"type":"struct"}


In [28]:
print(df2.schema.simpleString())

struct<name:struct<firstname:string,middlename:string,lastname:string>,id:string,gender:string,salary:int>


# Row Object

In [29]:
from pyspark.sql import Row

row=Row("Kazmi",40)

print(row[0]+","+str(row[1]))

Kazmi,40


In [30]:
# with names arguments
row=Row(name="Hamza",age=10)

print(row.name)
print(row.age)

Hamza
10


# Custom Class from Row

In [31]:
Person = Row("name","age")
p1=Person("James",40)
p2=Person("Alice",35)

print(p1.name)
print(p2.name)

James
Alice


# Row class on PySpark RDD

In [32]:
data = [Row(name="James,,Smith",lang=["Java","Scala","C++"],state="CA"), 
        Row(name="Michael,Rose,",lang=["Spark","Java","C++"],state="NJ"),
        Row(name="Robert,,Williams",lang=["CSharp","VB"],state="NV")]

rdd=spark.sparkContext.parallelize(data)

rdd.collect()


[Row(lang=['Java', 'Scala', 'C++'], name='James,,Smith', state='CA'),
 Row(lang=['Spark', 'Java', 'C++'], name='Michael,Rose,', state='NJ'),
 Row(lang=['CSharp', 'VB'], name='Robert,,Williams', state='NV')]

# Row class on PySpark DataFrame

In [33]:
df=spark.createDataFrame(data)

In [34]:
df.printSchema()

root
 |-- lang: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- name: string (nullable = true)
 |-- state: string (nullable = true)



In [35]:
df.show()

+------------------+----------------+-----+
|              lang|            name|state|
+------------------+----------------+-----+
|[Java, Scala, C++]|    James,,Smith|   CA|
|[Spark, Java, C++]|   Michael,Rose,|   NJ|
|      [CSharp, VB]|Robert,,Williams|   NV|
+------------------+----------------+-----+



In [36]:
# changing column names
columns = ["name","languagesAtSchool","currentState"]
df=spark.createDataFrame(data).toDF(*columns)

df.printSchema()

root
 |-- name: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtSchool: string (nullable = true)
 |-- currentState: string (nullable = true)



# Nested Struct Using Row Class

In [37]:
data=[Row(name="james",prop=Row(hair="black",eye="black")),
      Row(name="ann",prop=Row(hair="grey",eye="blue"))
     ]

df=spark.createDataFrame(data)

df.show()

+-----+--------------+
| name|          prop|
+-----+--------------+
|james|[black, black]|
|  ann|  [blue, grey]|
+-----+--------------+



# Column Class Object

In [38]:
# create a Column class object is by using lit
from pyspark.sql.functions import lit

colObj = lit("col")

print(type(colObj))
print(colObj)

<class 'pyspark.sql.column.Column'>
Column<b'col'>


# Access the Column from DataFrame

In [39]:
data=[("James",23),("Ann",40)]

df=spark.createDataFrame(data).toDF("name","gender")

In [40]:
df.select(df.gender).show()

+------+
|gender|
+------+
|    23|
|    40|
+------+



In [41]:
df.select(df["gender"]).show()

+------+
|gender|
+------+
|    23|
|    40|
+------+



In [42]:
df.select("gender").show()

+------+
|gender|
+------+
|    23|
|    40|
+------+



In [43]:
df.select(col("gender")).show()

+------+
|gender|
+------+
|    23|
|    40|
+------+



# Column Operators

In [44]:
data=[(100,2,1),(200,3,4),(300,4,4)]

df=spark.createDataFrame(data).toDF("col1","col2","col3")

df.show()

+----+----+----+
|col1|col2|col3|
+----+----+----+
| 100|   2|   1|
| 200|   3|   4|
| 300|   4|   4|
+----+----+----+



In [45]:
df.select(df.col1 + df.col2).show()

+-------------+
|(col1 + col2)|
+-------------+
|          102|
|          203|
|          304|
+-------------+



In [46]:
df.select(df.col1 - df.col2).show() 

+-------------+
|(col1 - col2)|
+-------------+
|           98|
|          197|
|          296|
+-------------+



In [47]:
df.select(df.col1 * df.col2).show()


+-------------+
|(col1 * col2)|
+-------------+
|          200|
|          600|
|         1200|
+-------------+



In [48]:
df.select(df.col1 / df.col2).show()


+-----------------+
|    (col1 / col2)|
+-----------------+
|             50.0|
|66.66666666666667|
|             75.0|
+-----------------+



In [49]:
df.select(df.col1 % df.col2).show()


+-------------+
|(col1 % col2)|
+-------------+
|            0|
|            2|
|            0|
+-------------+



In [50]:
df.select(df.col2 > df.col3).show()

+-------------+
|(col2 > col3)|
+-------------+
|         true|
|        false|
|        false|
+-------------+



In [51]:
df.select(df.col2 < df.col3).show()

+-------------+
|(col2 < col3)|
+-------------+
|        false|
|         true|
|        false|
+-------------+



In [52]:
df.select(df.col2 == df.col3).show()

+-------------+
|(col2 = col3)|
+-------------+
|        false|
|        false|
|         true|
+-------------+



# Column Functions

In [53]:
data=[("James","Bond","100",None),
      ("Ann","Varsa","200",'F'),
      ("Tom Cruise","XXX","400",''),
      ("Tom Brand",None,"400",'M')] 

columns=["fname","lname","id","gender"]

df=spark.createDataFrame(data,columns)

In [54]:
from pyspark.sql.functions import expr

df.select(df["fname"].alias("first_name"),
          df["lname"].alias("last_name")
         ).show()

+----------+---------+
|first_name|last_name|
+----------+---------+
|     James|     Bond|
|       Ann|    Varsa|
|Tom Cruise|      XXX|
| Tom Brand|     null|
+----------+---------+



In [55]:
df.select(expr("fname ||','||lname").alias("fullname")
         ).show()

+--------------+
|      fullname|
+--------------+
|    James,Bond|
|     Ann,Varsa|
|Tom Cruise,XXX|
|          null|
+--------------+



# Ascending or Descending order.

In [56]:
df.sort(df["fname"].asc()).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|       Ann|Varsa|200|     F|
|     James| Bond|100|  null|
| Tom Brand| null|400|     M|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



In [57]:
df.sort(df["fname"].desc()).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| null|400|     M|
|     James| Bond|100|  null|
|       Ann|Varsa|200|     F|
+----------+-----+---+------+



# cast() & astype()

In [58]:
df.select(df["fname"],df["id"].cast("int")).printSchema()

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)



In [59]:
df.select(df["fname"],df["id"].astype("int")).printSchema()

root
 |-- fname: string (nullable = true)
 |-- id: integer (nullable = true)



# between()

In [60]:
df.filter(df["id"].between(100,300)).show()

+-----+-----+---+------+
|fname|lname| id|gender|
+-----+-----+---+------+
|James| Bond|100|  null|
|  Ann|Varsa|200|     F|
+-----+-----+---+------+



# contains()

In [61]:
df.filter(df["fname"].contains("Cruise")).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



# startswith() & endswith()

In [62]:
df.filter(df["fname"].startswith("T")).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
| Tom Brand| null|400|     M|
+----------+-----+---+------+



In [63]:
df.filter(df["fname"].endswith("Cruise")).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



# isNull & isNotNull()

In [64]:
df.filter(df["lname"].isNull()).show()

+---------+-----+---+------+
|    fname|lname| id|gender|
+---------+-----+---+------+
|Tom Brand| null|400|     M|
+---------+-----+---+------+



In [65]:
df.filter(df["lname"].isNotNull()).show()

+----------+-----+---+------+
|     fname|lname| id|gender|
+----------+-----+---+------+
|     James| Bond|100|  null|
|       Ann|Varsa|200|     F|
|Tom Cruise|  XXX|400|      |
+----------+-----+---+------+



# like()

In [66]:
df.select(df["fname"],df["lname"],df["id"]).filter(df["fname"].like("%n")).show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
|  Ann|Varsa|200|
+-----+-----+---+



# substr()

In [67]:
df.select(df["fname"].substr(1,2).alias("subtr")).show()

+-----+
|subtr|
+-----+
|   Ja|
|   An|
|   To|
|   To|
+-----+



# when() & otherwise()

In [68]:
from pyspark.sql.functions import when
df.select(df["fname"],df["lname"],when(df["gender"]=="M","Male")
                                 .when(df["gender"]=="F","Female")
                                 .when(df["gender"]==None,"")
                                 .otherwise(df["gender"]).alias("new_gender")
         ).show()

+----------+-----+----------+
|     fname|lname|new_gender|
+----------+-----+----------+
|     James| Bond|      null|
|       Ann|Varsa|    Female|
|Tom Cruise|  XXX|          |
| Tom Brand| null|      Male|
+----------+-----+----------+



# isin()

In [69]:
li=["100","200"]
df.select(df["fname"],df["lname"],df["id"]).filter(df["id"].isin(li)).show()

+-----+-----+---+
|fname|lname| id|
+-----+-----+---+
|James| Bond|100|
|  Ann|Varsa|200|
+-----+-----+---+



# getField()

In [70]:
from pyspark.sql.types import StructType,StructField,StringType,ArrayType,MapType
data=[(("James","Bond"),["Java","C#"],{'hair':'black','eye':'brown'}),
      (("Ann","Varsa"),[".NET","Python"],{'hair':'brown','eye':'black'}),
      (("Tom Cruise",""),["Python","Scala"],{'hair':'red','eye':'grey'}),
      (("Tom Brand",None),["Perl","Ruby"],{'hair':'black','eye':'blue'})]

schema = StructType([
        StructField('name', StructType([
            StructField('fname', StringType(), True),
            StructField('lname', StringType(), True)])),
        StructField('languages', ArrayType(StringType()),True),
        StructField('properties', MapType(StringType(),StringType()),True)
     ])
df=spark.createDataFrame(data,schema)
df.show()

+--------------+---------------+--------------------+
|          name|      languages|          properties|
+--------------+---------------+--------------------+
| [James, Bond]|     [Java, C#]|[eye -> brown, ha...|
|  [Ann, Varsa]| [.NET, Python]|[eye -> black, ha...|
|[Tom Cruise, ]|[Python, Scala]|[eye -> grey, hai...|
|  [Tom Brand,]|   [Perl, Ruby]|[eye -> blue, hai...|
+--------------+---------------+--------------------+



In [71]:
df.select(df["properties"].getField("hair")).show()

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+



In [72]:
df.select(df["name"].getField("fname")).show()

+----------+
|name.fname|
+----------+
|     James|
|       Ann|
|Tom Cruise|
| Tom Brand|
+----------+



# getItem()

In [73]:
df.select(df["languages"].getItem(1)).show()

+------------+
|languages[1]|
+------------+
|          C#|
|      Python|
|       Scala|
|        Ruby|
+------------+



In [74]:
df.select(df["properties"].getItem("hair")).show()

+----------------+
|properties[hair]|
+----------------+
|           black|
|           brown|
|             red|
|           black|
+----------------+



# select()

In [75]:
df.select("name.fname","name.lname").show()

+----------+-----+
|     fname|lname|
+----------+-----+
|     James| Bond|
|       Ann|Varsa|
|Tom Cruise|     |
| Tom Brand| null|
+----------+-----+



In [76]:
df.select("name.*").show()

+----------+-----+
|     fname|lname|
+----------+-----+
|     James| Bond|
|       Ann|Varsa|
|Tom Cruise|     |
| Tom Brand| null|
+----------+-----+



# collect()

In [77]:
data=[("Finance",10),("Marketing",20),("Sales",30),("IT",40)]
deptColumns = ["dept_name","dept_id"]

deptDF=spark.createDataFrame(data=data,schema=deptColumns)
deptDF.show(truncate=False)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



In [78]:
dataCollect=deptDF.collect()
print(dataCollect)

[Row(dept_name='Finance', dept_id=10), Row(dept_name='Marketing', dept_id=20), Row(dept_name='Sales', dept_id=30), Row(dept_name='IT', dept_id=40)]


In [79]:
for row in dataCollect:
    print(row['dept_name']+","+str(row["dept_id"]))

Finance,10
Marketing,20
Sales,30
IT,40


In [80]:
deptDF.collect()[0][0]

'Finance'

# withColumn()

In [81]:
data = [('James','','Smith','1991-04-01','M',3000),
  ('Michael','Rose','','2000-05-19','M',4000),
  ('Robert','','Williams','1978-09-05','M',4000),
  ('Maria','Anne','Jones','1967-12-01','F',4000),
  ('Jen','Mary','Brown','1980-02-17','F',-1)
]

columns = ["firstname","middlename","lastname","dob","gender","salary"]

df=spark.createDataFrame(data=data,schema=columns)

df.show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [82]:
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [83]:
df.withColumn("salary",col("salary").cast("string")).show()

+---------+----------+--------+----------+------+------+
|firstname|middlename|lastname|       dob|gender|salary|
+---------+----------+--------+----------+------+------+
|    James|          |   Smith|1991-04-01|     M|  3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|
+---------+----------+--------+----------+------+------+



In [84]:
df.printSchema()

root
 |-- firstname: string (nullable = true)
 |-- middlename: string (nullable = true)
 |-- lastname: string (nullable = true)
 |-- dob: string (nullable = true)
 |-- gender: string (nullable = true)
 |-- salary: long (nullable = true)



In [85]:
# updated value of existing column
df.withColumn("salary",col("salary")*1000).show()

+---------+----------+--------+----------+------+-------+
|firstname|middlename|lastname|       dob|gender| salary|
+---------+----------+--------+----------+------+-------+
|    James|          |   Smith|1991-04-01|     M|3000000|
|  Michael|      Rose|        |2000-05-19|     M|4000000|
|   Robert|          |Williams|1978-09-05|     M|4000000|
|    Maria|      Anne|   Jones|1967-12-01|     F|4000000|
|      Jen|      Mary|   Brown|1980-02-17|     F|  -1000|
+---------+----------+--------+----------+------+-------+



In [86]:
# create a new column 
df.withColumn("CopiedColumn",col("salary")*-1).show()

+---------+----------+--------+----------+------+------+------------+
|firstname|middlename|lastname|       dob|gender|salary|CopiedColumn|
+---------+----------+--------+----------+------+------+------------+
|    James|          |   Smith|1991-04-01|     M|  3000|       -3000|
|  Michael|      Rose|        |2000-05-19|     M|  4000|       -4000|
|   Robert|          |Williams|1978-09-05|     M|  4000|       -4000|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|       -4000|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|           1|
+---------+----------+--------+----------+------+------+------------+



In [87]:
df.withColumn("Country",lit("USA")).show()

+---------+----------+--------+----------+------+------+-------+
|firstname|middlename|lastname|       dob|gender|salary|Country|
+---------+----------+--------+----------+------+------+-------+
|    James|          |   Smith|1991-04-01|     M|  3000|    USA|
|  Michael|      Rose|        |2000-05-19|     M|  4000|    USA|
|   Robert|          |Williams|1978-09-05|     M|  4000|    USA|
|    Maria|      Anne|   Jones|1967-12-01|     F|  4000|    USA|
|      Jen|      Mary|   Brown|1980-02-17|     F|    -1|    USA|
+---------+----------+--------+----------+------+------+-------+



# withColumnRenamed()

In [88]:
df.withColumnRenamed("gender","sex").show(truncate=False)

+---------+----------+--------+----------+---+------+
|firstname|middlename|lastname|dob       |sex|salary|
+---------+----------+--------+----------+---+------+
|James    |          |Smith   |1991-04-01|M  |3000  |
|Michael  |Rose      |        |2000-05-19|M  |4000  |
|Robert   |          |Williams|1978-09-05|M  |4000  |
|Maria    |Anne      |Jones   |1967-12-01|F  |4000  |
|Jen      |Mary      |Brown   |1980-02-17|F  |-1    |
+---------+----------+--------+----------+---+------+



# drop()

In [89]:
df.drop("salary").show()

+---------+----------+--------+----------+------+
|firstname|middlename|lastname|       dob|gender|
+---------+----------+--------+----------+------+
|    James|          |   Smith|1991-04-01|     M|
|  Michael|      Rose|        |2000-05-19|     M|
|   Robert|          |Williams|1978-09-05|     M|
|    Maria|      Anne|   Jones|1967-12-01|     F|
|      Jen|      Mary|   Brown|1980-02-17|     F|
+---------+----------+--------+----------+------+



# distinct()

In [90]:
from pyspark.sql.functions import expr
data = [("James", "Sales", 3000), \
    ("Michael", "Sales", 4600), \
    ("Robert", "Sales", 4100), \
    ("Maria", "Finance", 3000), \
    ("James", "Sales", 3000), \
    ("Scott", "Finance", 3300), \
    ("Jen", "Finance", 3900), \
    ("Jeff", "Marketing", 3000), \
    ("Kumar", "Marketing", 2000), \
    ("Saif", "Sales", 4100) \
  ]
columns= ["employee_name", "department", "salary"]

df=spark.createDataFrame(data=data,schema=columns)


In [91]:
df.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)



In [92]:
df.show(truncate=False)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



In [93]:
distinctDF=df.distinct()
print("Distinct count: "+str(distinctDF.count()))
distinctDF.show(truncate=False)

Distinct count: 9
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Jen          |Finance   |3900  |
|Michael      |Sales     |4600  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Robert       |Sales     |4100  |
|Jeff         |Marketing |3000  |
|Saif         |Sales     |4100  |
|Maria        |Finance   |3000  |
+-------------+----------+------+



# dropDuplicates()

In [94]:
df2=df.dropDuplicates()
print("Distinct count: "+str(df2.count()))
print("Duplicate count: "+str(df.count()))

Distinct count: 9
Duplicate count: 10


In [95]:
dropDisDF=df.dropDuplicates(["department","salary"])
print("Discount count of department & salary: "+str(dropDisDF.count()))
dropDisDF.show(truncate=False)

Discount count of department & salary: 8
+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Jen          |Finance   |3900  |
|Maria        |Finance   |3000  |
|Scott        |Finance   |3300  |
|Kumar        |Marketing |2000  |
|James        |Sales     |3000  |
|Jeff         |Marketing |3000  |
+-------------+----------+------+



# orderBy() and sort()

In [96]:
simpleData = [("James","Sales","NY",90000,34,10000), \
    ("Michael","Sales","NY",86000,56,20000), \
    ("Robert","Sales","CA",81000,30,23000), \
    ("Maria","Finance","CA",90000,24,23000), \
    ("Raman","Finance","CA",99000,40,24000), \
    ("Scott","Finance","NY",83000,36,19000), \
    ("Jen","Finance","NY",79000,53,15000), \
    ("Jeff","Marketing","CA",80000,25,18000), \
    ("Kumar","Marketing","NY",91000,50,21000) \
  ]
columns= ["employee_name","department","state","salary","age","bonus"]

df=spark.createDataFrame(data=simpleData,schema=columns)
df.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
+-------------+----------+-----+------+---+-----+



In [97]:
df.sort("department","state").show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Raman        |Finance   |CA   |99000 |40 |24000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



In [98]:
df.sort(col("department"),col("state")).show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Raman        |Finance   |CA   |99000 |40 |24000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



In [99]:
df.orderBy("department","state").show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



In [100]:
df.sort(df["department"].asc(),df["state"].asc()).show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Raman        |Finance   |CA   |99000 |40 |24000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



In [101]:
df.orderBy(df["department"].desc(),df["state"].desc()).show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
+-------------+----------+-----+------+---+-----+



In [102]:

df.createOrReplaceTempView("EMP")
spark.sql("select employee_name,department,state,salary,age,bonus from EMP ORDER BY department asc").show(truncate=False)


+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Scott        |Finance   |NY   |83000 |36 |19000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
+-------------+----------+-----+------+---+-----+



# groupBy()

In [103]:
"""
count() - Returns the count of rows for each group.

mean() - Returns the mean of values for each group.

max() - Returns the maximum of values for each group.

min() - Returns the minimum of values for each group.

sum() - Returns the total for values for each group.

avg() - Returns the average for values for each group.

agg() - Using agg() function, we can calculate more than one aggregate at a time.

pivot()
"""
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [104]:
df.groupBy("department").sum("salary").show(truncate=False)

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
|Sales     |257000     |
|Finance   |351000     |
|Marketing |171000     |
+----------+-----------+



In [105]:
df.groupBy("department","state").sum("salary","bonus").show()

+----------+-----+-----------+----------+
|department|state|sum(salary)|sum(bonus)|
+----------+-----+-----------+----------+
|   Finance|   NY|     162000|     34000|
| Marketing|   NY|      91000|     21000|
|     Sales|   CA|      81000|     23000|
| Marketing|   CA|      80000|     18000|
|   Finance|   CA|     189000|     47000|
|     Sales|   NY|     176000|     30000|
+----------+-----+-----------+----------+



In [106]:
# Running more aggregates at a time
from pyspark.sql.functions import sum,avg,max,min,mean,count

df.groupBy("department").agg(sum("salary").alias("sum_salry")\
                            ,avg("salary").alias("avg_salary")\
                            ,sum("bonus").alias("sum_bonus")\
                            ,max("bonus").alias("max_bonus")
                            ).show(truncate=False)

+----------+---------+-----------------+---------+---------+
|department|sum_salry|avg_salary       |sum_bonus|max_bonus|
+----------+---------+-----------------+---------+---------+
|Sales     |257000   |85666.66666666667|53000    |23000    |
|Finance   |351000   |87750.0          |81000    |24000    |
|Marketing |171000   |85500.0          |39000    |21000    |
+----------+---------+-----------------+---------+---------+



# Using filter on aggregate data

In [107]:
df.groupBy("department").agg(sum("salary").alias("sum_salry")\
                            ,avg("salary").alias("avg_salary")\
                            ,sum("bonus").alias("sum_bonus")\
                            ,max("bonus").alias("max_bonus")
                            ).where(col("sum_bonus")>=50000).show(truncate=False)

+----------+---------+-----------------+---------+---------+
|department|sum_salry|avg_salary       |sum_bonus|max_bonus|
+----------+---------+-----------------+---------+---------+
|Sales     |257000   |85666.66666666667|53000    |23000    |
|Finance   |351000   |87750.0          |81000    |24000    |
+----------+---------+-----------------+---------+---------+



# JOINS

In [108]:
emp = [(1,"Smith",-1,"2018","10","M",3000), \
    (2,"Rose",1,"2010","20","M",4000), \
    (3,"Williams",1,"2010","10","M",1000), \
    (4,"Jones",2,"2005","10","F",2000), \
    (5,"Brown",2,"2010","40","",-1), \
      (6,"Brown",2,"2010","50","",-1) \
  ]
empColumns = ["emp_id","name","superior_emp_id","year_joined", \
       "emp_dept_id","gender","salary"]

dept = [("Finance",10), \
    ("Marketing",20), \
    ("Sales",30), \
    ("IT",40) \
  ]
deptColumns = ["dept_name","dept_id"]

empDF=spark.createDataFrame(data=emp,schema=empColumns)
deptDF=spark.createDataFrame(data=dept,schema=deptColumns)

In [109]:
empDF.show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
|6     |Brown   |2              |2010       |50         |      |-1    |
+------+--------+---------------+-----------+-----------+------+------+



In [110]:
deptDF.show(truncate=False)

+---------+-------+
|dept_name|dept_id|
+---------+-------+
|Finance  |10     |
|Marketing|20     |
|Sales    |30     |
|IT       |40     |
+---------+-------+



# INNER JOIN

In [111]:
empDF.join(deptDF,empDF.emp_dept_id==deptDF.dept_id,"inner").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



# OUTER JOIN

In [112]:
empDF.join(deptDF,empDF.emp_dept_id==deptDF.dept_id,"outer").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [113]:
empDF.join(deptDF,empDF.emp_dept_id==deptDF.dept_id,"full").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [114]:
empDF.join(deptDF,empDF.emp_dept_id==deptDF.dept_id,"fullouter").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



# Left Outer Join

In [115]:
empDF.join(deptDF,empDF.emp_dept_id==deptDF.dept_id,"left").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



In [116]:
empDF.join(deptDF,empDF.emp_dept_id==deptDF.dept_id,"right").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|
+------+--------+---------------+-----------+-----------+------+------+---------+-------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |
|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |
|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |
|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |
|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |
|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |
+------+--------+---------------+-----------+-----------+------+------+---------+-------+



# Leftsemi

In [117]:
"""
leftsemi join is similar to inner join difference being leftsemi join returns all columns from the left dataset and ignores
all columns from the right dataset. In other words, this join returns columns from the only left dataset for the records match
in the right dataset on join expression, records not matched on join expression are ignored from both left and right datasets.
"""
empDF.join(deptDF,empDF.emp_dept_id==deptDF.dept_id,"leftsemi").show(truncate=False)

+------+--------+---------------+-----------+-----------+------+------+
|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+--------+---------------+-----------+-----------+------+------+
|1     |Smith   |-1             |2018       |10         |M     |3000  |
|3     |Williams|1              |2010       |10         |M     |1000  |
|4     |Jones   |2              |2005       |10         |F     |2000  |
|2     |Rose    |1              |2010       |20         |M     |4000  |
|5     |Brown   |2              |2010       |40         |      |-1    |
+------+--------+---------------+-----------+-----------+------+------+



# Left Anti Join

In [118]:
"""
leftanti join does the exact opposite of the leftsemi, leftanti join returns only columns from the left dataset 
for non-matched records.
"""
empDF.join(deptDF,empDF.emp_dept_id==deptDF.dept_id,"leftanti").show(truncate=False)

+------+-----+---------------+-----------+-----------+------+------+
|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|
+------+-----+---------------+-----------+-----------+------+------+
|6     |Brown|2              |2010       |50         |      |-1    |
+------+-----+---------------+-----------+-----------+------+------+



# Self Join

In [119]:
empDF.alias("emp1").join(empDF.alias("emp2")\
                        ,col("emp1.superior_emp_id")==col("emp2.emp_id")
                        ,"inner").select(
                                    col("emp1.emp_id"),col("emp1.name")\
                                    ,col("emp2.emp_id").alias("superior_emp_id")\
                                    ,col("emp2.name").alias("superior_emp_name")
                                ).show(truncate=False)

+------+--------+---------------+-----------------+
|emp_id|name    |superior_emp_id|superior_emp_name|
+------+--------+---------------+-----------------+
|2     |Rose    |1              |Smith            |
|3     |Williams|1              |Smith            |
|4     |Jones   |2              |Rose             |
|5     |Brown   |2              |Rose             |
|6     |Brown   |2              |Rose             |
+------+--------+---------------+-----------------+



# Union and UnionAll

In [120]:
df.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [121]:
df2=df
df2.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
+-------------+----------+-----+------+---+-----+



In [122]:
unionDF=df.union(df2)
unionDF.show()

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|
|        Kumar| Marketing|   NY| 91000| 50|21000|
|        James|     Sales|   NY| 90000| 34|10000|
|      Michael|     Sales|   NY| 86000| 56|20000|
|       Robert|     Sales|   CA| 81000| 30|23000|
|        Maria|   Finance|   CA| 90000| 24|23000|
|        Raman|   Finance|   CA| 99000| 40|24000|
|        Scott|   Finance|   NY| 83000| 36|19000|
|          Jen|   Finance|   NY| 79000| 53|15000|
|         Jeff| Marketing|   CA| 80000| 25|18000|


In [123]:
# DataFrame unionAll() method is deprecated since PySpark “2.0.0” version and recommends using the union() method
unionAllDF=df.unionAll(df2)
unionAllDF.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|James        |Sales     |NY   |90000 |34 |10000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Robert       |Sales     |CA   |81000 |30 |23000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|


In [124]:
# this how we can do only union
disDF=df.union(df2).distinct()
disDF.show(truncate=False)

+-------------+----------+-----+------+---+-----+
|employee_name|department|state|salary|age|bonus|
+-------------+----------+-----+------+---+-----+
|Raman        |Finance   |CA   |99000 |40 |24000|
|Scott        |Finance   |NY   |83000 |36 |19000|
|James        |Sales     |NY   |90000 |34 |10000|
|Maria        |Finance   |CA   |90000 |24 |23000|
|Kumar        |Marketing |NY   |91000 |50 |21000|
|Michael      |Sales     |NY   |86000 |56 |20000|
|Jen          |Finance   |NY   |79000 |53 |15000|
|Jeff         |Marketing |CA   |80000 |25 |18000|
|Robert       |Sales     |CA   |81000 |30 |23000|
+-------------+----------+-----+------+---+-----+



# Merge Two DataFrames with Different Columns or Schema

In [125]:
data = [("James","Sales",34), ("Michael","Sales",56), \
    ("Robert","Sales",30), ("Maria","Finance",24) ]
columns= ["name","dept","age"]

df1=spark.createDataFrame(data=data,schema=columns)
df1.printSchema()

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- age: long (nullable = true)



In [126]:
df1.show()

+-------+-------+---+
|   name|   dept|age|
+-------+-------+---+
|  James|  Sales| 34|
|Michael|  Sales| 56|
| Robert|  Sales| 30|
|  Maria|Finance| 24|
+-------+-------+---+



In [127]:
data2=[
       ("James","Sales","NY",9000),
       ("Maria","Finance","CA",9000), \
       ("Jen","Finance","NY",7900),
       ("Jeff","Marketing","CA",8000)
      ]
columns2= ["name","dept","state","salary"]

df2=spark.createDataFrame(data=data2,schema=columns2)
df2.printSchema()

root
 |-- name: string (nullable = true)
 |-- dept: string (nullable = true)
 |-- state: string (nullable = true)
 |-- salary: long (nullable = true)



In [128]:
df2.show()

+-----+---------+-----+------+
| name|     dept|state|salary|
+-----+---------+-----+------+
|James|    Sales|   NY|  9000|
|Maria|  Finance|   CA|  9000|
|  Jen|  Finance|   NY|  7900|
| Jeff|Marketing|   CA|  8000|
+-----+---------+-----+------+



In [129]:
for column in [column for column in df2.columns if column not in df1.columns]:
    df1 = df1.withColumn(column, lit(None))

In [130]:
df1.show()

+-------+-------+---+-----+------+
|   name|   dept|age|state|salary|
+-------+-------+---+-----+------+
|  James|  Sales| 34| null|  null|
|Michael|  Sales| 56| null|  null|
| Robert|  Sales| 30| null|  null|
|  Maria|Finance| 24| null|  null|
+-------+-------+---+-----+------+



In [131]:
for column in [column for column in df1.columns if column not in df2.columns]:
    df2 = df2.withColumn(column, lit(None))
    
df2.show()

+-----+---------+-----+------+----+
| name|     dept|state|salary| age|
+-----+---------+-----+------+----+
|James|    Sales|   NY|  9000|null|
|Maria|  Finance|   CA|  9000|null|
|  Jen|  Finance|   NY|  7900|null|
| Jeff|Marketing|   CA|  8000|null|
+-----+---------+-----+------+----+



In [132]:
merged_df=df1.unionByName(df2)
merged_df.show()

+-------+---------+----+-----+------+
|   name|     dept| age|state|salary|
+-------+---------+----+-----+------+
|  James|    Sales|  34| null|  null|
|Michael|    Sales|  56| null|  null|
| Robert|    Sales|  30| null|  null|
|  Maria|  Finance|  24| null|  null|
|  James|    Sales|null|   NY|  9000|
|  Maria|  Finance|null|   CA|  9000|
|    Jen|  Finance|null|   NY|  7900|
|   Jeff|Marketing|null|   CA|  8000|
+-------+---------+----+-----+------+



# map()

In [133]:
data = ["Project","Gutenberg’s","Alice’s","Adventures",
"in","Wonderland","Project","Gutenberg’s","Adventures",
"in","Wonderland","Project","Gutenberg’s"]

rdd=spark.sparkContext.parallelize(data)
rdd.collect()

['Project',
 'Gutenberg’s',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s']

In [134]:
rdd2=rdd.map(lambda x:(x,1))

for element in rdd2.collect():
    print(element)

('Project', 1)
('Gutenberg’s', 1)
('Alice’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)
('Adventures', 1)
('in', 1)
('Wonderland', 1)
('Project', 1)
('Gutenberg’s', 1)


# map() with DataFrame

In [135]:
data = [('James','Smith','M',30),
  ('Anna','Rose','F',41),
  ('Robert','Williams','M',62), 
]

columns = ["firstname","lastname","gender","salary"]

df=spark.createDataFrame(data=data,schema=columns)
df.show()

+---------+--------+------+------+
|firstname|lastname|gender|salary|
+---------+--------+------+------+
|    James|   Smith|     M|    30|
|     Anna|    Rose|     F|    41|
|   Robert|Williams|     M|    62|
+---------+--------+------+------+



In [136]:
rdd2=df.rdd.map(lambda x:(x[0]+","+x[1],x[2],x[3]*2))

df2=rdd2.toDF(["name","gender","new_salary"])

df2.show()

+---------------+------+----------+
|           name|gender|new_salary|
+---------------+------+----------+
|    James,Smith|     M|        60|
|      Anna,Rose|     F|        82|
|Robert,Williams|     M|       124|
+---------------+------+----------+



In [137]:
def func1(x):
    firstName=x.firstname
    lastName=x.lastname
    name=firstName+","+lastName
    gender=x.gender.lower()
    salary=x.salary*2
    return (name,gender,salary)

rdd2=df.rdd.map(lambda x: func1(x))

rdd2.collect()

[('James,Smith', 'm', 60),
 ('Anna,Rose', 'f', 82),
 ('Robert,Williams', 'm', 124)]

# flatmap()

In [138]:
data = ["Project Gutenberg’s",
        "Alice’s Adventures in Wonderland",
        "Project Gutenberg’s",
        "Adventures in Wonderland",
        "Project Gutenberg’s"]
rdd = spark.sparkContext.parallelize(data)

rdd.collect()

['Project Gutenberg’s',
 'Alice’s Adventures in Wonderland',
 'Project Gutenberg’s',
 'Adventures in Wonderland',
 'Project Gutenberg’s']

In [139]:
rdd2=rdd.flatMap(lambda x:x.split(" "))

rdd2.collect()

['Project',
 'Gutenberg’s',
 'Alice’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s',
 'Adventures',
 'in',
 'Wonderland',
 'Project',
 'Gutenberg’s']

In [140]:
rdd3=rdd.map(lambda x:x.split(" "))

rdd3.collect()

[['Project', 'Gutenberg’s'],
 ['Alice’s', 'Adventures', 'in', 'Wonderland'],
 ['Project', 'Gutenberg’s'],
 ['Adventures', 'in', 'Wonderland'],
 ['Project', 'Gutenberg’s']]

# fillna() & fill()

In [141]:
path="D:/PySpark/pyspark-examples-master/resources/"

df=spark.read.format("csv").options(header="true",inferSchema="true").load(path+"small_zipcode.csv")

df.show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



In [142]:
df.na.fill(value=0).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|               null|   PR|     30100|
|  2|    704|    null|PASEO COSTA DEL SUR|   PR|         0|
|  3|    709|    null|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|               null|   TX|         0|
+---+-------+--------+-------------------+-----+----------+



In [143]:
df.na.fill("unknown",["city"])\
.na.fill("",["type"]).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|            unknown|   PR|     30100|
|  2|    704|        |PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|        |       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|            unknown|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



In [144]:
df.na.fill({"city":"unknown","type":"NA"}).show()

+---+-------+--------+-------------------+-----+----------+
| id|zipcode|    type|               city|state|population|
+---+-------+--------+-------------------+-----+----------+
|  1|    704|STANDARD|            unknown|   PR|     30100|
|  2|    704|      NA|PASEO COSTA DEL SUR|   PR|      null|
|  3|    709|      NA|       BDA SAN LUIS|   PR|      3700|
|  4|  76166|  UNIQUE|  CINGULAR WIRELESS|   TX|     84000|
|  5|  76177|STANDARD|            unknown|   TX|      null|
+---+-------+--------+-------------------+-----+----------+



# pivot()

In [145]:
data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), \
      ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), \
      ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), \
      ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")]

columns= ["Product","Amount","Country"]

df = spark.createDataFrame(data=data,schema=columns)

df.printSchema()

root
 |-- Product: string (nullable = true)
 |-- Amount: long (nullable = true)
 |-- Country: string (nullable = true)



In [146]:
df.show()

+-------+------+-------+
|Product|Amount|Country|
+-------+------+-------+
| Banana|  1000|    USA|
|Carrots|  1500|    USA|
|  Beans|  1600|    USA|
| Orange|  2000|    USA|
| Orange|  2000|    USA|
| Banana|   400|  China|
|Carrots|  1200|  China|
|  Beans|  1500|  China|
| Orange|  4000|  China|
| Banana|  2000| Canada|
|Carrots|  2000| Canada|
|  Beans|  2000| Mexico|
+-------+------+-------+



In [147]:
df.groupBy("Product").sum("Amount").show()

+-------+-----------+
|Product|sum(Amount)|
+-------+-----------+
| Orange|       8000|
|  Beans|       5100|
| Banana|       3400|
|Carrots|       4700|
+-------+-----------+



In [148]:
pivotDF=df.groupBy("Product").pivot("Country").sum("Amount")
pivotDF.printSchema()

root
 |-- Product: string (nullable = true)
 |-- Canada: long (nullable = true)
 |-- China: long (nullable = true)
 |-- Mexico: long (nullable = true)
 |-- USA: long (nullable = true)



In [149]:
pivotDF.show(truncate=False)

+-------+------+-----+------+----+
|Product|Canada|China|Mexico|USA |
+-------+------+-----+------+----+
|Orange |null  |4000 |null  |4000|
|Beans  |null  |1500 |2000  |1600|
|Banana |2000  |400  |null  |1000|
|Carrots|2000  |1200 |null  |1500|
+-------+------+-----+------+----+



In [150]:
countries = ["USA","China","Canada","Mexico"]
pivotDF = df.groupBy("Product").pivot("Country", countries).sum("Amount")
pivotDF.show(truncate=False)

+-------+----+-----+------+------+
|Product|USA |China|Canada|Mexico|
+-------+----+-----+------+------+
|Orange |4000|4000 |null  |null  |
|Beans  |1600|1500 |null  |2000  |
|Banana |1000|400  |2000  |null  |
|Carrots|1500|1200 |2000  |null  |
+-------+----+-----+------+------+



In [151]:
pivotDF = df.groupBy("Product","Country") \
      .sum("Amount") \
      .groupBy("Product") \
      .pivot("Country") \
      .sum("sum(Amount)") 
pivotDF.show(truncate=False)

+-------+------+-----+------+----+
|Product|Canada|China|Mexico|USA |
+-------+------+-----+------+----+
|Orange |null  |4000 |null  |4000|
|Beans  |null  |1500 |2000  |1600|
|Banana |2000  |400  |null  |1000|
|Carrots|2000  |1200 |null  |1500|
+-------+------+-----+------+----+



# partitionBy()

In [152]:
df=spark.read.format("csv").option("header",True).load(path+"simple-zipcodes.csv")

df.printSchema()

root
 |-- RecordNumber: string (nullable = true)
 |-- Country: string (nullable = true)
 |-- City: string (nullable = true)
 |-- Zipcode: string (nullable = true)
 |-- State: string (nullable = true)



In [153]:
df.write.option("header",True).partitionBy("state").mode("overwrite").csv(path+"/output_data/zipcodes-state")

In [154]:
# by multiple columns
df.write.option("header",True).partitionBy("state","city").mode("overwrite").csv(path+"/output_data/zipcodes")

In [155]:
df.write.option("header",True).option("maxRecordsPerFile",2)\
.partitionBy("state")\
.mode("overwrite")\
.csv(path+"/output_data/zipcodes_1")

In [156]:
# read partition data
parDF=spark.read.format("csv").option("header",True).load(path+"output_data/zipcodes-state")

parDF.show()

+------------+-------+-------------------+-------+-----+
|RecordNumber|Country|               City|Zipcode|state|
+------------+-------+-------------------+-------+-----+
|       49347|     US|               HOLT|  32564|   FL|
|       49348|     US|          HOMOSASSA|  34487|   FL|
|       49345|     US|           HILLIARD|  32046|   FL|
|       49346|     US|             HOLDER|  34445|   FL|
|       61391|     US|  CINGULAR WIRELESS|  76166|   TX|
|       61392|     US|         FORT WORTH|  76177|   TX|
|       61393|     US|           FT WORTH|  76177|   TX|
|       54356|     US|        SPRUCE PINE|  35585|   AL|
|       54354|     US|      SPRING GARDEN|  36275|   AL|
|       54355|     US|        SPRINGVILLE|  35146|   AL|
|       39827|     US|               MESA|  85209|   AZ|
|       39828|     US|               MESA|  85210|   AZ|
|       76511|     US|           ASH HILL|  27007|   NC|
|       76512|     US|           ASHEBORO|  27203|   NC|
|       76513|     US|         

# ArraType()

In [157]:
from pyspark.sql.types import StringType,ArrayType

arrayCol=ArrayType(StringType(),False)


In [158]:
type(arrayCol)

pyspark.sql.types.ArrayType

In [159]:
data = [
 ("James,,Smith",["Java","Scala","C++"],["Spark","Java"],"OH","CA"),
 ("Michael,Rose,",["Spark","Java","C++"],["Spark","Java"],"NY","NJ"),
 ("Robert,,Williams",["CSharp","VB"],["Spark","Python"],"UT","NV")
]

schema = StructType([
                        StructField("name",StringType(),True),
                        StructField("languagesAtSchool",ArrayType(StringType()),True),
                        StructField("languagesAtWork",ArrayType(StringType()),True),
                        StructField("currentState",StringType(),True),
                        StructField("previousState",StringType(),True)  ])

df=spark.createDataFrame(data=data,schema=schema)
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- languagesAtSchool: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- languagesAtWork: array (nullable = true)
 |    |-- element: string (containsNull = true)
 |-- currentState: string (nullable = true)
 |-- previousState: string (nullable = true)



In [160]:
df.show()

+----------------+------------------+---------------+------------+-------------+
|            name| languagesAtSchool|languagesAtWork|currentState|previousState|
+----------------+------------------+---------------+------------+-------------+
|    James,,Smith|[Java, Scala, C++]|  [Spark, Java]|          OH|           CA|
|   Michael,Rose,|[Spark, Java, C++]|  [Spark, Java]|          NY|           NJ|
|Robert,,Williams|      [CSharp, VB]|[Spark, Python]|          UT|           NV|
+----------------+------------------+---------------+------------+-------------+



# explode()

In [161]:
"""Use explode() function to create a new row for each element in the given array column"""
from pyspark.sql.functions import explode

df.select(df["name"],explode(df["languagesAtSchool"])).show()

+----------------+------+
|            name|   col|
+----------------+------+
|    James,,Smith|  Java|
|    James,,Smith| Scala|
|    James,,Smith|   C++|
|   Michael,Rose,| Spark|
|   Michael,Rose,|  Java|
|   Michael,Rose,|   C++|
|Robert,,Williams|CSharp|
|Robert,,Williams|    VB|
+----------------+------+



In [162]:
from pyspark.sql.functions import split

df.select(split(df["name"],",")).alias("namesArray").show()

+--------------------+
|  split(name, ,, -1)|
+--------------------+
|    [James, , Smith]|
|   [Michael, Rose, ]|
|[Robert, , Williams]|
+--------------------+



# array()

In [163]:
"""Use array() function to create a new array column by merging the data from multiple columns"""
from pyspark.sql.functions import array

df.select(df["name"],array(df["currentState"],df["previousState"]).alias("States")).show()

+----------------+--------+
|            name|  States|
+----------------+--------+
|    James,,Smith|[OH, CA]|
|   Michael,Rose,|[NY, NJ]|
|Robert,,Williams|[UT, NV]|
+----------------+--------+



# array_contains()

In [164]:
"""array_contains() sql function is used to check if array column contains a value"""
from pyspark.sql.functions import array_contains

df.select(df["name"],array_contains(df["languagesAtSchool"],"Java").alias("array_contains")).show()

+----------------+--------------+
|            name|array_contains|
+----------------+--------------+
|    James,,Smith|          true|
|   Michael,Rose,|          true|
|Robert,,Williams|         false|
+----------------+--------------+



# MapType()

In [165]:
"""PySpark MapType (also called map type) is a data type to represent Python Dictionary (dict) to store key-value pair"""
from pyspark.sql.types import MapType

schema=StructType([
    StructField("name",StringType(),True),
    StructField("properties",MapType(StringType(),StringType()),True)
])

dataDictionary = [
        ('James',{'hair':'black','eye':'brown'}),
        ('Michael',{'hair':'brown','eye':None}),
        ('Robert',{'hair':'red','eye':'black'}),
        ('Washington',{'hair':'grey','eye':'grey'}),
        ('Jefferson',{'hair':'brown','eye':''})
        ]

df=spark.createDataFrame(data=dataDictionary,schema=schema)

df.printSchema()

root
 |-- name: string (nullable = true)
 |-- properties: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [166]:
df.show(truncate=False)

+----------+-----------------------------+
|name      |properties                   |
+----------+-----------------------------+
|James     |[eye -> brown, hair -> black]|
|Michael   |[eye ->, hair -> brown]      |
|Robert    |[eye -> black, hair -> red]  |
|Washington|[eye -> grey, hair -> grey]  |
|Jefferson |[eye -> , hair -> brown]     |
+----------+-----------------------------+



# PySpark MapType Elements

In [167]:
df3=df.rdd.map(lambda x:(x["name"],x["properties"]["hair"],x["properties"]["eye"])).toDF(["name","hair","eye"])

df3.printSchema()

root
 |-- name: string (nullable = true)
 |-- hair: string (nullable = true)
 |-- eye: string (nullable = true)



In [168]:
df3.show()

+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



In [169]:
df.withColumn("hair",df["properties"].getItem("hair"))\
.withColumn("eye",df["properties"].getItem("eye"))\
.drop("properties")\
.show()


+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



In [170]:
df.withColumn("hair",df["properties"]["hair"])\
.withColumn("eye",df["properties"]["eye"])\
.drop("properties")\
.show()

+----------+-----+-----+
|      name| hair|  eye|
+----------+-----+-----+
|     James|black|brown|
|   Michael|brown| null|
|    Robert|  red|black|
|Washington| grey| grey|
| Jefferson|brown|     |
+----------+-----+-----+



In [171]:
df.select(df.name,explode(df.properties)).show()

+----------+----+-----+
|      name| key|value|
+----------+----+-----+
|     James| eye|brown|
|     James|hair|black|
|   Michael| eye| null|
|   Michael|hair|brown|
|    Robert| eye|black|
|    Robert|hair|  red|
|Washington| eye| grey|
|Washington|hair| grey|
| Jefferson| eye|     |
| Jefferson|hair|brown|
+----------+----+-----+



# map_keys()

In [172]:
from pyspark.sql.functions import map_keys

df.select(df["name"],map_keys(df["properties"])).show()

+----------+--------------------+
|      name|map_keys(properties)|
+----------+--------------------+
|     James|         [eye, hair]|
|   Michael|         [eye, hair]|
|    Robert|         [eye, hair]|
|Washington|         [eye, hair]|
| Jefferson|         [eye, hair]|
+----------+--------------------+



In [173]:
from pyspark.sql.functions import map_values

df.select(df["name"],map_values(df["properties"])).show()

+----------+----------------------+
|      name|map_values(properties)|
+----------+----------------------+
|     James|        [brown, black]|
|   Michael|             [, brown]|
|    Robert|          [black, red]|
|Washington|          [grey, grey]|
| Jefferson|             [, brown]|
+----------+----------------------+



# PySpark Window Functions

In [174]:
simpleData = (("James", "Sales", 3000), \
    ("Michael", "Sales", 4600),  \
    ("Robert", "Sales", 4100),   \
    ("Maria", "Finance", 3000),  \
    ("James", "Sales", 3000),    \
    ("Scott", "Finance", 3300),  \
    ("Jen", "Finance", 3900),    \
    ("Jeff", "Marketing", 3000), \
    ("Kumar", "Marketing", 2000),\
    ("Saif", "Sales", 4100) )

columns= ["employee_name", "department", "salary"]

df=spark.createDataFrame(data=simpleData,schema=columns)

df.printSchema()

root
 |-- employee_name: string (nullable = true)
 |-- department: string (nullable = true)
 |-- salary: long (nullable = true)



In [175]:
df.show(truncate=False)

+-------------+----------+------+
|employee_name|department|salary|
+-------------+----------+------+
|James        |Sales     |3000  |
|Michael      |Sales     |4600  |
|Robert       |Sales     |4100  |
|Maria        |Finance   |3000  |
|James        |Sales     |3000  |
|Scott        |Finance   |3300  |
|Jen          |Finance   |3900  |
|Jeff         |Marketing |3000  |
|Kumar        |Marketing |2000  |
|Saif         |Sales     |4100  |
+-------------+----------+------+



Ranking functions

In [176]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number

windowSpec = Window.partitionBy("department").orderBy("salary")

df.withColumn("row_number",row_number().over(windowSpec)).show(truncate=False)

+-------------+----------+------+----------+
|employee_name|department|salary|row_number|
+-------------+----------+------+----------+
|James        |Sales     |3000  |1         |
|James        |Sales     |3000  |2         |
|Robert       |Sales     |4100  |3         |
|Saif         |Sales     |4100  |4         |
|Michael      |Sales     |4600  |5         |
|Maria        |Finance   |3000  |1         |
|Scott        |Finance   |3300  |2         |
|Jen          |Finance   |3900  |3         |
|Kumar        |Marketing |2000  |1         |
|Jeff         |Marketing |3000  |2         |
+-------------+----------+------+----------+



In [177]:
from pyspark.sql.functions import monotonically_increasing_id
df.withColumn("idx", monotonically_increasing_id()).show()

+-------------+----------+------+-----------+
|employee_name|department|salary|        idx|
+-------------+----------+------+-----------+
|        James|     Sales|  3000|          0|
|      Michael|     Sales|  4600|          1|
|       Robert|     Sales|  4100| 8589934592|
|        Maria|   Finance|  3000| 8589934593|
|        James|     Sales|  3000|17179869184|
|        Scott|   Finance|  3300|17179869185|
|          Jen|   Finance|  3900|25769803776|
|         Jeff| Marketing|  3000|25769803777|
|        Kumar| Marketing|  2000|25769803778|
|         Saif|     Sales|  4100|25769803779|
+-------------+----------+------+-----------+



# rank()

In [178]:
from pyspark.sql.functions import rank

df.withColumn("rank",rank().over(windowSpec)).show()

+-------------+----------+------+----+
|employee_name|department|salary|rank|
+-------------+----------+------+----+
|        James|     Sales|  3000|   1|
|        James|     Sales|  3000|   1|
|       Robert|     Sales|  4100|   3|
|         Saif|     Sales|  4100|   3|
|      Michael|     Sales|  4600|   5|
|        Maria|   Finance|  3000|   1|
|        Scott|   Finance|  3300|   2|
|          Jen|   Finance|  3900|   3|
|        Kumar| Marketing|  2000|   1|
|         Jeff| Marketing|  3000|   2|
+-------------+----------+------+----+



# dense_rank()

In [179]:
from pyspark.sql.functions import dense_rank

df.withColumn("dense_rank",dense_rank().over(windowSpec)).show()

+-------------+----------+------+----------+
|employee_name|department|salary|dense_rank|
+-------------+----------+------+----------+
|        James|     Sales|  3000|         1|
|        James|     Sales|  3000|         1|
|       Robert|     Sales|  4100|         2|
|         Saif|     Sales|  4100|         2|
|      Michael|     Sales|  4600|         3|
|        Maria|   Finance|  3000|         1|
|        Scott|   Finance|  3300|         2|
|          Jen|   Finance|  3900|         3|
|        Kumar| Marketing|  2000|         1|
|         Jeff| Marketing|  3000|         2|
+-------------+----------+------+----------+



# percent_rank()

In [180]:
from pyspark.sql.functions import percent_rank

df.withColumn("percent_rank",percent_rank().over(windowSpec)).show()

+-------------+----------+------+------------+
|employee_name|department|salary|percent_rank|
+-------------+----------+------+------------+
|        James|     Sales|  3000|         0.0|
|        James|     Sales|  3000|         0.0|
|       Robert|     Sales|  4100|         0.5|
|         Saif|     Sales|  4100|         0.5|
|      Michael|     Sales|  4600|         1.0|
|        Maria|   Finance|  3000|         0.0|
|        Scott|   Finance|  3300|         0.5|
|          Jen|   Finance|  3900|         1.0|
|        Kumar| Marketing|  2000|         0.0|
|         Jeff| Marketing|  3000|         1.0|
+-------------+----------+------+------------+



# lag()

In [181]:
from pyspark.sql.functions import lag
df.withColumn("lag",lag("salary",2).over(windowSpec)).show()

+-------------+----------+------+----+
|employee_name|department|salary| lag|
+-------------+----------+------+----+
|        James|     Sales|  3000|null|
|        James|     Sales|  3000|null|
|       Robert|     Sales|  4100|3000|
|         Saif|     Sales|  4100|3000|
|      Michael|     Sales|  4600|4100|
|        Maria|   Finance|  3000|null|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|3000|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+



# lead()

In [182]:
from pyspark.sql.functions import lead
df.withColumn("lead",lead("salary",2).over(windowSpec)).show()

+-------------+----------+------+----+
|employee_name|department|salary|lead|
+-------------+----------+------+----+
|        James|     Sales|  3000|4100|
|        James|     Sales|  3000|4100|
|       Robert|     Sales|  4100|4600|
|         Saif|     Sales|  4100|null|
|      Michael|     Sales|  4600|null|
|        Maria|   Finance|  3000|3900|
|        Scott|   Finance|  3300|null|
|          Jen|   Finance|  3900|null|
|        Kumar| Marketing|  2000|null|
|         Jeff| Marketing|  3000|null|
+-------------+----------+------+----+



# Window Aggregate Functions

In [183]:
"""I will explain how to calculate sum, min, max for each department using PySpark SQL Aggregate window functions and
WindowSpec. When working with Aggregate functions, we don’t need to use order by clause."""

windowSpecAgg  = Window.partitionBy("department")
from pyspark.sql.functions import col,avg,sum,min,max,row_number 
df.withColumn("row",row_number().over(windowSpec)) \
  .withColumn("avg", avg(col("salary")).over(windowSpecAgg)) \
  .withColumn("sum", sum(col("salary")).over(windowSpecAgg)) \
  .withColumn("min", min(col("salary")).over(windowSpecAgg)) \
  .withColumn("max", max(col("salary")).over(windowSpecAgg)) \
  .where(col("row")==1).select("department","avg","sum","min","max","row") \
  .show()

+----------+------+-----+----+----+---+
|department|   avg|  sum| min| max|row|
+----------+------+-----+----+----+---+
|     Sales|3760.0|18800|3000|4600|  1|
|   Finance|3400.0|10200|3000|3900|  1|
| Marketing|2500.0| 5000|2000|3000|  1|
+----------+------+-----+----+----+---+



# PySpark SQL Date Functions

In [184]:
from pyspark.sql.functions import *

data=[["1","2020-02-01"],["2","2019-03-01"],["3","2021-03-01"]]

df=spark.createDataFrame(data,["id","date"])
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- date: string (nullable = true)



In [185]:
df.show()

+---+----------+
| id|      date|
+---+----------+
|  1|2020-02-01|
|  2|2019-03-01|
|  3|2021-03-01|
+---+----------+



# current_date()

In [186]:
df.select(current_date().alias("current_date")).show(1)

+------------+
|current_date|
+------------+
|  2021-11-17|
+------------+
only showing top 1 row



# date_format()

In [187]:
df.select(col("date"),date_format(col("date"),"MM-dd-yyyy").alias("date_format")).show()

+----------+-----------+
|      date|date_format|
+----------+-----------+
|2020-02-01| 02-01-2020|
|2019-03-01| 03-01-2019|
|2021-03-01| 03-01-2021|
+----------+-----------+



# to_date()

In [188]:
df.select(col("date"),to_date(col("date"),"yyyy-MM-dd").alias("to_date")).show()

+----------+----------+
|      date|   to_date|
+----------+----------+
|2020-02-01|2020-02-01|
|2019-03-01|2019-03-01|
|2021-03-01|2021-03-01|
+----------+----------+



# datediff()

In [189]:
df.select(col("date"),datediff(current_date(),col("date")).alias("datediff")).show()

+----------+--------+
|      date|datediff|
+----------+--------+
|2020-02-01|     655|
|2019-03-01|     992|
|2021-03-01|     261|
+----------+--------+



# months_between()

In [190]:
df.select(col("date"),months_between(current_date(),col("date")).alias("months_between_dates")).show()

+----------+--------------------+
|      date|months_between_dates|
+----------+--------------------+
|2020-02-01|         21.51612903|
|2019-03-01|         32.51612903|
|2021-03-01|          8.51612903|
+----------+--------------------+



# add_months() , date_add(), date_sub()

In [191]:

#add_months() , date_add(), date_sub()
df.select(col("date"), 
    add_months(col("date"),3).alias("add_months"), 
    add_months(col("date"),-3).alias("sub_months"), 
    date_add(col("date"),4).alias("date_add"), 
    date_sub(col("date"),4).alias("date_sub") 
  ).show()


+----------+----------+----------+----------+----------+
|      date|add_months|sub_months|  date_add|  date_sub|
+----------+----------+----------+----------+----------+
|2020-02-01|2020-05-01|2019-11-01|2020-02-05|2020-01-28|
|2019-03-01|2019-06-01|2018-12-01|2019-03-05|2019-02-25|
|2021-03-01|2021-06-01|2020-12-01|2021-03-05|2021-02-25|
+----------+----------+----------+----------+----------+



# year(), month(), month(),next_day(), weekofyear()

In [192]:
df.select(col("date"), 
     year(col("date")).alias("year"), 
     month(col("date")).alias("month"), 
     next_day(col("date"),"Sunday").alias("next_day"), 
     weekofyear(col("date")).alias("weekofyear") 
  ).show()

+----------+----+-----+----------+----------+
|      date|year|month|  next_day|weekofyear|
+----------+----+-----+----------+----------+
|2020-02-01|2020|    2|2020-02-02|         5|
|2019-03-01|2019|    3|2019-03-03|         9|
|2021-03-01|2021|    3|2021-03-07|         9|
+----------+----+-----+----------+----------+



# dayofweek(), dayofmonth(), dayofyear()

In [193]:
df.select(col("date"),  
     dayofweek(col("date")).alias("dayofweek"), 
     dayofmonth(col("date")).alias("dayofmonth"), 
     dayofyear(col("date")).alias("dayofyear"), 
  ).show()

+----------+---------+----------+---------+
|      date|dayofweek|dayofmonth|dayofyear|
+----------+---------+----------+---------+
|2020-02-01|        7|         1|       32|
|2019-03-01|        6|         1|       60|
|2021-03-01|        2|         1|       60|
+----------+---------+----------+---------+



# current_timestamp()

In [194]:
data=[["1","02-01-2020 11 01 19 06"],["2","03-01-2019 12 01 19 406"],["3","03-01-2021 12 01 19 406"]]
df2=spark.createDataFrame(data,["id","input"])
df2.show(truncate=False)

+---+-----------------------+
|id |input                  |
+---+-----------------------+
|1  |02-01-2020 11 01 19 06 |
|2  |03-01-2019 12 01 19 406|
|3  |03-01-2021 12 01 19 406|
+---+-----------------------+



In [195]:
df2.select(current_timestamp().alias("current_timestamp")
  ).show(1,truncate=False)

+-----------------------+
|current_timestamp      |
+-----------------------+
|2021-11-17 14:23:41.746|
+-----------------------+
only showing top 1 row



# to_timestamp()

In [196]:
df2.select(col("input"), 
    to_timestamp(col("input"), "MM-dd-yyyy HH mm ss SSS").alias("to_timestamp") 
  ).show(truncate=False)

+-----------------------+-----------------------+
|input                  |to_timestamp           |
+-----------------------+-----------------------+
|02-01-2020 11 01 19 06 |null                   |
|03-01-2019 12 01 19 406|2019-03-01 12:01:19.406|
|03-01-2021 12 01 19 406|2021-03-01 12:01:19.406|
+-----------------------+-----------------------+



In [197]:
data=[["1","2020-02-01 11:01:19.06"],["2","2019-03-01 12:01:19.406"],["3","2021-03-01 12:01:19.406"]]
df3=spark.createDataFrame(data,["id","input"])

df3.select(col("input"), 
    hour(col("input")).alias("hour"), 
    minute(col("input")).alias("minute"),
    second(col("input")).alias("second") 
  ).show(truncate=False)

+-----------------------+----+------+------+
|input                  |hour|minute|second|
+-----------------------+----+------+------+
|2020-02-01 11:01:19.06 |11  |1     |19    |
|2019-03-01 12:01:19.406|12  |1     |19    |
|2021-03-01 12:01:19.406|12  |1     |19    |
+-----------------------+----+------+------+



# PySpark JSON Functions

In [199]:
from pyspark.sql import Row

jsonString="""{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}"""
df=spark.createDataFrame([(1, jsonString)],["id","value"])
df.show(truncate=False)

+---+--------------------------------------------------------------------------+
|id |value                                                                     |
+---+--------------------------------------------------------------------------+
|1  |{"Zipcode":704,"ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}|
+---+--------------------------------------------------------------------------+



# from_json()

In [200]:
""" from_json() function is used to convert JSON string into Struct type or Map type."""
from pyspark.sql.functions import from_json

df2=df.withColumn("value",from_json(df.value,MapType(StringType(),StringType())))

df2.printSchema()

root
 |-- id: long (nullable = true)
 |-- value: map (nullable = true)
 |    |-- key: string
 |    |-- value: string (valueContainsNull = true)



In [202]:
df2.show(truncate=False)

+---+---------------------------------------------------------------------------+
|id |value                                                                      |
+---+---------------------------------------------------------------------------+
|1  |[Zipcode -> 704, ZipCodeType -> STANDARD, City -> PARC PARQUE, State -> PR]|
+---+---------------------------------------------------------------------------+



# to_json()

In [203]:
"""function is used to convert DataFrame columns MapType or Struct type to JSON string"""
from pyspark.sql.functions import to_json

df2.withColumn("value",to_json(col("value"))).show(truncate=False)

+---+----------------------------------------------------------------------------+
|id |value                                                                       |
+---+----------------------------------------------------------------------------+
|1  |{"Zipcode":"704","ZipCodeType":"STANDARD","City":"PARC PARQUE","State":"PR"}|
+---+----------------------------------------------------------------------------+



# json_tuple()

In [205]:
"""json_tuple() is used the query or extract the elements from JSON column and create the result as a new columns"""
from pyspark.sql.functions import json_tuple

df.select(col("id"),json_tuple(col("value"),"Zipcode","ZipCodeType","City")) \
    .toDF("id","Zipcode","ZipCodeType","City") \
    .show(truncate=False)

+---+-------+-----------+-----------+
|id |Zipcode|ZipCodeType|City       |
+---+-------+-----------+-----------+
|1  |704    |STANDARD   |PARC PARQUE|
+---+-------+-----------+-----------+



# get_json_object()

In [206]:
from pyspark.sql.functions import get_json_object
df.select(col("id"),get_json_object(col("value"),"$.ZipCodeType").alias("ZipCodeType")) \
    .show(truncate=False)

+---+-----------+
|id |ZipCodeType|
+---+-----------+
|1  |STANDARD   |
+---+-----------+

