In [0]:
#Aggregation like count,sum,avg,min,max

emp_data = [
(1,'manish',26,20000,'india','IT'),
(2,'rahul',None,40000,'germany','engineering'),
(3,'pawan',12,60000,'india','sales'),
(4,'roshini',44,None,'uk','engineering'),
(5,'raushan',35,70000,'india','sales'),
(6,None,29,200000,'uk','IT'),
(7,'adam',37,65000,'us','IT'),
(8,'chris',16,40000,'us','sales'),
(None,None,None,None,None,None),
(7,'adam',37,65000,'us','IT')
]

schema = ['id','name','age','sal','country','dept']

df = spark.createDataFrame(data = emp_data, schema = schema)


In [0]:
# counts start
from pyspark.sql.functions import *
from pyspark.sql.types import *

In [0]:
df.count()

Out[2]: 10

In [0]:
df.select(count("name")).show()
# this showed you the value of all names but if there is null it will ignore that when you use count with a specific column it does this


+-----------+
|count(name)|
+-----------+
|          8|
+-----------+



In [0]:
# one code to show all like example

df.select(sum('sal').alias("salary_sum"), max('sal').alias("salary_max"), min('sal').alias("salary_min"), (sum('sal')/count('sal')).cast(IntegerType()).alias("Average_without_avg")).show()

+----------+----------+----------+-------------------+
|salary_sum|salary_max|salary_min|Average_without_avg|
+----------+----------+----------+-------------------+
|    560000|    200000|     20000|              70000|
+----------+----------+----------+-------------------+



In [0]:
# Group BY 

data = [(1,'manish',50000,"IT"),
(2,'vikash',60000,"sales"),
(3,'raushan',70000,"marketing"),
(4,'mukesh',80000,"IT"),
(5,'pritam',90000,"sales"),
(6,'nikita',45000,"marketing"),
(7,'ragini',55000,"marketing"),
(8,'rakesh',100000,"IT"),
(9,'aditya',65000,"IT"),
(10,'rahul',50000,"marketing")]

schema1 = ['id','name','sal','dept']

df1 = spark.createDataFrame(data = data, schema = schema1)

In [0]:
df1.groupBy("dept").agg(sum('sal')).show()
# can do multiple agg with , in between as well


+---------+--------+
|     dept|sum(sal)|
+---------+--------+
|       IT|  295000|
|    sales|  150000|
|marketing|  220000|
+---------+--------+



In [0]:
df2 = df1.groupBy("dept").agg(sum('sal').alias("total_sal"))

In [0]:
# How can we use join 

# lets take the above data only 
# now our question is output all data along with another column where that column will have total salary of employee of that department only 
# This is the syntax "inner", "left", "right" are the types that u can use there are others as well

df1.join(df2, df1["dept"]==df2["dept"],"inner").show()
    


+---+-------+------+---------+---------+---------+
| id|   name|   sal|     dept|     dept|total_sal|
+---+-------+------+---------+---------+---------+
|  1| manish| 50000|       IT|       IT|   295000|
|  2| vikash| 60000|    sales|    sales|   150000|
|  3|raushan| 70000|marketing|marketing|   220000|
|  5| pritam| 90000|    sales|    sales|   150000|
|  4| mukesh| 80000|       IT|       IT|   295000|
|  6| nikita| 45000|marketing|marketing|   220000|
|  7| ragini| 55000|marketing|marketing|   220000|
|  8| rakesh|100000|       IT|       IT|   295000|
| 10|  rahul| 50000|marketing|marketing|   220000|
|  9| aditya| 65000|       IT|       IT|   295000|
+---+-------+------+---------+---------+---------+



In [0]:
df1.alias("a").join(df2.alias("b"), col("a.dept") == col("b.dept"), "inner") \
    .select(col("a.id"), col("a.sal"), col("a.dept"), col("b.total_sal")) \
    .show()

'''
if multiple column based join then use & and enclose each condition with ()

df1.alias("a").join(df2.alias("b"), ( col("a.dept") == col("b.dept") ) & ( col("a.something") == col("b.something") ), "inner") 

'''

+---+------+---------+---------+
| id|   sal|     dept|total_sal|
+---+------+---------+---------+
|  1| 50000|       IT|   295000|
|  2| 60000|    sales|   150000|
|  3| 70000|marketing|   220000|
|  5| 90000|    sales|   150000|
|  4| 80000|       IT|   295000|
|  6| 45000|marketing|   220000|
|  7| 55000|marketing|   220000|
|  8|100000|       IT|   295000|
| 10| 50000|marketing|   220000|
|  9| 65000|       IT|   295000|
+---+------+---------+---------+



In [0]:
# Window Functions

'''
General structure first we need to import window

from pyspark.sql.window import Window

then create a window and keep basically u are defining and keeping partionby and order by 

window = Window.partitionBy().orderBy()

when i want to use it 

df,withColumn("new", dense_rank().over(window))
'''



Out[35]: '\nGeneral structure first we need to import window\n\nfrom pyspark.sql.window import Window\n\nthen create a window and keep basically u are defining and keeping partionby and order by \n\nwindow = Window.partitionBy().orderBy()\n\nwhen i want to use it \n\ndf,withColumn("new", dense_rank().over(window))\n'

In [0]:
from pyspark.sql.window import Window

#Lets take the question from join one wherein we had to find total salary in another column if i had to do that with window the approach is below
window1 = Window.partitionBy('dept')
df1.withColumn("total_sal", sum('sal').over(window1))\
    .sort('dept',desc('sal')).show()

# if i want running total
window = Window.partitionBy('dept').orderBy(desc('sal'))
df1.withColumn("total_sal", sum('sal').over(window)).show()





+---+-------+------+---------+---------+
| id|   name|   sal|     dept|total_sal|
+---+-------+------+---------+---------+
|  8| rakesh|100000|       IT|   295000|
|  4| mukesh| 80000|       IT|   295000|
|  9| aditya| 65000|       IT|   295000|
|  1| manish| 50000|       IT|   295000|
|  3|raushan| 70000|marketing|   220000|
|  7| ragini| 55000|marketing|   220000|
| 10|  rahul| 50000|marketing|   220000|
|  6| nikita| 45000|marketing|   220000|
|  5| pritam| 90000|    sales|   150000|
|  2| vikash| 60000|    sales|   150000|
+---+-------+------+---------+---------+

+---+-------+------+---------+---------+
| id|   name|   sal|     dept|total_sal|
+---+-------+------+---------+---------+
|  8| rakesh|100000|       IT|   100000|
|  4| mukesh| 80000|       IT|   180000|
|  9| aditya| 65000|       IT|   245000|
|  1| manish| 50000|       IT|   295000|
|  3|raushan| 70000|marketing|    70000|
|  7| ragini| 55000|marketing|   125000|
| 10|  rahul| 50000|marketing|   175000|
|  6| nikita| 4

In [0]:
# Also there also one concept where you can find running total between a range of days like suppose range of 3 days or something

# Rows between and range between

'''Reffer This video for the same to understand it better since its lengthy to write 

        https://www.youtube.com/watch?v=FNrOnHNzoJI&list=PLTsNSGeIpGnGjaMSYVlidqVWSjKWoBhbr&index=23

        window = Window.partitionBy('dept').orderBy(desc('sal')).rowsBetween()
        OR
        window = Window.partitionBy('dept').orderBy(desc('sal')).rangeBetween() 


        rows between and range between takes 2 paramter

        parameters could be 0=> current row
                            -ve number => previous rows
                            +ve number => rows ahead of current
                            unbounded preceding => all rows above current row
                            unbounded => following all rows below current row

        so it can be like 

       rangeBetween(-6, 0):
// Includes rows where the column value is between (current_row_value - 6) and current_row_value.
like if i want data for last 7 days then it is -6,0 i.e 0 is the 1st day and previous 6 days so -6 total 7 days

rangeBetween(0, 6):
// Includes rows where the column value is between current_row_value and (current_row_value + 6) records.
'''

Out[47]: "Reffer This video for the same to understand it better since its lengthy to write \n\n        https://www.youtube.com/watch?v=FNrOnHNzoJI&list=PLTsNSGeIpGnGjaMSYVlidqVWSjKWoBhbr&index=23\n\n        window = Window.partitionBy('dept').orderBy(desc('sal')).rowsBetween()\n        OR\n        window = Window.partitionBy('dept').orderBy(desc('sal')).rangeBetween() \n\n\n        rows between and range between takes 2 paramter\n\n        parameters could be 0=> current row\n                            -ve number => previous rows\n                            +ve number => rows ahead of current\n                            unbounded preceding => all rows above current row\n                            unbounded => following all rows below current row\n\n        so it can be like \n\n       rangeBetween(-6, 0):\n// Includes rows where the column value is between (current_row_value - 6) and current_row_value.\nlike if i want data for last 7 days then it is -6,0 i.e 0 is the 1st day and p