In [3]:
import findspark
findspark.init()


from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

spark = SparkSession.builder.master("local[4]") \
        .appName("Joins").getOrCreate()

In [4]:
emp_data = [(1,'manish',50000,'IT','m'),
            (2,'vikash',60000,'sales','m'),
            (3,'raushan',70000,'marketing','m'),
            (4,'mukesh',80000,'IT','m'),
            (5,'priti',90000,'sales','f'),
            (6,'nikita',45000,'marketing','f'),
            (7,'ragini',55000,'marketing','f'),
            (8,'rashi',100000,'IT','f'),
            (9,'aditya',65000,'IT','m'),
            (10,'rahul',50000,'marketing','m'),
            (11,'rakhi',50000,'IT','f'),
            (12,'akhilesh',90000,'sales','m')]

emp_schema = StructType([StructField("id", IntegerType(), True),
                         StructField("name", StringType(), True),
                         StructField("salary", IntegerType(), True),
                         StructField("Department", StringType(), True),
                         StructField("Gender", StringType(), True)])

emp_df = spark.createDataFrame(emp_data, emp_schema)

In [5]:
emp_df.show()

                                                                                

+---+--------+------+----------+------+
| id|    name|salary|Department|Gender|
+---+--------+------+----------+------+
|  1|  manish| 50000|        IT|     m|
|  2|  vikash| 60000|     sales|     m|
|  3| raushan| 70000| marketing|     m|
|  4|  mukesh| 80000|        IT|     m|
|  5|   priti| 90000|     sales|     f|
|  6|  nikita| 45000| marketing|     f|
|  7|  ragini| 55000| marketing|     f|
|  8|   rashi|100000|        IT|     f|
|  9|  aditya| 65000|        IT|     m|
| 10|   rahul| 50000| marketing|     m|
| 11|   rakhi| 50000|        IT|     f|
| 12|akhilesh| 90000|     sales|     m|
+---+--------+------+----------+------+



In [7]:
emp_df.groupBy("department")\
      .agg(sum("salary")).show()

+----------+-----------+
|department|sum(salary)|
+----------+-----------+
| marketing|     220000|
|     sales|     240000|
|        IT|     345000|
+----------+-----------+



Window Functions

In [17]:
from pyspark.sql.window import *

In [26]:
window_spec = Window.partitionBy("department")

new_df = emp_df.withColumn("Dpt_Salary", sum("salary").over(window_spec))
# new_df.show(truncate=False)

new_df1 = new_df.withColumn("salary_percent", round(col("salary")*100/col("dpt_salary"), 2))
new_df1.show()

+---+--------+------+----------+------+----------+--------------+
| id|    name|salary|Department|Gender|Dpt_Salary|salary_percent|
+---+--------+------+----------+------+----------+--------------+
|  1|  manish| 50000|        IT|     m|    345000|         14.49|
|  4|  mukesh| 80000|        IT|     m|    345000|         23.19|
|  8|   rashi|100000|        IT|     f|    345000|         28.99|
|  9|  aditya| 65000|        IT|     m|    345000|         18.84|
| 11|   rakhi| 50000|        IT|     f|    345000|         14.49|
|  3| raushan| 70000| marketing|     m|    220000|         31.82|
|  6|  nikita| 45000| marketing|     f|    220000|         20.45|
|  7|  ragini| 55000| marketing|     f|    220000|          25.0|
| 10|   rahul| 50000| marketing|     m|    220000|         22.73|
|  2|  vikash| 60000|     sales|     m|    240000|          25.0|
|  5|   priti| 90000|     sales|     f|    240000|          37.5|
| 12|akhilesh| 90000|     sales|     m|    240000|          37.5|
+---+-----

In [35]:
rnk_window = Window.partitionBy("department").orderBy(col("salary"))

new_df = emp_df.withColumn("rank", rank().over(rnk_window))
new_df.show(truncate=False)

+---+--------+------+----------+------+----+
|id |name    |salary|Department|Gender|rank|
+---+--------+------+----------+------+----+
|1  |manish  |50000 |IT        |m     |1   |
|11 |rakhi   |50000 |IT        |f     |1   |
|9  |aditya  |65000 |IT        |m     |3   |
|4  |mukesh  |80000 |IT        |m     |4   |
|8  |rashi   |100000|IT        |f     |5   |
|6  |nikita  |45000 |marketing |f     |1   |
|10 |rahul   |50000 |marketing |m     |2   |
|7  |ragini  |55000 |marketing |f     |3   |
|3  |raushan |70000 |marketing |m     |4   |
|2  |vikash  |60000 |sales     |m     |1   |
|5  |priti   |90000 |sales     |f     |2   |
|12 |akhilesh|90000 |sales     |m     |2   |
+---+--------+------+----------+------+----+



In [40]:
dense_rnk_window = Window.partitionBy("department").orderBy(col("salary"))

new_df1 = new_df.withColumn("dense_rank", dense_rank().over(dense_rnk_window))
new_df1.show(truncate=False)

+---+--------+------+----------+------+----+----------+
|id |name    |salary|Department|Gender|rank|dense_rank|
+---+--------+------+----------+------+----+----------+
|1  |manish  |50000 |IT        |m     |1   |1         |
|11 |rakhi   |50000 |IT        |f     |1   |1         |
|9  |aditya  |65000 |IT        |m     |3   |2         |
|4  |mukesh  |80000 |IT        |m     |4   |3         |
|8  |rashi   |100000|IT        |f     |5   |4         |
|6  |nikita  |45000 |marketing |f     |1   |1         |
|10 |rahul   |50000 |marketing |m     |2   |2         |
|7  |ragini  |55000 |marketing |f     |3   |3         |
|3  |raushan |70000 |marketing |m     |4   |4         |
|2  |vikash  |60000 |sales     |m     |1   |1         |
|5  |priti   |90000 |sales     |f     |2   |2         |
|12 |akhilesh|90000 |sales     |m     |2   |2         |
+---+--------+------+----------+------+----+----------+



In [42]:
new_df1.filter(col("dense_rank")<=2).show(truncate=False)

+---+--------+------+----------+------+----+----------+
|id |name    |salary|Department|Gender|rank|dense_rank|
+---+--------+------+----------+------+----+----------+
|1  |manish  |50000 |IT        |m     |1   |1         |
|11 |rakhi   |50000 |IT        |f     |1   |1         |
|9  |aditya  |65000 |IT        |m     |3   |2         |
|6  |nikita  |45000 |marketing |f     |1   |1         |
|10 |rahul   |50000 |marketing |m     |2   |2         |
|2  |vikash  |60000 |sales     |m     |1   |1         |
|5  |priti   |90000 |sales     |f     |2   |2         |
|12 |akhilesh|90000 |sales     |m     |2   |2         |
+---+--------+------+----------+------+----+----------+



In [43]:
emp_df.show()

+---+--------+------+----------+------+
| id|    name|salary|Department|Gender|
+---+--------+------+----------+------+
|  1|  manish| 50000|        IT|     m|
|  2|  vikash| 60000|     sales|     m|
|  3| raushan| 70000| marketing|     m|
|  4|  mukesh| 80000|        IT|     m|
|  5|   priti| 90000|     sales|     f|
|  6|  nikita| 45000| marketing|     f|
|  7|  ragini| 55000| marketing|     f|
|  8|   rashi|100000|        IT|     f|
|  9|  aditya| 65000|        IT|     m|
| 10|   rahul| 50000| marketing|     m|
| 11|   rakhi| 50000|        IT|     f|
| 12|akhilesh| 90000|     sales|     m|
+---+--------+------+----------+------+



In [1]:
spark.stop()

NameError: name 'spark' is not defined