In [2]:
from pyspark.sql import SparkSession

# Build the notebook's Spark session under the application name "Large Dataset".
spark = (
    SparkSession.builder
    .appName("Large Dataset")
    .getOrCreate()
)

# Leave the session as the cell's last expression so the notebook renders it.
spark

In [3]:
from google.colab import drive
# Mount Google Drive so the CSV stored under MyDrive can be read by path.
drive.mount('/content/drive')

# Read the employee CSV: the first row supplies the column names and
# column types are inferred from the data.
df = (
    spark.read
    .option("header", True)
    .option("inferSchema", True)
    .csv('/content/drive/MyDrive/large_employee_dataset.csv')
)
# Preview the first 10 rows.
df.show(n=10)

Mounted at /content/drive
+----------+--------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+--------------------+---+----------+------+-----------+--------+------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active| East Robert|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigne

In [5]:
# Count the total number of employees (one row per selected EmployeeID value).
employee_ids = df.select("EmployeeID")
total_employee = employee_ids.count()

print("Total number of Employee:", total_employee)

Total number of Employee: 500


In [6]:
# Display unique departments.
# show() only prints and returns None, so keep the DataFrame itself in
# `unique_dept` and print it as a separate step.
unique_dept = df.select("Department").distinct()
unique_dept.show()

+----------+
|Department|
+----------+
|     Sales|
|        HR|
|   Finance|
| Marketing|
|        IT|
+----------+



In [7]:
# Filtering & Sorting
# 4. Filter all employees in the "IT" department.
# Store the filtered DataFrame (not the None returned by show()) so
# `it_employees` stays usable afterwards, then print it.
it_employees = df.filter(df["Department"] == "IT")
it_employees.show()

+----------+-------------------+---+----------+------+-----------+--------+------------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|              City|
+----------+-------------------+---+----------+------+-----------+--------+------------------+
|      6598|        Mary Henson| 58|        IT| 63951| 2021-08-25|  Active|       Port Tricia|
|      8518|   Elizabeth Abbott| 22|        IT| 91732| 2022-11-05|  Active|       Douglasside|
|      9506|        Thomas Dunn| 45|        IT| 90340| 2020-07-12|On Leave|    Lindseychester|
|      9663|        Glenn Mason| 43|        IT|109189| 2020-03-27|On Leave|      Katelynburgh|
|      2106|     Richard Bailey| 45|        IT| 30950| 2021-06-29|Resigned|        North John|
|      8212|      Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave|South Veronicastad|
|      6354|     Nicole Gilmore| 35|        IT|104202| 2018-05-04|  Active|       East Joseph|
|      5716|         David Wang| 49|        IT| 94

In [8]:
# 5. Show employees aged between 30 and 40.

# between() is inclusive on both ends, matching Age >= 30 and Age <= 40.
# Keep the DataFrame in `age_btw`; show() itself returns None.
age_btw = df.filter(df["Age"].between(30, 40))
age_btw.show()

+----------+------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|              Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+------------------+---+----------+------+-----------+--------+-------------------+
|      4676|Christopher Fuller| 30|        HR| 63042| 2021-04-30|Resigned|   South Donnaville|
|      4136|     Jerome Torres| 30|   Finance| 68213| 2024-11-30|  Active|North Justinborough|
|      1588|       Edwin Burns| 34|     Sales|108208| 2015-09-14|Resigned|        South David|
|      8074|       Fred Brewer| 30|        HR|100736| 2021-06-06|On Leave|    Port Wendyville|
|      3841|       April Allen| 36|        HR| 98845| 2020-05-20|  Active|      Rachelchester|
|      8212|     Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave| South Veronicastad|
|      3325|       Brian Huynh| 40|   Finance| 59924| 2017-01-02|On Leave|           Johnside|
|      6180|     Robert Cortez| 35| Marketing| 761

In [11]:
# 6. Sort employees by Salary in descending order.

# Keep the sorted DataFrame in `desc_salary` (show() returns None), then
# print the top rows.
desc_salary = df.orderBy(df["Salary"].desc())
desc_salary.show()

+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      8860|       Cody Williams| 30|        IT|119978| 2019-03-16|Resigned|         Susanville|
|      4585|      Sandra Webster| 30|        HR|119940| 2022-10-21|On Leave|       Thompsonport|
|      4667|         Ronald Hunt| 58|     Sales|119677| 2019-08-29|Resigned|    Griffithchester|
|      1602|    Deborah Williams| 25|        HR|119397| 2023-09-26|On Leave|    Port Terrimouth|
|      3374|        Amanda Green| 41|        HR|119316| 2021-04-08|Resigned|    West Shelleyton|
|      6329|       Heidi Shaffer| 36|        HR|119165| 2020-01-14|Resigned|          New Alexa|
|      2428|        Mary Stevens| 55|     Sales|119137| 2022-03-06|On Leave|         Travisport|
|      1545|Brittany Christens

In [12]:
# Aggregation Tasks
# 7. Get the average salary by department.

from pyspark.sql.functions import avg
# Keep the aggregated DataFrame in `avg_salary`; show() only prints and
# returns None, so it is called as a separate step.
avg_salary = df.groupBy("Department").agg(avg("Salary"))
avg_salary.show()

+----------+-----------------+
|Department|      avg(Salary)|
+----------+-----------------+
|     Sales|77488.54545454546|
|        HR|76091.27450980392|
|   Finance|72834.75630252101|
| Marketing| 71958.1888888889|
|        IT|73116.25555555556|
+----------+-----------------+



In [13]:
# 8. Count of employees by Status.
# Keep the per-status counts as a DataFrame in `emp_stas` (show() returns
# None), then print them.
emp_stas = df.groupBy("Status").count()
emp_stas.show()

+--------+-----+
|  Status|count|
+--------+-----+
|Resigned|  159|
|  Active|  172|
|On Leave|  169|
+--------+-----+



In [14]:
# 9. Highest salary in each city.

from pyspark.sql.functions import max
# NOTE(review): the import above rebinds the built-in `max` at notebook level
# for every cell that runs afterwards; consider
# `from pyspark.sql import functions as F` and `F.max(...)` once nothing
# else relies on the bare name.

# Keep the aggregated DataFrame in `max_sal`; show() only prints (first 20
# rows by default) and returns None.
max_sal = df.groupBy("City").agg(max("Salary"))
max_sal.show()

+----------------+-----------+
|            City|max(Salary)|
+----------------+-----------+
|   Wilsonchester|      67025|
|     Bradshawton|     111116|
|       Steventon|      32009|
|     Lake Alyssa|      84903|
|      North Lisa|      57898|
|    North Marvin|      66252|
|     Jenniferton|      39907|
|     Buckleyview|      50109|
|     Burtonville|      98492|
|    Johnsonmouth|      48799|
|    South Joseph|      52456|
|  Lindseychester|      90340|
|   North Stephen|      91947|
|Port Nicoleshire|      57537|
|    Jerrychester|      53374|
|  North Jennifer|      82486|
|      Laurenstad|      44608|
|West Brendanbury|      90698|
|       Juliaberg|      50170|
|       New James|      54378|
+----------------+-----------+
only showing top 20 rows



In [15]:
# GroupBy and Analysis
# 10. Total number of employees who joined each year.

from pyspark.sql.functions import year
# Derive the joining year from JoiningDate, then count rows per year.
# The result DataFrame is kept in `emp_join_year`; show() returns None,
# so it is called separately.
emp_join_year = (
    df.withColumn("JoinYear", year("JoiningDate"))
    .groupBy("JoinYear")
    .count()
)
emp_join_year.show()

+--------+-----+
|JoinYear|count|
+--------+-----+
|    2025|   27|
|    2018|   52|
|    2015|   37|
|    2023|   47|
|    2022|   49|
|    2019|   52|
|    2020|   56|
|    2016|   49|
|    2024|   38|
|    2017|   44|
|    2021|   49|
+--------+-----+



In [16]:
# 11. Department-wise count of employees who are currently "Active".

# Restrict to Active rows first, then count per department. Keep the
# DataFrame in `active_count` (show() returns None) and print it.
active_count = df.filter(df["Status"] == "Active").groupBy("Department").count()
active_count.show()

+----------+-----+
|Department|count|
+----------+-----+
|     Sales|   32|
|        HR|   37|
|   Finance|   45|
| Marketing|   32|
|        IT|   26|
+----------+-----+



In [17]:
# 12. Average age of employees per department.

from pyspark.sql.functions import avg
# Use groupBy (same spelling as the other aggregation cells) and keep the
# resulting DataFrame in `avg_age`; show() only prints and returns None.
avg_age = df.groupBy("Department").agg(avg("Age"))
avg_age.show()

+----------+------------------+
|Department|          avg(Age)|
+----------+------------------+
|     Sales|40.535353535353536|
|        HR| 41.46078431372549|
|   Finance| 39.21008403361345|
| Marketing| 41.82222222222222|
|        IT| 38.68888888888889|
+----------+------------------+



In [26]:
# Joining (Use another DataFrame for mapping)
# 13. Create another dataset with City and Region , and join it.

# City -> Region lookup rows; only these six cities are mapped.
city_region = [
    ["Bradshawton", "East"],
    ["North Lisa", "West"],
    ["Buckleyview", "Midwest"],
    ["West Brendanbury", "South"],
    ["East Robert", "West"],
    ["Mariebury", "East"],
]
# Column names for the lookup DataFrame.
data = ["City", "Region"]

city_region_df = spark.createDataFrame(city_region, schema=data)
city_region_df.show()

# Left join keeps every employee; cities missing from the lookup get a
# NULL Region.
join_df = df.join(city_region_df, on="City", how="left")
region_view = join_df.select("EmployeeID", "Name", "City", "Region", "Salary")
region_view.show()

+----------------+-------+
|            City| Region|
+----------------+-------+
|     Bradshawton|   East|
|      North Lisa|   West|
|     Buckleyview|Midwest|
|West Brendanbury|  South|
|     East Robert|   West|
|       Mariebury|   East|
+----------------+-------+

+----------+--------------------+-------------------+------+------+
|EmployeeID|                Name|               City|Region|Salary|
+----------+--------------------+-------------------+------+------+
|      6253|         Steven Lane|        East Robert|  West| 64982|
|      4128|     Charles Johnson|          Allentown|  NULL| 64039|
|      6406|       Patrick Chung|         Tonyamouth|  NULL|116423|
|      9146|          Brian Ball|       Jenniferfurt|  NULL| 87831|
|      8989|       Scott Burnett|     North Brittany|  NULL| 93690|
|      5883| Mr. Ryan Bowman Jr.|          Gilesstad|  NULL| 64541|
|      3326|       Michael Brown|          Port Mark|  NULL| 85122|
|      8520|    Emily Washington|        West Ash

In [29]:
# 14. Group salaries by Region after the join.
from pyspark.sql.functions import avg, sum
# NOTE(review): the import above rebinds the built-in `sum` at notebook level
# for every cell that runs afterwards; consider
# `from pyspark.sql import functions as F` and `F.sum(...)` once nothing
# else relies on the bare name.

# Average salary per region. The DataFrame is kept in `avg_salary`
# (show() returns None, so it is called separately).
avg_salary = join_df.groupBy("Region").agg(avg("Salary"))
avg_salary.show()

# Total salary per region, kept as a DataFrame in `sum_salary`.
sum_salary = join_df.groupBy("Region").agg(sum("Salary"))
sum_salary.show()

+-------+-----------------+
| Region|      avg(Salary)|
+-------+-----------------+
|Midwest|          50109.0|
|   NULL|74311.87854251012|
|  South|          90698.0|
|   East|          91478.5|
|   West|          61440.0|
+-------+-----------------+

+-------+-----------+
| Region|sum(Salary)|
+-------+-----------+
|Midwest|      50109|
|   NULL|   36710068|
|  South|      90698|
|   East|     182957|
|   West|     122880|
+-------+-----------+



In [35]:
# Date Operations
# 15. Calculate years of experience for each employee (current date - JoiningDate).

from pyspark.sql.functions import current_date,datediff,round

# Days elapsed between JoiningDate and today, as a column expression.
days_since_joining = datediff(current_date(), df["JoiningDate"])

# Convert days to years using a 365-day year, rounded to one decimal place.
exp_of_emp = df.withColumn(
    "ExperienceYear",
    round(days_since_joining / 365.0, 1),
)
exp_of_emp.show()

+----------+--------------------+---+----------+------+-----------+--------+-------------------+--------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|ExperienceYear|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+--------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|          Allentown|           6.9|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active|        Anthonyfort|           9.8|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|          Gilesstad|           0.2|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|       Jenniferfurt|           9.7|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|       Lake Amystad|           5.8|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|         Russohaven|        

In [36]:
# 16. List all employees with more than 5 years of experience.
# Strictly greater than 5, expressed as a SQL filter on ExperienceYear.
exp_more_5 = exp_of_emp.filter("ExperienceYear > 5")

(
    exp_more_5
    .select("EmployeeID", "Name", "ExperienceYear")
    .show()
)

+----------+--------------------+--------------+
|EmployeeID|                Name|ExperienceYear|
+----------+--------------------+--------------+
|      4128|     Charles Johnson|           6.9|
|      6094|       Dylan Camacho|           9.8|
|      9146|          Brian Ball|           9.7|
|      1918|       Angela Hooper|           5.8|
|      4600|Alexander Johnson...|           9.1|
|      8278|       Riley Johnson|           9.8|
|      1298|     Valerie Fleming|           5.5|
|      8989|       Scott Burnett|           9.1|
|      2758|       Brittany Kerr|           6.2|
|      1588|         Edwin Burns|           9.7|
|      8729|       Mary Reynolds|           6.9|
|      2039|           Erin Berg|           7.1|
|      8923|         Jason Hines|           9.5|
|      8423|Christopher Mcdaniel|          10.0|
|      9663|         Glenn Mason|           5.2|
|      6257|      Victoria Kelly|           7.7|
|      6706|     Michael Trevino|           5.1|
|      6580|      He