In [None]:
!pip install pyspark

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
from pyspark.sql import SparkSession

# Obtain the notebook's Spark session: getOrCreate() hands back an existing
# session when there is one, otherwise it starts a new one with this app name.
spark = (
    SparkSession.builder
    .appName("Employee Dataset Analysis")
    .getOrCreate()
)


In [None]:
# Location of the employee CSV inside the mounted Google Drive.
file_path = "/content/drive/MyDrive/large_employee_dataset.csv"

# Treat the first line as column names and let Spark infer each column's type.
df = spark.read.csv(
    file_path,
    header=True,
    inferSchema=True,
)

# Quick look at the first five rows to confirm the load worked.
df.show(5)


+----------+-------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+-------------------+---+----------+------+-----------+--------+------------+
|      4128|    Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|      Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883|Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|         Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|      Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
+----------+-------------------+---+----------+------+-----------+--------+------------+
only showing top 5 rows



1. Show top 10 rows

In [None]:
# Print the first ten rows of the employee DataFrame.
df.show(n=10)


+----------+--------------------+---+----------+------+-----------+--------+------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|        City|
+----------+--------------------+---+----------+------+-----------+--------+------------+
|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|   Allentown|
|      6094|       Dylan Camacho| 57| Marketing| 34686| 2015-08-25|  Active| Anthonyfort|
|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|   Gilesstad|
|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|Jenniferfurt|
|      1918|       Angela Hooper| 26|   Finance|108773| 2019-08-14|On Leave|Lake Amystad|
|      4600|Alexander Johnson...| 45|     Sales| 75671| 2016-04-21|On Leave|  Russohaven|
|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active| East Robert|
|      8278|       Riley Johnson| 49|        HR| 43449| 2015-08-03|Resigned|  New Thomas|
|      852

2. Count total employees


In [None]:
# count() runs a Spark job and returns the number of rows in the DataFrame.
print(
    "Total employees:",
    df.count(),
)


Total employees: 500


3. Unique departments

In [None]:
# Project the Department column, drop repeated values, and print what is left.
(
    df.select("Department")
    .distinct()
    .show()
)


+----------+
|Department|
+----------+
|     Sales|
|        HR|
|   Finance|
| Marketing|
|        IT|
+----------+



4. Employees in IT department

In [None]:
# Keep only the rows whose Department is exactly "IT" and print them
# (show() stops after its default row limit).
df.where(df["Department"] == "IT").show()


+----------+-------------------+---+----------+------+-----------+--------+------------------+
|EmployeeID|               Name|Age|Department|Salary|JoiningDate|  Status|              City|
+----------+-------------------+---+----------+------+-----------+--------+------------------+
|      6598|        Mary Henson| 58|        IT| 63951| 2021-08-25|  Active|       Port Tricia|
|      8518|   Elizabeth Abbott| 22|        IT| 91732| 2022-11-05|  Active|       Douglasside|
|      9506|        Thomas Dunn| 45|        IT| 90340| 2020-07-12|On Leave|    Lindseychester|
|      9663|        Glenn Mason| 43|        IT|109189| 2020-03-27|On Leave|      Katelynburgh|
|      2106|     Richard Bailey| 45|        IT| 30950| 2021-06-29|Resigned|        North John|
|      8212|      Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave|South Veronicastad|
|      6354|     Nicole Gilmore| 35|        IT|104202| 2018-05-04|  Active|       East Joseph|
|      5716|         David Wang| 49|        IT| 94

5. Employees aged between 30 and 40

In [None]:
# Employees whose Age lies in the closed range [30, 40]; between() includes
# both endpoints, matching a `>= 30` and `<= 40` pair of comparisons.
df.filter(df["Age"].between(30, 40)).show()


+----------+------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|              Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+------------------+---+----------+------+-----------+--------+-------------------+
|      4676|Christopher Fuller| 30|        HR| 63042| 2021-04-30|Resigned|   South Donnaville|
|      4136|     Jerome Torres| 30|   Finance| 68213| 2024-11-30|  Active|North Justinborough|
|      1588|       Edwin Burns| 34|     Sales|108208| 2015-09-14|Resigned|        South David|
|      8074|       Fred Brewer| 30|        HR|100736| 2021-06-06|On Leave|    Port Wendyville|
|      3841|       April Allen| 36|        HR| 98845| 2020-05-20|  Active|      Rachelchester|
|      8212|     Jacob Jackson| 35|        IT| 54812| 2020-09-18|On Leave| South Veronicastad|
|      3325|       Brian Huynh| 40|   Finance| 59924| 2017-01-02|On Leave|           Johnside|
|      6180|     Robert Cortez| 35| Marketing| 761

6. Sort by Salary descending

In [None]:
# Order the employees from highest to lowest Salary and print the top rows.
df.orderBy("Salary", ascending=False).show()


+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|               City|
+----------+--------------------+---+----------+------+-----------+--------+-------------------+
|      8860|       Cody Williams| 30|        IT|119978| 2019-03-16|Resigned|         Susanville|
|      4585|      Sandra Webster| 30|        HR|119940| 2022-10-21|On Leave|       Thompsonport|
|      4667|         Ronald Hunt| 58|     Sales|119677| 2019-08-29|Resigned|    Griffithchester|
|      1602|    Deborah Williams| 25|        HR|119397| 2023-09-26|On Leave|    Port Terrimouth|
|      3374|        Amanda Green| 41|        HR|119316| 2021-04-08|Resigned|    West Shelleyton|
|      6329|       Heidi Shaffer| 36|        HR|119165| 2020-01-14|Resigned|          New Alexa|
|      2428|        Mary Stevens| 55|     Sales|119137| 2022-03-06|On Leave|         Travisport|
|      1545|Brittany Christens

7. Average salary by department



In [None]:
# Mean Salary for each Department; the result column is named avg(Salary).
df.groupBy("Department").agg({"Salary": "avg"}).show()


+----------+-----------------+
|Department|      avg(Salary)|
+----------+-----------------+
|     Sales|77488.54545454546|
|        HR|76091.27450980392|
|   Finance|72834.75630252101|
| Marketing| 71958.1888888889|
|        IT|73116.25555555556|
+----------+-----------------+



8. Count of employees by Status

In [None]:
# Number of employees in each Status value.
(
    df.groupBy("Status")
    .count()
    .show()
)


+--------+-----+
|  Status|count|
+--------+-----+
|Resigned|  159|
|  Active|  172|
|On Leave|  169|
+--------+-----+



9. Highest salary in each city

In [None]:
# Import the functions module under an alias rather than pulling `max` into
# the notebook namespace: `from pyspark.sql.functions import max` rebinds the
# Python builtin max() for every cell that runs afterwards.
from pyspark.sql import functions as F

# Highest Salary observed in each City, exposed as the MaxSalary column.
# show() prints only its default number of rows, not every city.
df.groupBy("City").agg(F.max("Salary").alias("MaxSalary")).show()


+----------------+---------+
|            City|MaxSalary|
+----------------+---------+
|   Wilsonchester|    67025|
|     Bradshawton|   111116|
|       Steventon|    32009|
|     Lake Alyssa|    84903|
|      North Lisa|    57898|
|    North Marvin|    66252|
|     Jenniferton|    39907|
|     Buckleyview|    50109|
|     Burtonville|    98492|
|    Johnsonmouth|    48799|
|    South Joseph|    52456|
|  Lindseychester|    90340|
|   North Stephen|    91947|
|Port Nicoleshire|    57537|
|    Jerrychester|    53374|
|  North Jennifer|    82486|
|      Laurenstad|    44608|
|West Brendanbury|    90698|
|       Juliaberg|    50170|
|       New James|    54378|
+----------------+---------+
only showing top 20 rows



10. Number of employees who joined each year

In [None]:
from pyspark.sql.functions import year

# Derive the calendar year from JoiningDate, then count employees per year.
# The groups are printed in whatever order Spark returns them (no sort here).
(
    df.withColumn("JoiningYear", year(df["JoiningDate"]))
    .groupBy("JoiningYear")
    .count()
    .show()
)


+-----------+-----+
|JoiningYear|count|
+-----------+-----+
|       2025|   27|
|       2018|   52|
|       2015|   37|
|       2023|   47|
|       2022|   49|
|       2019|   52|
|       2020|   56|
|       2016|   49|
|       2024|   38|
|       2017|   44|
|       2021|   49|
+-----------+-----+



11. Active employees per department



In [None]:
# Restrict to employees whose Status is "Active", then count them per Department.
(
    df.where(df["Status"] == "Active")
    .groupBy("Department")
    .count()
    .show()
)


+----------+-----+
|Department|count|
+----------+-----+
|     Sales|   32|
|        HR|   37|
|   Finance|   45|
| Marketing|   32|
|        IT|   26|
+----------+-----+



12. Average age per department

In [None]:
# Mean Age for each Department; the result column is named avg(Age).
df.groupBy("Department").agg({"Age": "avg"}).show()


+----------+------------------+
|Department|          avg(Age)|
+----------+------------------+
|     Sales|40.535353535353536|
|        HR| 41.46078431372549|
|   Finance| 39.21008403361345|
| Marketing| 41.82222222222222|
|        IT| 38.68888888888889|
+----------+------------------+



13. City-Region dataset and join

In [None]:
from pyspark.sql import Row

# Small hand-written lookup table mapping a city name to its region.
city_region_data = [
    Row(City="New York", Region="East"),
    Row(City="San Francisco", Region="West"),
    Row(City="Chicago", Region="Central"),
    Row(City="Austin", Region="South"),
]

region_df = spark.createDataFrame(city_region_data)

# Left join on City: every employee row is kept, and Region is NULL for any
# employee whose City has no entry in the lookup table.
df_joined = df.join(region_df, on="City", how="left")

# Sanity check: a lookup table whose cities never occur in the employee data
# leaves Region NULL on every row, which silently makes any per-region
# aggregate meaningless. Report that case instead of letting it pass unnoticed.
matched_rows = df_joined.filter(df_joined["Region"].isNotNull()).count()
if matched_rows == 0:
    print(
        "WARNING: no employee City matched the city-region lookup; "
        "Region is NULL for every row. Extend city_region_data with cities "
        "that actually appear in the dataset."
    )

df_joined.show()


+-------------------+----------+--------------------+---+----------+------+-----------+--------+------+
|               City|EmployeeID|                Name|Age|Department|Salary|JoiningDate|  Status|Region|
+-------------------+----------+--------------------+---+----------+------+-----------+--------+------+
|        East Robert|      6253|         Steven Lane| 47|   Finance| 64982| 2021-07-25|  Active|  NULL|
|          Allentown|      4128|     Charles Johnson| 52|        HR| 64039| 2018-07-07|Resigned|  NULL|
|         Tonyamouth|      6406|       Patrick Chung| 27|        HR|116423| 2024-07-05|  Active|  NULL|
|       Jenniferfurt|      9146|          Brian Ball| 24|     Sales| 87831| 2015-10-01|Resigned|  NULL|
|     North Brittany|      8989|       Scott Burnett| 48|     Sales| 93690| 2016-04-25|Resigned|  NULL|
|          Gilesstad|      5883| Mr. Ryan Bowman Jr.| 29|   Finance| 64541| 2025-03-11|On Leave|  NULL|
|          Port Mark|      3326|       Michael Brown| 28|       

14. Group salaries by Region

In [None]:
# Mean Salary per Region of the joined frame; employees without a region
# are collected under a single NULL group.
df_joined.groupBy("Region").agg({"Salary": "avg"}).show()


+------+-----------+
|Region|avg(Salary)|
+------+-----------+
|  NULL|  74313.424|
+------+-----------+



15. Years of experience



In [None]:
# Use the functions module through an alias: importing `round` by name from
# pyspark.sql.functions rebinds the Python builtin round() for every cell
# that runs afterwards.
from pyspark.sql import functions as F

# ExperienceYears = days between today and JoiningDate, divided by 365 and
# rounded to two decimals. Dividing by a fixed 365 ignores leap days, so the
# figure is an approximation; it also changes each day the notebook is re-run
# because it is measured against current_date().
df_exp = df.withColumn(
    "ExperienceYears",
    F.round(F.datediff(F.current_date(), df.JoiningDate) / 365, 2),
)
df_exp.select("Name", "JoiningDate", "ExperienceYears").show()


+--------------------+-----------+---------------+
|                Name|JoiningDate|ExperienceYears|
+--------------------+-----------+---------------+
|     Charles Johnson| 2018-07-07|           6.91|
|       Dylan Camacho| 2015-08-25|           9.78|
| Mr. Ryan Bowman Jr.| 2025-03-11|           0.23|
|          Brian Ball| 2015-10-01|           9.68|
|       Angela Hooper| 2019-08-14|           5.81|
|Alexander Johnson...| 2016-04-21|           9.12|
|         Steven Lane| 2021-07-25|           3.86|
|       Riley Johnson| 2015-08-03|           9.84|
|    Emily Washington| 2021-11-30|           3.51|
|     Valerie Fleming| 2019-12-08|           5.49|
|     Tracy Hughes MD| 2020-06-01|           5.01|
|    Johnathan Harmon| 2021-03-09|           4.24|
|       Michael Brown| 2023-10-21|           1.62|
|       Scott Burnett| 2016-04-25|           9.11|
|  Christopher Fuller| 2021-04-30|            4.1|
|         Mary Henson| 2021-08-25|           3.78|
|       Jerome Torres| 2024-11-

16. Employees with more than 5 years of experience

In [None]:
# Employees whose computed ExperienceYears is strictly greater than 5,
# showing just their name and experience.
(
    df_exp.where(df_exp["ExperienceYears"] > 5)
    .select("Name", "ExperienceYears")
    .show()
)


+--------------------+---------------+
|                Name|ExperienceYears|
+--------------------+---------------+
|     Charles Johnson|           6.91|
|       Dylan Camacho|           9.78|
|          Brian Ball|           9.68|
|       Angela Hooper|           5.81|
|Alexander Johnson...|           9.12|
|       Riley Johnson|           9.84|
|     Valerie Fleming|           5.49|
|     Tracy Hughes MD|           5.01|
|       Scott Burnett|           9.11|
|       Brittany Kerr|            6.2|
|         Edwin Burns|           9.73|
|       Mary Reynolds|           6.93|
|           Erin Berg|           7.11|
|         Jason Hines|           9.52|
|Christopher Mcdaniel|          10.01|
|         April Allen|           5.04|
|         Glenn Mason|           5.19|
|      Victoria Kelly|           7.68|
|     Michael Trevino|           5.06|
|      Heather Nelson|          10.01|
+--------------------+---------------+
only showing top 20 rows

