# Calculate Department-wise Employee Percentage (SQL & PySpark)

In [0]:
-- Make `biki` the current database so the unqualified `employee` table in the
-- following queries resolves against it.
use biki;

In [0]:
-- Preview every row and column of the employee table before aggregating.
select *
from employee;

In [0]:
-- Approach 1
-- Single pass over employee: group by department, then use a window over the
-- grouped result (sum(count(*)) over ()) to put the overall headcount on every
-- row so each department's share can be computed in the same select.
select
    department,
    count(*) as employee_count,
    concat(cast(count(*) * 100.0 / sum(count(*)) over () as decimal(5, 2)), '%') as `employee%`
from employee
group by department
-- Sort on the numeric count, not on `employee%`: that column is a string, so
-- ordering by it compares text and would rank '9.00%' above '40.00%'.
-- department is a tie-breaker that keeps the output deterministic.
order by employee_count desc, department;

In [0]:
-- Approach 2
-- Compute the overall headcount and the per-department headcount in separate
-- CTEs, then cross join the single-row total onto every department row.
with all_employees as (
    -- One row: total number of employees.
    select count(*) as all_emp
    from employee
),
emp_per_dept as (
    -- One row per department with its headcount.
    select
        department,
        count(*) as cnt
    from employee
    group by department
)
select
    department,
    cnt as employee_count,
    concat(round(cnt * 100.0 / all_emp, 2), '%') as `employee%`
from emp_per_dept
cross join all_employees
-- Sort on the numeric count, not on `employee%`: that column is a string, so
-- ordering by it compares text and would rank '9.00%' above '40.00%'.
-- department is a tie-breaker that keeps the output deterministic.
order by employee_count desc, department;

In [0]:
%python
from pyspark.sql import SparkSession, types as T, functions as F
from pyspark.sql.types import StructType, StructField, IntegerType, StringType
from pyspark.sql.window import Window as W

spark = SparkSession.builder.appName('practice').getOrCreate()

# Schema of the sample employee rows; only `department` is declared nullable.
columns = StructType([
    StructField('emp_id', IntegerType(), False),
    StructField('emp_name', StringType(), False),
    StructField('department', StringType(), True),
])

# Sample data: (emp_id, emp_name, department).
data = [
    (1, 'Alice Johnson', 'Data Engineering'),
    (2, 'Bob Smith', 'Data Analytics'),
    (3, 'Carol White', 'Machine Learning'),
    (4, 'David Brown', 'Cloud Platforms'),
    (5, 'Emma Davis', 'Data Engineering'),
]
employee_df = spark.createDataFrame(data, schema=columns)

# DataFrame equivalent of the SQL window approach: count employees per
# department, then sum those counts over a window spanning the whole grouped
# result to obtain the overall headcount on every row.
# An empty partitionBy() states "one window over all rows" explicitly, instead
# of an orderBy() that names no ordering columns.
whole_result = W.partitionBy()

employee_df = (
    employee_df
    .groupBy('department')
    .agg(F.count(F.col('emp_id')).alias('employee_count'))
    .withColumn('all_employees', F.sum('employee_count').over(whole_result))
    # Round to 2 decimals to match the precision used by the SQL approaches.
    .withColumn(
        'employee%',
        F.round(F.col('employee_count') * 100.0 / F.col('all_employees'), 2),
    )
    # Largest departments first; department name breaks ties deterministically.
    .orderBy(F.col('employee_count').desc(), 'department')
)
employee_df.show()

In [0]:
%python
# DataFrame equivalent of the SQL CTE approach: take the overall headcount
# with a separate count() action, then divide each department's count by it.
# Relies on `spark`, `data`, `columns` and `F` defined in the previous cell.
employee_df = spark.createDataFrame(data, schema=columns)
all_employees = employee_df.count()

employee_df = (
    employee_df
    .groupBy('department')
    .count()
    # Round to 2 decimals to match the precision used by the SQL approaches.
    .withColumn('employee%', F.round(F.col('count') * 100.0 / all_employees, 2))
    # Largest departments first; department name breaks ties deterministically.
    .orderBy(F.col('count').desc(), 'department')
)
employee_df.show()