In [1]:
!hadoop fs -head /public/hr_db/employees/part-m-00002

154	Nanette	Cambrault	NCAMBRAU	011.44.1344.987668	1998-12-09	SA_REP	7500.00	0.20	145	80
155	Oliver	Tuvault	OTUVAULT	011.44.1344.486508	1999-11-23	SA_REP	7000.00	0.15	145	80
156	Janette	King	JKING	011.44.1345.429268	1996-01-30	SA_REP	10000.00	0.35	146	80
157	Patrick	Sully	PSULLY	011.44.1345.929268	1996-03-04	SA_REP	9500.00	0.35	146	80
158	Allan	McEwen	AMCEWEN	011.44.1345.829268	1996-08-01	SA_REP	9000.00	0.35	146	80
159	Lindsey	Smith	LSMITH	011.44.1345.729268	1997-03-10	SA_REP	8000.00	0.30	146	80
160	Louise	Doran	LDORAN	011.44.1345.629268	1997-12-15	SA_REP	7500.00	0.30	146	80
161	Sarath	Sewall	SSEWALL	011.44.1345.529268	1998-11-03	SA_REP	7000.00	0.25	146	80
162	Clara	Vishney	CVISHNEY	011.44.1346.129268	1997-11-11	SA_REP	10500.00	0.25	147	80
163	Danielle	Greene	DGREENE	011.44.1346.229268	1999-03-19	SA_REP	9500.00	0.15	147	80
164	Mattea	Marvins	MMARVINS	011.44.1346.329268	2000-01-24	SA_REP	7200.00	0.10	147	80
165	David	Lee	DLEE	011.44.1346.529268	2000-02-23	SA_REP	6800.00	0.10	147	80
166	S

In [2]:
from pyspark.sql import SparkSession
import getpass as gp
from pyspark.sql import types as T, functions as F

In [3]:
user = gp.getuser()
user

'itv005077'

In [4]:
spark = SparkSession.builder \
    .appName(f'{user}-WEEK-8-Assignment-2') \
    .master('yarn') \
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse') \
    .config('spark.sql.catalogImplementaion', 'hive') \
    .enableHiveSupport() \
    .getOrCreate()

In [5]:
spark

In [6]:
schema = T.StructType([
    T.StructField('employee_id', T.IntegerType()),
    T.StructField('first_name', T.StringType()),
    T.StructField('last_name', T.StringType()),
    T.StructField('email', T.StringType()),
    T.StructField('phone_number', T.StringType()),
    T.StructField('hire_date', T.DateType()),
    T.StructField('job_id', T.StringType()),
    T.StructField('salary', T.FloatType()),
    T.StructField('commission_pct', T.FloatType()),
    T.StructField('manager_id', T.IntegerType()),
    T.StructField('department_id', T.IntegerType()),
])

In [7]:
df_employee = spark.read \
    .format('csv') \
    .option('delimiter', '\t') \
    .schema(schema) \
    .load('/public/hr_db/employees')

In [8]:
df_employee.show()

+-----------+----------+----------+--------+------------------+----------+--------+-------+--------------+----------+-------------+
|employee_id|first_name| last_name|   email|      phone_number| hire_date|  job_id| salary|commission_pct|manager_id|department_id|
+-----------+----------+----------+--------+------------------+----------+--------+-------+--------------+----------+-------------+
|        127|     James|    Landry| JLANDRY|      650.124.1334|1999-01-14|ST_CLERK| 2400.0|          null|       120|           50|
|        128|    Steven|    Markle| SMARKLE|      650.124.1434|2000-03-08|ST_CLERK| 2200.0|          null|       120|           50|
|        129|     Laura|    Bissot| LBISSOT|      650.124.5234|1997-08-20|ST_CLERK| 3300.0|          null|       121|           50|
|        130|     Mozhe|  Atkinson|MATKINSO|      650.124.6234|1997-10-30|ST_CLERK| 2800.0|          null|       121|           50|
|        131|     James|    Marlow| JAMRLOW|      650.124.7234|1997-02-16|ST

## Programatic Approach

In [9]:
df_employee \
    .groupBy('department_id') \
    .agg(F.sum('salary').alias('total_dept_sal'), 
         F.sum(F.expr('salary * commission_pct')).alias('total_dept_sal_w_comm')
        ) \
    .orderBy('department_id', desc = False) \
    .show()

+-------------+--------------+---------------------+
|department_id|total_dept_sal|total_dept_sal_w_comm|
+-------------+--------------+---------------------+
|         null|        7000.0|               1050.0|
|           10|        4400.0|                 null|
|           20|       19000.0|                 null|
|           30|       24900.0|                 null|
|           40|        6500.0|                 null|
|           50|      156400.0|                 null|
|           60|       28800.0|                 null|
|           70|       10000.0|                 null|
|           80|      304500.0|     72640.0009765625|
|           90|       58000.0|                 null|
|          100|       51600.0|                 null|
|          110|       20300.0|                 null|
+-------------+--------------+---------------------+



## SQL Approach

In [10]:
df_employee \
    .groupBy('department_id') \
    .agg( F.expr('sum(salary) as total_dept_sal'), 
         F.expr('sum(salary*commission_pct) as total_dept_sal_w_comm')
        ) \
    .orderBy('department_id', desc = False) \
    .show()

+-------------+--------------+---------------------+
|department_id|total_dept_sal|total_dept_sal_w_comm|
+-------------+--------------+---------------------+
|         null|        7000.0|               1050.0|
|           10|        4400.0|                 null|
|           20|       19000.0|                 null|
|           30|       24900.0|                 null|
|           40|        6500.0|                 null|
|           50|      156400.0|                 null|
|           60|       28800.0|                 null|
|           70|       10000.0|                 null|
|           80|      304500.0|     72640.0009765625|
|           90|       58000.0|                 null|
|          100|       51600.0|                 null|
|          110|       20300.0|                 null|
+-------------+--------------+---------------------+



## SPARK SQL Approach

In [11]:
df_employee.createOrReplaceTempView('employees')

In [12]:
spark.sql('''
    SELECT department_id,
           sum(salary) as total_dept_sal,
           sum(salary*commission_pct) as total_dept_sal_w_comm
    FROM employees
    GROUP BY department_id
    ORDER BY department_id
''').show()

+-------------+--------------+---------------------+
|department_id|total_dept_sal|total_dept_sal_w_comm|
+-------------+--------------+---------------------+
|         null|        7000.0|               1050.0|
|           10|        4400.0|                 null|
|           20|       19000.0|                 null|
|           30|       24900.0|                 null|
|           40|        6500.0|                 null|
|           50|      156400.0|                 null|
|           60|       28800.0|                 null|
|           70|       10000.0|                 null|
|           80|      304500.0|     72640.0009765625|
|           90|       58000.0|                 null|
|          100|       51600.0|                 null|
|          110|       20300.0|                 null|
+-------------+--------------+---------------------+



In [13]:
spark.stop()