In [1]:
# List the files that make up the employees dataset directory in HDFS.
!hadoop fs -ls /public/hr_db/employees/

Found 5 items
-rw-r--r--   2 hdfs supergroup          0 2021-01-28 11:23 /public/hr_db/employees/_SUCCESS
-rw-r--r--   2 hdfs supergroup       2123 2021-01-28 09:29 /public/hr_db/employees/part-m-00000
-rw-r--r--   2 hdfs supergroup       2159 2021-01-28 10:54 /public/hr_db/employees/part-m-00001
-rw-r--r--   2 hdfs supergroup       2145 2021-01-28 08:16 /public/hr_db/employees/part-m-00002
-rw-r--r--   2 hdfs supergroup       2121 2021-01-28 09:29 /public/hr_db/employees/part-m-00003


In [2]:
# Peek at the start of one part file to see the raw record layout before
# defining a schema (fields appear tab-separated with no header row).
# `-head` only prints the beginning of the file, so the last record shown is cut off.
!hadoop fs -head /public/hr_db/employees/part-m-00002

154	Nanette	Cambrault	NCAMBRAU	011.44.1344.987668	1998-12-09	SA_REP	7500.00	0.20	145	80
155	Oliver	Tuvault	OTUVAULT	011.44.1344.486508	1999-11-23	SA_REP	7000.00	0.15	145	80
156	Janette	King	JKING	011.44.1345.429268	1996-01-30	SA_REP	10000.00	0.35	146	80
157	Patrick	Sully	PSULLY	011.44.1345.929268	1996-03-04	SA_REP	9500.00	0.35	146	80
158	Allan	McEwen	AMCEWEN	011.44.1345.829268	1996-08-01	SA_REP	9000.00	0.35	146	80
159	Lindsey	Smith	LSMITH	011.44.1345.729268	1997-03-10	SA_REP	8000.00	0.30	146	80
160	Louise	Doran	LDORAN	011.44.1345.629268	1997-12-15	SA_REP	7500.00	0.30	146	80
161	Sarath	Sewall	SSEWALL	011.44.1345.529268	1998-11-03	SA_REP	7000.00	0.25	146	80
162	Clara	Vishney	CVISHNEY	011.44.1346.129268	1997-11-11	SA_REP	10500.00	0.25	147	80
163	Danielle	Greene	DGREENE	011.44.1346.229268	1999-03-19	SA_REP	9500.00	0.15	147	80
164	Mattea	Marvins	MMARVINS	011.44.1346.329268	2000-01-24	SA_REP	7200.00	0.10	147	80
165	David	Lee	DLEE	011.44.1346.529268	2000-02-23	SA_REP	6800.00	0.10	147	80
166	S

In [3]:
from pyspark.sql import SparkSession
import getpass as gp
from pyspark.sql import types as T, functions as F, Window as W

In [4]:
# Login name of the current user; reused below to build the Spark app name
# and the per-user warehouse directory.
user = gp.getuser()
user

'itv005077'

In [5]:
# Create (or reuse) a Hive-enabled Spark session running on YARN.
# The application name and warehouse directory are both derived from `user`.
spark = (
    SparkSession.builder
    .master('yarn')
    .appName(f'{user}-WEEK-8_Assignment')
    .config('spark.sql.catalogImplementation', 'hive')
    .config('spark.sql.warehouse.dir', f'/user/{user}/warehouse')
    .enableHiveSupport()
    .getOrCreate()
)

In [6]:
# Display the session object to confirm it was created.
spark

In [7]:
# Explicit schema for the header-less employees files; the field order follows
# the column order of the raw records. Every field is nullable (the default).
# NOTE(review): salary and commission_pct are 32-bit floats, which shows up as
# rounding noise in aggregates (e.g. the averaged commission); consider
# DoubleType or DecimalType if exact values matter.
schema = (
    T.StructType()
    .add('employee_id', T.IntegerType())
    .add('first_name', T.StringType())
    .add('last_name', T.StringType())
    .add('email', T.StringType())
    .add('phone_number', T.StringType())
    .add('hire_date', T.DateType())
    .add('job_id', T.StringType())
    .add('salary', T.FloatType())
    .add('commission_pct', T.FloatType())
    .add('manager_id', T.IntegerType())
    .add('department_id', T.IntegerType())
)

In [8]:
# Load every part file under the employees directory as tab-delimited CSV,
# applying the explicit schema instead of inferring column types.
df_employee = (
    spark.read
    .schema(schema)
    .format('csv')
    .option('delimiter', '\t')
    .load('/public/hr_db/employees')
)

In [9]:
# Confirm the explicit schema was applied to the loaded DataFrame.
df_employee.printSchema()

root
 |-- employee_id: integer (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- email: string (nullable = true)
 |-- phone_number: string (nullable = true)
 |-- hire_date: date (nullable = true)
 |-- job_id: string (nullable = true)
 |-- salary: float (nullable = true)
 |-- commission_pct: float (nullable = true)
 |-- manager_id: integer (nullable = true)
 |-- department_id: integer (nullable = true)



In [10]:
# Total number of employee rows read from the directory.
df_employee.count()

107

In [11]:
# Last two rows, returned as a list of Row objects.
df_employee.tail(2)

[Row(employee_id=205, first_name='Shelley', last_name='Higgins', email='SHIGGINS', phone_number='515.123.8080', hire_date=datetime.date(1994, 6, 7), job_id='AC_MGR', salary=12000.0, commission_pct=None, manager_id=101, department_id=110),
 Row(employee_id=206, first_name='William', last_name='Gietz', email='WGIETZ', phone_number='515.123.8181', hire_date=datetime.date(1994, 6, 7), job_id='AC_ACCOUNT', salary=8300.0, commission_pct=None, manager_id=205, department_id=110)]

In [12]:
# Preview up to 10 rows in tabular form.
df_employee.show(10)

+-----------+----------+----------+--------+------------+----------+--------+------+--------------+----------+-------------+
|employee_id|first_name| last_name|   email|phone_number| hire_date|  job_id|salary|commission_pct|manager_id|department_id|
+-----------+----------+----------+--------+------------+----------+--------+------+--------------+----------+-------------+
|        127|     James|    Landry| JLANDRY|650.124.1334|1999-01-14|ST_CLERK|2400.0|          null|       120|           50|
|        128|    Steven|    Markle| SMARKLE|650.124.1434|2000-03-08|ST_CLERK|2200.0|          null|       120|           50|
|        129|     Laura|    Bissot| LBISSOT|650.124.5234|1997-08-20|ST_CLERK|3300.0|          null|       121|           50|
|        130|     Mozhe|  Atkinson|MATKINSO|650.124.6234|1997-10-30|ST_CLERK|2800.0|          null|       121|           50|
|        131|     James|    Marlow| JAMRLOW|650.124.7234|1997-02-16|ST_CLERK|2500.0|          null|       121|           50|


## Programmatic Approach

In [13]:
# Whole-table summary using DataFrame API aggregate functions.
# count('*') counts every row, whereas count('manager_id') skips rows whose
# manager_id is null, so the two counts can differ.
df_employee.select(
    F.count('*').alias('row_count'),
    F.count('manager_id').alias('manager_count'),
    F.sum('salary').alias('total_salary'),
    F.countDistinct('department_id').alias('unique_dept'),
    # avg ignores null commission_pct values. Alias spelled the same way as in
    # the SQL-expression and Spark SQL variants so all three outputs line up.
    F.avg('commission_pct').alias('average_commission')
).show()

+---------+-------------+------------+-----------+-------------------+
|row_count|manager_count|total_salary|unique_dept|  average_commision|
+---------+-------------+------------+-----------+-------------------+
|      107|          106|    691400.0|         11|0.22285714660372052|
+---------+-------------+------------+-----------+-------------------+



## SQL Approach

In [14]:
# Same whole-table summary, this time written as SQL expression strings
# evaluated directly against the DataFrame.
df_employee.selectExpr(
    "count(*) as row_count",
    "count(manager_id) as manager_count",
    "sum(salary) as total_salary",
    "count(distinct(department_id)) as unique_dept",
    "avg(commission_pct) as average_commission",
).show()

+---------+-------------+------------+-----------+-------------------+
|row_count|manager_count|total_salary|unique_dept| average_commission|
+---------+-------------+------------+-----------+-------------------+
|      107|          106|    691400.0|         11|0.22285714660372052|
+---------+-------------+------------+-----------+-------------------+



## SPARK SQL Approach

In [15]:
# Register the DataFrame as a temporary view named 'employees' so it can be
# queried with spark.sql.
df_employee.createOrReplaceTempView('employees')

In [16]:
# Same whole-table summary expressed as a single Spark SQL query against the
# 'employees' temporary view.
summary_query = '''
    SELECT count(*) as row_count,
           count(manager_id) as manager_count,
           sum(salary) as total_salary,
           count(distinct(department_id)) as unique_dept,
           avg(commission_pct) as average_commission
    FROM employees
'''
spark.sql(summary_query).show()

+---------+-------------+------------+-----------+-------------------+
|row_count|manager_count|total_salary|unique_dept| average_commission|
+---------+-------------+------------+-----------+-------------------+
|      107|          106|    691400.0|         11|0.22285714660372052|
+---------+-------------+------------+-----------+-------------------+



In [17]:
# Stop the Spark session now that the notebook is finished.
spark.stop()