# <center> 3. spark code challenge </center>

Data Source: [SF Salaries Dataset](https://www.kaggle.com/kaggle/sf-salaries) from Kaggle

In [1]:
import findspark
import os

# Point findspark at the Spark installation so that pyspark becomes importable.
# SPARK_HOME is used when it is set; when it is not, None is passed so that
# findspark runs its own lookup instead of this cell failing with a KeyError.
findspark.init(os.environ.get('SPARK_HOME'))

In [2]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [3]:
# 1. create spark session
# getOrCreate() hands back an already-running session if there is one,
# otherwise it starts a new session named 'salary'.
spark = (
    SparkSession.builder
    .appName('salary')
    .getOrCreate()
)

In [4]:
# 2. read csv file
# The first line of the file is the header, fields are comma-separated, and
# column types are inferred from the data rather than read as plain strings.
df = (
    spark.read
    .options(header=True, sep=',', inferSchema=True)
    .csv('../data/salaries.csv')
)

In [5]:
# 3. take a look at the dataframe
# limit(5) keeps the preview to five rows; toPandas() converts those rows to a
# pandas DataFrame, which the notebook renders as the cell's result.
df.limit(5).toPandas()

Unnamed: 0,Id,EmployeeName,JobTitle,BasePay,OvertimePay,OtherPay,Benefits,TotalPay,TotalPayBenefits,Year,Notes,Agency,Status
0,1,NATHANIEL FORD,GENERAL MANAGER-METROPOLITAN TRANSIT AUTHORITY,167411.18,0.0,400184.25,,567595.43,567595.43,2011,,San Francisco,
1,2,GARY JIMENEZ,CAPTAIN III (POLICE DEPARTMENT),155966.02,245131.88,137811.38,,538909.28,538909.28,2011,,San Francisco,
2,3,ALBERT PARDINI,CAPTAIN III (POLICE DEPARTMENT),212739.13,106088.18,16452.6,,335279.91,335279.91,2011,,San Francisco,
3,4,CHRISTOPHER CHONG,WIRE ROPE CABLE MAINTENANCE MECHANIC,77916.0,56120.71,198306.9,,332343.61,332343.61,2011,,San Francisco,
4,5,PATRICK GARDNER,"DEPUTY CHIEF OF DEPARTMENT,(FIRE DEPARTMENT)",134401.6,9737.0,182234.59,,326373.19,326373.19,2011,,San Francisco,


In [6]:
# 4. check schema of the dataframe
# Prints every column together with its inferred data type and nullability.
df.printSchema()

root
 |-- Id: integer (nullable = true)
 |-- EmployeeName: string (nullable = true)
 |-- JobTitle: string (nullable = true)
 |-- BasePay: double (nullable = true)
 |-- OvertimePay: double (nullable = true)
 |-- OtherPay: double (nullable = true)
 |-- Benefits: double (nullable = true)
 |-- TotalPay: double (nullable = true)
 |-- TotalPayBenefits: double (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Notes: string (nullable = true)
 |-- Agency: string (nullable = true)
 |-- Status: double (nullable = true)



In [7]:
# 5. how many records in the dataframe
# count() returns the number of rows in df.
df.count()

148654

In [8]:
# 6. What is the average BasePay
# A single aggregate over the whole dataframe, printed as a one-row table.
df.select(F.avg('BasePay')).show()

+-----------------+
|     avg(BasePay)|
+-----------------+
|66325.44884049208|
+-----------------+



In [9]:
# 7. What is the job title of JOSEPH DRISCOLL
# Exact, case-sensitive match on EmployeeName; the value is taken from the
# first matching row.
(
    df.where(df['EmployeeName'] == 'JOSEPH DRISCOLL')
    .select('JobTitle')
    .first()[0]
)

'CAPTAIN, FIRE SUPPRESSION'

In [10]:
# 8. How much does JOSEPH DRISCOLL make (including benefits)
# Exact, case-sensitive match on EmployeeName; the value is taken from the
# first matching row.
(
    df.where(df['EmployeeName'] == 'JOSEPH DRISCOLL')
    .select('TotalPayBenefits')
    .first()[0]
)

270324.91

In [11]:
# 9. What is the name of highest paid person (including benefits)?
# Order by TotalPayBenefits from largest to smallest and print the top row.
(
    df.select('EmployeeName', 'TotalPayBenefits')
    .orderBy(F.col('TotalPayBenefits').desc())
    .show(1)
)

+--------------+----------------+
|  EmployeeName|TotalPayBenefits|
+--------------+----------------+
|NATHANIEL FORD|       567595.43|
+--------------+----------------+
only showing top 1 row



In [12]:
# 10. What is the name of lowest paid person (including benefits)
# An ascending sort in Spark places NULLs first, so rows without a
# TotalPayBenefits value are removed before the smallest one is picked;
# otherwise such a row would be reported as the "lowest paid" person.
(
    df.where(F.col('TotalPayBenefits').isNotNull())
    .sort('TotalPayBenefits')
    .select(['EmployeeName', 'TotalPayBenefits'])
    .show(1)
)

+------------+----------------+
|EmployeeName|TotalPayBenefits|
+------------+----------------+
|   Joe Lopez|         -618.13|
+------------+----------------+
only showing top 1 row



In [13]:
# 11. What was the average (mean) BasePay of all employees per year?
# One output row per year, ordered chronologically.
(
    df.groupBy('year')
    .agg(F.avg('BasePay').alias('AvgBasePay'))
    .orderBy('year')
    .show()
)

+----+------------------+
|year|        AvgBasePay|
+----+------------------+
|2011|63595.956516774524|
|2012| 65436.40685742133|
|2013| 69630.03021647697|
|2014| 66564.42192449843|
+----+------------------+



In [14]:
# 11. How many unique job titles are there?
# Reduce to the JobTitle column, keep one row per distinct value, count them.
df.select('JobTitle').distinct().count()

2159

In [15]:
# 12. What are the top 5 most common jobs
# Count rows per JobTitle, order by that count from largest to smallest, and
# print the first five without truncating long titles.
(
    df.groupBy('JobTitle')
    .count()
    .orderBy(F.col('count').desc())
    .show(5, truncate=False)
)

+----------------------------+-----+
|JobTitle                    |count|
+----------------------------+-----+
|Transit Operator            |7036 |
|Special Nurse               |4389 |
|Registered Nurse            |3736 |
|Public Svc Aide-Public Works|2518 |
|Police Officer 3            |2421 |
+----------------------------+-----+
only showing top 5 rows



In [16]:
# 13. How many Job Titles were represented by only one person in 2013?
# Keep the 2013 rows, count rows per JobTitle, then count the titles whose
# row count is exactly one.
(
    df.filter(F.col('Year') == 2013)
    .groupBy('JobTitle')
    .count()
    .where(F.col('count') == 1)
    .count()
)

202

In [19]:
# 14. How many people have the word Chief or chief in their job title?
# The title is lower-cased before matching, so every capitalisation of
# "chief" is counted.
df.where(F.lower(F.col('JobTitle')).like('%chief%')).count()

627