# <center> 4. spark code challenge - solution </center>

Data Source: [SF Salaries Dataset](https://www.kaggle.com/kaggle/sf-salaries) from Kaggle

In [1]:
import findspark
import os

# Make the local Spark installation importable. Passing None (when
# SPARK_HOME is unset) lets findspark search for Spark itself instead of
# this cell failing with KeyError before anything else can run.
findspark.init(os.environ.get('SPARK_HOME'))

In [7]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [3]:
# 1. Create (or reuse) the Spark session used by the rest of the notebook
spark = (
    SparkSession.builder
    .appName('salary')
    .getOrCreate()
)

In [4]:
# 2. Load the salaries CSV: the first row is the header and column types are inferred
salary = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('./data/salaries.csv')
)

In [None]:
# 3. Preview the dataframe (defaults spelled out: 20 rows, long values truncated)
salary.show(n=20, truncate=True)

In [None]:
# 4. Check the schema of the dataframe
#    (the CSV was read with inferSchema=True, so these are the inferred column types)
salary.printSchema()

In [5]:
# 5. How many records (rows) are in the dataframe?
salary.count()

148654

In [8]:
# 6. What is the average BasePay across all records?
salary.agg(F.mean('BasePay')).show()

+-----------------+
|     avg(BasePay)|
+-----------------+
|66325.44884049208|
+-----------------+



In [9]:
# 7. Which job title does JOSEPH DRISCOLL hold?
(
    salary
    .where(F.col('EmployeeName') == 'JOSEPH DRISCOLL')
    .select('JobTitle')
    .show(1, truncate=False)
)

+-------------------------+
|JobTitle                 |
+-------------------------+
|CAPTAIN, FIRE SUPPRESSION|
+-------------------------+



In [10]:
# 8. How much does JOSEPH DRISCOLL make, benefits included?
(
    salary
    .where(F.col('EmployeeName') == 'JOSEPH DRISCOLL')
    .select('TotalPayBenefits')
    .show()
)

+----------------+
|TotalPayBenefits|
+----------------+
|       270324.91|
+----------------+



In [11]:
# 9. Who is the highest paid person (benefits included)?

(
    salary
    .orderBy(F.col('TotalPayBenefits').desc())
    .select('EmployeeName')
    .show(1, truncate=False)
)

+--------------+
|EmployeeName  |
+--------------+
|NATHANIEL FORD|
+--------------+
only showing top 1 row



In [12]:
# 9 (variant). Same question, but return the name as a plain Python string
# instead of printing a table.
salary.orderBy(F.desc('TotalPayBenefits')).take(1)[0]['EmployeeName']

'NATHANIEL FORD'

In [13]:
# 10. Who is the lowest paid person (benefits included)?

salary.orderBy(F.col('TotalPayBenefits').asc()).take(1)[0]['EmployeeName']

'Joe Lopez'

In [14]:
# 11. What was the mean BasePay of all employees in each year?

(
    salary
    .groupBy(F.col('Year'))
    .agg(F.mean(F.col('BasePay')))
    .show()
)

+----+------------------+
|Year|      avg(BasePay)|
+----+------------------+
|2013| 69630.03021647697|
|2014| 66564.42192449843|
|2012| 65436.40685742133|
|2011|63595.956516774524|
+----+------------------+



In [15]:
# 11. How many distinct job titles appear in the data?

(
    salary
    .select('JobTitle')
    .dropDuplicates()
    .count()
)

2159

In [16]:
# 12. Which 5 job titles occur most often?

(
    salary
    .groupBy('JobTitle')
    .count()
    .orderBy(F.desc('count'))
    .show(5, truncate=False)
)

+----------------------------+-----+
|JobTitle                    |count|
+----------------------------+-----+
|Transit Operator            |7036 |
|Special Nurse               |4389 |
|Registered Nurse            |3736 |
|Public Svc Aide-Public Works|2518 |
|Police Officer 3            |2421 |
+----------------------------+-----+
only showing top 5 rows



In [17]:
# 13. How many job titles were held by exactly one person in 2013?

(
    salary
    .where(F.col('Year') == 2013)
    .groupBy('JobTitle')
    .count()
    .where(F.col('count') == 1)
    .count()
)

202

In [18]:
# 14. How many people have the word Chief or chief in their job title?

def has_chief(title):
    """Flag whether a job title mentions "chief", ignoring letter case.

    Args:
        title: Job title string, or None when the title is missing
            (a Python UDF receives None for a SQL NULL).

    Returns:
        1 if "chief" occurs anywhere in the title in any casing, else 0.
        A missing title yields 0 instead of raising AttributeError.
    """
    if title is None:
        return 0
    return 1 if 'chief' in title.lower() else 0

In [19]:
# Wrap has_chief as a Spark UDF. The return type is declared explicitly:
# without it F.udf defaults to a string column, so the 1/0 flag would come
# back as '1'/'0' and numeric comparisons would rely on implicit casting.
has_chief_udf = F.udf(has_chief, 'int')

In [20]:
# Flag every row via the UDF, keep only the flagged rows, and count them
(
    salary
    .withColumn('has_chief', has_chief_udf(salary['JobTitle']))
    .where(F.col('has_chief') == 1)
    .count()
)

627