In [1]:
# pyspark initialization
import findspark
findspark.init()

In [2]:
#building sparksession
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

In [3]:
#loading data
mydata = spark.read.format("csv").option("header", "true").load("original.csv", sheet_name="original")
mydata.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|
|  3|    Alvera|  Di Boldi|Female|           NULL|                NULL|$57576.52|39.9947462|116.3397725|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.6489954|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|
|  7|     Masha|    Divers|Female|         Dachun|     

In [4]:
#importing all pyspark functions
from pyspark.sql.functions import *

In [5]:
#replacing NULL values with 'Unknown' for 'City' column and populating it to new column 'clean_city'
mydata2 = mydata.withColumn("clean_city", when(mydata.City.isNull(), 'Unknown').otherwise(mydata.City))
mydata2.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|
|  3|    Alvera|  Di Boldi|Female|           NULL|                NULL|$57576.52|39.9947462|116.3397725|        Unknown|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.6489954|      Mytishchi|
|  6|     Maris|      Folk|Femal

In [6]:
#filtering out NULL values from 'JobTitle' column
mydata2 = mydata2.filter(mydata2.JobTitle.isNotNull())
mydata2.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.6489954|      Mytishchi|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|Kinsealy-Drinan|
|  8|   Goddart|     Flear|  Mal

In [7]:
#removing $-sign from 'Salary' column and updating it to new column 'clean_salary'
mydata2 = mydata2.withColumn("clean_salary", mydata2.Salary.substr(2, 100).cast('float'))
mydata2.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|clean_salary|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|    57438.18|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|     62846.6|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|    61489.23|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.6489954|      Mytishchi|    63863.09|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil 

In [8]:
#calculating salary mean to replace salary NULL values
mean = mydata2.groupby().avg("clean_salary")
mean.show()
mean = mean.take(1)[0][0]
print(mean)

+-----------------+
|avg(clean_salary)|
+-----------------+
|55516.32088199837|
+-----------------+

55516.32088199837


In [9]:
#replacing NULL values with salary mean and updating it to new column 'new_salary'
from pyspark.sql.functions import lit 
mydata2 = mydata2.withColumn("new_salary", when(mydata2.clean_salary.isNull(), lit(mean)).otherwise(mydata2.clean_salary))
mydata2.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+------------+----------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|clean_salary|      new_salary|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+------------+----------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|    57438.18|   57438.1796875|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|     62846.6|   62846.6015625|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|    61489.23|  61489.23046875|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 3

In [10]:
#selecting only 'Latitude' column
latitudes = mydata2.select("Latitude")
latitudes.show()

+----------+
|  Latitude|
+----------+
|50.5774075|
|48.8231572|
|44.5047212|
|      NULL|
|53.4266145|
|45.1905186|
| 32.027934|
|  4.272793|
|     -5.85|
| 39.172378|
|49.8151822|
|42.1014803|
|49.7923299|
|43.4945737|
|52.7441662|
| 38.696249|
|-7.7232567|
|40.7172049|
|  49.16291|
|40.7576842|
+----------+
only showing top 20 rows



In [11]:
#filtering out NULL values from 'Latitude' column
latitudes = latitudes.filter(latitudes.Latitude.isNotNull())
latitudes.show()

+----------+
|  Latitude|
+----------+
|50.5774075|
|48.8231572|
|44.5047212|
|53.4266145|
|45.1905186|
| 32.027934|
|  4.272793|
|     -5.85|
| 39.172378|
|49.8151822|
|42.1014803|
|49.7923299|
|43.4945737|
|52.7441662|
| 38.696249|
|-7.7232567|
|40.7172049|
|  49.16291|
|40.7576842|
|48.4902808|
+----------+
only showing top 20 rows



In [12]:
#typecasting latitude values to float
latitudes = latitudes.withColumn('latitudes2', latitudes.Latitude.cast('float')).select('latitudes2')
latitudes.show()

+----------+
|latitudes2|
+----------+
| 50.577408|
|  48.82316|
| 44.504723|
| 53.426613|
| 45.190517|
| 32.027935|
|  4.272793|
|     -5.85|
|  39.17238|
|  49.81518|
|  42.10148|
|  49.79233|
| 43.494576|
| 52.744167|
| 38.696247|
|-7.7232566|
| 40.717205|
|  49.16291|
| 40.757683|
|  48.49028|
+----------+
only showing top 20 rows



In [13]:
#calculating median of latitudes
import numpy as np
median = np.median(latitudes.collect())
print(median)

31.93397331237793


In [14]:
#median imputation for NULL values of latitude
mydata2 = mydata2.withColumn('lat', when(mydata2.Latitude.isNull(), lit(median)).otherwise(mydata2.Latitude))
mydata2.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+------------+----------------+-----------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|     clean_city|clean_salary|      new_salary|              lat|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+---------------+------------+----------------+-----------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|      Nowa Ruda|    57438.18|   57438.1796875|       50.5774075|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|         Bulgan|     62846.6|   62846.6015625|       48.8231572|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|  Divnomorskoye|    61489.23|  61489.23046875|     

In [15]:
#Business Question 1: Between men and women, who get paid more on average?
import pyspark.sql.functions as sqlfunc
genders = mydata2.groupby('gender').agg(sqlfunc.avg('new_salary').alias('AvgSalary'))
genders.show()

+------+------------------+
|gender|         AvgSalary|
+------+------------------+
|Female|55677.250125558036|
|  Male| 55361.09385573019|
+------+------------------+



In [16]:
#Business Question 2 (Step 1): Split by job title, between men and women, who get paid more on average?
df = mydata2.withColumn('female_salary', when(mydata2.gender == 'Female', mydata2.new_salary).otherwise(lit(0)))
pdf = df.toPandas()
display(pdf)

Unnamed: 0,id,first_name,last_name,gender,City,JobTitle,Salary,Latitude,Longitude,clean_city,clean_salary,new_salary,lat,female_salary
0,1,Melinde,Shilburne,Female,Nowa Ruda,Assistant Professor,$57438.18,50.5774075,16.4967184,Nowa Ruda,57438.179688,57438.179688,50.5774075,57438.179688
1,2,Kimberly,Von Welden,Female,Bulgan,Programmer II,$62846.60,48.8231572,103.5218199,Bulgan,62846.601562,62846.601562,48.8231572,62846.601562
2,4,Shannon,O'Griffin,Male,Divnomorskoye,Budget/Accounting Analyst II,$61489.23,44.5047212,38.1300171,Divnomorskoye,61489.230469,61489.230469,44.5047212,0.000000
3,5,Sherwood,Macieja,Male,Mytishchi,VP Sales,$63863.09,,37.6489954,Mytishchi,63863.089844,63863.089844,31.93397331237793,0.000000
4,6,Maris,Folk,Female,Kinsealy-Drinan,Civil Engineer,$30101.16,53.4266145,-6.1644997,Kinsealy-Drinan,30101.160156,30101.160156,53.4266145,30101.160156
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,996,Kathye,Grasser,Female,Dzaoudzi,Accountant IV,$65520.45,-12.7822122,45.2582086,Dzaoudzi,65520.449219,65520.449219,-12.7822122,65520.449219
994,997,Haskell,Kempston,Male,Ban Talat Nua,Biostatistician I,$37021.92,7.9122744,98.3459726,Ban Talat Nua,37021.921875,37021.921875,7.9122744,0.000000
995,998,Holly-anne,Gerbl,Female,Guanaja,Speech Pathologist,$16200.10,16.4826614,-85.8793252,Guanaja,16200.099609,16200.099609,16.4826614,16200.099609
996,999,Marysa,Purdie,Female,Sioah,Desktop Support Technician,$95912.44,5.5864963,95.3514016,Sioah,95912.437500,95912.437500,5.5864963,95912.437500


In [17]:
#Business Question 2 (Step 2): Split by job title, between men and women, who get paid more on average?
df = df.withColumn('male_salary', when(df.gender == 'Male', df.new_salary).otherwise(lit(0)))
pdf = df.toPandas()
display(pdf)

Unnamed: 0,id,first_name,last_name,gender,City,JobTitle,Salary,Latitude,Longitude,clean_city,clean_salary,new_salary,lat,female_salary,male_salary
0,1,Melinde,Shilburne,Female,Nowa Ruda,Assistant Professor,$57438.18,50.5774075,16.4967184,Nowa Ruda,57438.179688,57438.179688,50.5774075,57438.179688,0.000000
1,2,Kimberly,Von Welden,Female,Bulgan,Programmer II,$62846.60,48.8231572,103.5218199,Bulgan,62846.601562,62846.601562,48.8231572,62846.601562,0.000000
2,4,Shannon,O'Griffin,Male,Divnomorskoye,Budget/Accounting Analyst II,$61489.23,44.5047212,38.1300171,Divnomorskoye,61489.230469,61489.230469,44.5047212,0.000000,61489.230469
3,5,Sherwood,Macieja,Male,Mytishchi,VP Sales,$63863.09,,37.6489954,Mytishchi,63863.089844,63863.089844,31.93397331237793,0.000000,63863.089844
4,6,Maris,Folk,Female,Kinsealy-Drinan,Civil Engineer,$30101.16,53.4266145,-6.1644997,Kinsealy-Drinan,30101.160156,30101.160156,53.4266145,30101.160156,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
993,996,Kathye,Grasser,Female,Dzaoudzi,Accountant IV,$65520.45,-12.7822122,45.2582086,Dzaoudzi,65520.449219,65520.449219,-12.7822122,65520.449219,0.000000
994,997,Haskell,Kempston,Male,Ban Talat Nua,Biostatistician I,$37021.92,7.9122744,98.3459726,Ban Talat Nua,37021.921875,37021.921875,7.9122744,0.000000,37021.921875
995,998,Holly-anne,Gerbl,Female,Guanaja,Speech Pathologist,$16200.10,16.4826614,-85.8793252,Guanaja,16200.099609,16200.099609,16.4826614,16200.099609,0.000000
996,999,Marysa,Purdie,Female,Sioah,Desktop Support Technician,$95912.44,5.5864963,95.3514016,Sioah,95912.437500,95912.437500,5.5864963,95912.437500,0.000000


In [18]:
#Business Question 2 (Step 3): Split by job title, between men and women, who get paid more on average?
df = df.groupBy('JobTitle').agg(sqlfunc.avg('female_salary').alias('final_female_salary'), sqlfunc.avg('male_salary').alias('final_male_salary'))
df.show()

+--------------------+-------------------+------------------+
|            JobTitle|final_female_salary| final_male_salary|
+--------------------+-------------------+------------------+
|Systems Administr...|    50590.474609375|  15540.9501953125|
|   Media Manager III| 29586.436197916668|17381.920572916668|
|  Recruiting Manager| 34848.452473958336|  26383.4951171875|
|       Geologist III|       31749.046875|    12830.75390625|
|        Geologist II|                0.0|   43293.865234375|
|Database Administ...|                0.0|     52018.4609375|
|   Financial Analyst|    23353.776953125|       39606.05625|
|  Analyst Programmer|   16406.1287109375|  21042.9634765625|
|Software Engineer II|                0.0|      74782.640625|
|       Accountant IV|    82732.248046875|               0.0|
|    Product Engineer|     41825.48359375|       20464.94375|
|Software Test Eng...|   32218.6083984375|   27122.462890625|
|Safety Technician...|                0.0|   29421.529296875|
|    Jun

In [19]:
#Business Question 2 (Step 4): Split by job title, between men and women, who get paid more on average?
df = df.withColumn('delta', df.final_female_salary - df.final_male_salary)
df.show()

+--------------------+-------------------+------------------+-------------------+
|            JobTitle|final_female_salary| final_male_salary|              delta|
+--------------------+-------------------+------------------+-------------------+
|Systems Administr...|    50590.474609375|  15540.9501953125|   35049.5244140625|
|   Media Manager III| 29586.436197916668|17381.920572916668|       12204.515625|
|  Recruiting Manager| 34848.452473958336|  26383.4951171875|  8464.957356770836|
|       Geologist III|       31749.046875|    12830.75390625|     18918.29296875|
|        Geologist II|                0.0|   43293.865234375|   -43293.865234375|
|Database Administ...|                0.0|     52018.4609375|     -52018.4609375|
|   Financial Analyst|    23353.776953125|       39606.05625|   -16252.279296875|
|  Analyst Programmer|   16406.1287109375|  21042.9634765625| -4636.834765625001|
|Software Engineer II|                0.0|      74782.640625|      -74782.640625|
|       Accounta

In [20]:
#Business Question 3: Which city has the highest average salary? Does geography impact the average salary?
cityavg = mydata2.groupBy('City').agg(sqlfunc.avg('new_salary').alias('avgsalary'))
cityavg = cityavg.sort(col('avgsalary').desc())
cityavg.show()

+-----------------+-------------+
|             City|    avgsalary|
+-----------------+-------------+
|        Mesopotam|  99948.28125|
|       Zhongcheng| 99942.921875|
|           Caxias|99786.3984375|
|      Karangtawar|99638.9921875|
|        Itabaiana|  99502.15625|
|           Pasian|  99421.34375|
|           Webuye| 99368.546875|
|      Yuktae-dong| 99250.828125|
|           Zinder|  99222.84375|
|   Timiryazevskiy|   99142.9375|
|        Sawahbaru|99013.7109375|
|          Madimba|98737.8671875|
|         Huangshi|  98690.34375|
|          Gharyan|   98679.3125|
|         Yŏnan-ŭp| 98628.609375|
|     Wringinputih|98603.8203125|
|Monte da Boavista|  98586.71875|
|          Klukeng|98439.4921875|
|         Murmashi|  98226.15625|
|        Fox Creek|      98138.0|
+-----------------+-------------+
only showing top 20 rows

