In [2]:
import pyspark


In [3]:
from pyspark.sql import SparkSession


In [4]:
spark = SparkSession.builder.getOrCreate()


In [5]:
spark

In [6]:
mydata = spark.read.format('csv').option("header",True).load("salary.csv")

In [7]:
mydata.show()

+----+------+-----------------+--------------------+-----------------+-------+
| Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|
+----+------+-----------------+--------------------+-----------------+-------+
|  32|  Male|       Bachelor's|   Software Engineer|                5| $90000|
|  28|Female|         Master's|        Data Analyst|                3| $65000|
|  45|  Male|              PhD|      Senior Manager|               15|$150000|
|  27|  Male|             NULL|           Developer|                7|$100000|
|  36|Female|       Bachelor's|     Sales Associate|                7| $60000|
|  52|  Male|         Master's|            Director|               20|$200000|
|  29|  Male|       Bachelor's|   Marketing Analyst|                2| $55000|
|  42|Female|         Master's|     Product Manager|               12|$120000|
|  31|  Male|       Bachelor's|       Sales Manager|                4| $80000|
|  26|Female|Bachelor's Degree|            Social M|

In [8]:
mydata.count()

6703

In [9]:
mydata.describe().show()

+-------+-----------------+------+---------------+---------------+-----------------+-------+
|summary|              Age|Gender|Education Level|       JobTitle|YearsOfExperience| Salary|
+-------+-----------------+------+---------------+---------------+-----------------+-------+
|  count|             6702|  6702|           6701|           6702|             6701|   6699|
|   mean|33.62085944494181|  NULL|           NULL|           NULL|8.094687360095508|   NULL|
| stddev|7.614632626251294|  NULL|           NULL|           NULL|6.059003056634107|   NULL|
|    min|               21|Female|     Bachelor's|Account Manager|                0|$100000|
|    max|               62| Other|            phD|  Web Developer|                9| $99747|
+-------+-----------------+------+---------------+---------------+-----------------+-------+



In [10]:
from pyspark.sql.functions import *

In [11]:
mydata = mydata.withColumn("Education", when(mydata["Education Level"].isNull(),"unknown").otherwise(mydata["Education Level"]))

In [12]:
mydata.show()

+----+------+-----------------+--------------------+-----------------+-------+-----------------+
| Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|        Education|
+----+------+-----------------+--------------------+-----------------+-------+-----------------+
|  32|  Male|       Bachelor's|   Software Engineer|                5| $90000|       Bachelor's|
|  28|Female|         Master's|        Data Analyst|                3| $65000|         Master's|
|  45|  Male|              PhD|      Senior Manager|               15|$150000|              PhD|
|  27|  Male|             NULL|           Developer|                7|$100000|          unknown|
|  36|Female|       Bachelor's|     Sales Associate|                7| $60000|       Bachelor's|
|  52|  Male|         Master's|            Director|               20|$200000|         Master's|
|  29|  Male|       Bachelor's|   Marketing Analyst|                2| $55000|       Bachelor's|
|  42|Female|         Master's

In [13]:
mydata = mydata.filter(mydata.Age.isNotNull())   

In [14]:
mydata.show()

+---+------+-----------------+--------------------+-----------------+-------+-----------------+
|Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|        Education|
+---+------+-----------------+--------------------+-----------------+-------+-----------------+
| 32|  Male|       Bachelor's|   Software Engineer|                5| $90000|       Bachelor's|
| 28|Female|         Master's|        Data Analyst|                3| $65000|         Master's|
| 45|  Male|              PhD|      Senior Manager|               15|$150000|              PhD|
| 27|  Male|             NULL|           Developer|                7|$100000|          unknown|
| 36|Female|       Bachelor's|     Sales Associate|                7| $60000|       Bachelor's|
| 52|  Male|         Master's|            Director|               20|$200000|         Master's|
| 29|  Male|       Bachelor's|   Marketing Analyst|                2| $55000|       Bachelor's|
| 42|Female|         Master's|     Produ

In [15]:
mydata = mydata.withColumn("cleaned_salary", mydata.Salary.substr(2,100).cast("float"))

In [16]:
mydata.show()

+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+
|Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|        Education|cleaned_salary|
+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+
| 32|  Male|       Bachelor's|   Software Engineer|                5| $90000|       Bachelor's|       90000.0|
| 28|Female|         Master's|        Data Analyst|                3| $65000|         Master's|       65000.0|
| 45|  Male|              PhD|      Senior Manager|               15|$150000|              PhD|      150000.0|
| 27|  Male|             NULL|           Developer|                7|$100000|          unknown|      100000.0|
| 36|Female|       Bachelor's|     Sales Associate|                7| $60000|       Bachelor's|       60000.0|
| 52|  Male|         Master's|            Director|               20|$200000|         Master's|      200000.0|
|

In [17]:
mean_salary = mydata.select(avg("cleaned_salary"))
mean_salary.show()


+-------------------+
|avg(cleaned_salary)|
+-------------------+
| 115326.96477086132|
+-------------------+



In [18]:
mean = mydata.select(avg("cleaned_salary")).take(1)[0][0]
mean

115326.96477086132

In [19]:
mydata = mydata.withColumn("new_salary", when(mydata["cleaned_salary"].isNull(),mean).otherwise(mydata["cleaned_salary"]))

In [20]:
mydata.show()

+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+------------------+
|Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|        Education|cleaned_salary|        new_salary|
+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+------------------+
| 32|  Male|       Bachelor's|   Software Engineer|                5| $90000|       Bachelor's|       90000.0|           90000.0|
| 28|Female|         Master's|        Data Analyst|                3| $65000|         Master's|       65000.0|           65000.0|
| 45|  Male|              PhD|      Senior Manager|               15|$150000|              PhD|      150000.0|          150000.0|
| 27|  Male|             NULL|           Developer|                7|$100000|          unknown|      100000.0|          100000.0|
| 36|Female|       Bachelor's|     Sales Associate|                7| $60000|       Bachel

In [21]:
years_experience = mydata.select("YearsOfExperience")


In [22]:
years_experience = years_experience.filter(years_experience.YearsOfExperience.isNotNull())
years_experience.show()

+-----------------+
|YearsOfExperience|
+-----------------+
|                5|
|                3|
|               15|
|                7|
|                7|
|               20|
|                2|
|               12|
|                4|
|                1|
|               10|
|                3|
|               18|
|                8|
|                6|
|               14|
|                6|
|                2|
|               16|
|                7|
+-----------------+
only showing top 20 rows



In [23]:
years_experience = years_experience.withColumn("yearsXP", years_experience.YearsOfExperience.cast("float"))
years_experience.show()

+-----------------+-------+
|YearsOfExperience|yearsXP|
+-----------------+-------+
|                5|    5.0|
|                3|    3.0|
|               15|   15.0|
|                7|    7.0|
|                7|    7.0|
|               20|   20.0|
|                2|    2.0|
|               12|   12.0|
|                4|    4.0|
|                1|    1.0|
|               10|   10.0|
|                3|    3.0|
|               18|   18.0|
|                8|    8.0|
|                6|    6.0|
|               14|   14.0|
|                6|    6.0|
|                2|    2.0|
|               16|   16.0|
|                7|    7.0|
+-----------------+-------+
only showing top 20 rows



In [25]:
median_years = np.median(years_experience.select("YearsXP").collect())
print(median_years)

7.0


In [27]:
mydata=mydata.withColumn("YearsXP", when(mydata["YearsOfExperience"].isNull(),median_years).otherwise(mydata["YearsOfExperience"]))
mydata.show()

+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+------------------+-------+
|Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|        Education|cleaned_salary|        new_salary|YearsXP|
+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+------------------+-------+
| 32|  Male|       Bachelor's|   Software Engineer|                5| $90000|       Bachelor's|       90000.0|           90000.0|      5|
| 28|Female|         Master's|        Data Analyst|                3| $65000|         Master's|       65000.0|           65000.0|      3|
| 45|  Male|              PhD|      Senior Manager|               15|$150000|              PhD|      150000.0|          150000.0|     15|
| 27|  Male|             NULL|           Developer|                7|$100000|          unknown|      100000.0|          100000.0|      7|
| 36|Female|       Bachelor's|    

In [28]:
mydata.columns

['Age',
 'Gender',
 'Education Level',
 'JobTitle',
 'YearsOfExperience',
 'Salary',
 'Education',
 'cleaned_salary',
 'new_salary',
 'YearsXP']

In [31]:
mydata2=mydata.select(['Age',
 'Gender',
 'Education',
 'JobTitle',
 'new_salary',
 'YearsXP'])

In [32]:
mydata2.show()

+---+------+-----------------+--------------------+------------------+-------+
|Age|Gender|        Education|            JobTitle|        new_salary|YearsXP|
+---+------+-----------------+--------------------+------------------+-------+
| 32|  Male|       Bachelor's|   Software Engineer|           90000.0|      5|
| 28|Female|         Master's|        Data Analyst|           65000.0|      3|
| 45|  Male|              PhD|      Senior Manager|          150000.0|     15|
| 27|  Male|          unknown|           Developer|          100000.0|      7|
| 36|Female|       Bachelor's|     Sales Associate|           60000.0|      7|
| 52|  Male|         Master's|            Director|          200000.0|     20|
| 29|  Male|       Bachelor's|   Marketing Analyst|           55000.0|      2|
| 42|Female|         Master's|     Product Manager|          120000.0|     12|
| 31|  Male|       Bachelor's|       Sales Manager|           80000.0|      4|
| 26|Female|Bachelor's Degree|            Social M|1

In [33]:
import pyspark.sql.functions as F

In [34]:
paid_job_gender_avg = mydata2.groupby("gender").agg(F.avg("new_salary").alias("avg_salary"))
paid_job_gender_max = mydata2.groupby("gender").agg(F.max("new_salary").alias("max_salary"))

In [35]:
paid_job_gender_avg.show()
paid_job_gender_max.show()

+------+------------------+
|gender|        avg_salary|
+------+------------------+
|Female|107891.46647802617|
| Other|125869.85714285714|
|  Male|121386.57047619535|
+------+------------------+

+------+----------+
|gender|max_salary|
+------+----------+
|Female|  220000.0|
| Other|  166109.0|
|  Male|  250000.0|
+------+----------+



In [39]:
paid_job_gender=mydata2.withColumn("female_salary", when(mydata2.Gender =="Female",mydata2.new_salary).otherwise(0))
paid_job_gender=paid_job_gender.withColumn("male_salary", when(mydata2.Gender =="Male",mydata2.new_salary).otherwise(0))  
paid_job_gender.show()      

+---+------+-----------------+--------------------+------------------+-------+------------------+------------------+
|Age|Gender|        Education|            JobTitle|        new_salary|YearsXP|     female_salary|       male_salary|
+---+------+-----------------+--------------------+------------------+-------+------------------+------------------+
| 32|  Male|       Bachelor's|   Software Engineer|           90000.0|      5|               0.0|           90000.0|
| 28|Female|         Master's|        Data Analyst|           65000.0|      3|           65000.0|               0.0|
| 45|  Male|              PhD|      Senior Manager|          150000.0|     15|               0.0|          150000.0|
| 27|  Male|          unknown|           Developer|          100000.0|      7|               0.0|          100000.0|
| 36|Female|       Bachelor's|     Sales Associate|           60000.0|      7|           60000.0|               0.0|
| 52|  Male|         Master's|            Director|          200

In [41]:
avg_paid_job_gender = paid_job_gender.groupBy("jobtitle").agg(
    F.avg("Female_salary").alias("avg_female_salary"),
    F.avg("Male_salary").alias("avg_male_salary")
)

In [42]:
avg_paid_job_gender.show()

+--------------------+------------------+------------------+
|            jobtitle| avg_female_salary|   avg_male_salary|
+--------------------+------------------+------------------+
|Digital Marketing...|47980.769230769234| 45288.46153846154|
|    Product Designer|               0.0|           56200.0|
|Senior Sales Manager|               0.0|          132500.0|
|Senior Software A...|               0.0|          120000.0|
|   Financial Manager|115373.13432835821|27686.567164179105|
|   Event Coordinator|           52500.0|               0.0|
|   Financial Analyst|           10000.0| 77692.30769230769|
|Senior Product Ma...|  86376.8115942029|40217.391304347824|
|Business Intellig...|               0.0|           85000.0|
|Junior Software E...|23705.882352941175|26666.666666666668|
|Senior IT Consultant|               0.0|          110000.0|
|     Sales Executive|28157.894736842107|14868.421052631578|
|   Senior Researcher|               0.0|          150000.0|
|Senior Project En...| 4

In [44]:
job_avg_paid = mydata2.groupby("jobtitle").agg(F.avg("new_salary").alias("avg_salary")).orderBy("avg_salary",ascending=False)
job_avg_paid.show()

+--------------------+------------------+
|            jobtitle|        avg_salary|
+--------------------+------------------+
|                 CEO|          250000.0|
|Chief Technology ...|          250000.0|
|  Chief Data Officer|          220000.0|
|Director of Data ...|204561.40350877194|
|       VP of Finance|          200000.0|
|            Director|          200000.0|
| Operations Director|          190000.0|
|    VP of Operations|          190000.0|
|Director of Human...|          187500.0|
|  Marketing Director|        183984.375|
|Director of Sales...|          180000.0|
|Director of Human...|          180000.0|
|Human Resources D...|          180000.0|
|Director of Engin...|          180000.0|
| Director of Finance|          175000.0|
|Director of Produ...|          175000.0|
|   Director of Sales|          175000.0|
|Director of Opera...|172727.27272727274|
|Software Engineer...|172502.16755319148|
|Director of Busin...|          170000.0|
+--------------------+------------