In [58]:
import pyspark


In [59]:
from pyspark.sql import SparkSession


In [60]:
spark = SparkSession.builder.getOrCreate()


In [61]:
spark

In [62]:
mydata = spark.read.format('csv').option("header",True).load("salary.csv")

In [63]:
mydata.show()

+----+------+-----------------+--------------------+-----------------+-------+
| Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|
+----+------+-----------------+--------------------+-----------------+-------+
|  32|  Male|       Bachelor's|   Software Engineer|                5| $90000|
|  28|Female|         Master's|        Data Analyst|                3| $65000|
|  45|  Male|              PhD|      Senior Manager|               15|$150000|
|  27|  Male|             NULL|           Developer|                7|$100000|
|  36|Female|       Bachelor's|     Sales Associate|                7| $60000|
|  52|  Male|         Master's|            Director|               20|$200000|
|  29|  Male|       Bachelor's|   Marketing Analyst|                2| $55000|
|  42|Female|         Master's|     Product Manager|               12|$120000|
|  31|  Male|       Bachelor's|       Sales Manager|                4| $80000|
|  26|Female|Bachelor's Degree|            Social M|

In [64]:
mydata.count()

6703

In [65]:
mydata.describe().show()

+-------+-----------------+------+---------------+---------------+-----------------+-------+
|summary|              Age|Gender|Education Level|       JobTitle|YearsOfExperience| Salary|
+-------+-----------------+------+---------------+---------------+-----------------+-------+
|  count|             6702|  6702|           6701|           6702|             6701|   6699|
|   mean|33.62085944494181|  NULL|           NULL|           NULL|8.094687360095508|   NULL|
| stddev|7.614632626251294|  NULL|           NULL|           NULL|6.059003056634107|   NULL|
|    min|               21|Female|     Bachelor's|Account Manager|                0|$100000|
|    max|               62| Other|            phD|  Web Developer|                9| $99747|
+-------+-----------------+------+---------------+---------------+-----------------+-------+



In [66]:
from pyspark.sql.functions import *

In [67]:
mydata = mydata.withColumn("Education", when(mydata["Education Level"].isNull(),"unknown").otherwise(mydata["Education Level"]))

In [68]:
mydata.show()

+----+------+-----------------+--------------------+-----------------+-------+-----------------+
| Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|        Education|
+----+------+-----------------+--------------------+-----------------+-------+-----------------+
|  32|  Male|       Bachelor's|   Software Engineer|                5| $90000|       Bachelor's|
|  28|Female|         Master's|        Data Analyst|                3| $65000|         Master's|
|  45|  Male|              PhD|      Senior Manager|               15|$150000|              PhD|
|  27|  Male|             NULL|           Developer|                7|$100000|          unknown|
|  36|Female|       Bachelor's|     Sales Associate|                7| $60000|       Bachelor's|
|  52|  Male|         Master's|            Director|               20|$200000|         Master's|
|  29|  Male|       Bachelor's|   Marketing Analyst|                2| $55000|       Bachelor's|
|  42|Female|         Master's

In [69]:
mydata = mydata.filter(mydata.Age.isNotNull())   

In [70]:
mydata.show()

+---+------+-----------------+--------------------+-----------------+-------+-----------------+
|Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|        Education|
+---+------+-----------------+--------------------+-----------------+-------+-----------------+
| 32|  Male|       Bachelor's|   Software Engineer|                5| $90000|       Bachelor's|
| 28|Female|         Master's|        Data Analyst|                3| $65000|         Master's|
| 45|  Male|              PhD|      Senior Manager|               15|$150000|              PhD|
| 27|  Male|             NULL|           Developer|                7|$100000|          unknown|
| 36|Female|       Bachelor's|     Sales Associate|                7| $60000|       Bachelor's|
| 52|  Male|         Master's|            Director|               20|$200000|         Master's|
| 29|  Male|       Bachelor's|   Marketing Analyst|                2| $55000|       Bachelor's|
| 42|Female|         Master's|     Produ

In [71]:
mydata = mydata.withColumn("cleaned_salary", mydata.Salary.substr(2,100).cast("float"))

In [72]:
mydata.show()

+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+
|Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|        Education|cleaned_salary|
+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+
| 32|  Male|       Bachelor's|   Software Engineer|                5| $90000|       Bachelor's|       90000.0|
| 28|Female|         Master's|        Data Analyst|                3| $65000|         Master's|       65000.0|
| 45|  Male|              PhD|      Senior Manager|               15|$150000|              PhD|      150000.0|
| 27|  Male|             NULL|           Developer|                7|$100000|          unknown|      100000.0|
| 36|Female|       Bachelor's|     Sales Associate|                7| $60000|       Bachelor's|       60000.0|
| 52|  Male|         Master's|            Director|               20|$200000|         Master's|      200000.0|
|

In [73]:
mean_salary = mydata.select(avg("cleaned_salary"))
mean_salary.show()


+-------------------+
|avg(cleaned_salary)|
+-------------------+
| 115326.96477086132|
+-------------------+



In [74]:
mean = mydata.select(avg("cleaned_salary")).take(1)[0][0]
mean

115326.96477086132

In [75]:
mydata = mydata.withColumn("new_salary", when(mydata["cleaned_salary"].isNull(),mean).otherwise(mydata["cleaned_salary"]))

In [76]:
mydata.show()

+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+------------------+
|Age|Gender|  Education Level|            JobTitle|YearsOfExperience| Salary|        Education|cleaned_salary|        new_salary|
+---+------+-----------------+--------------------+-----------------+-------+-----------------+--------------+------------------+
| 32|  Male|       Bachelor's|   Software Engineer|                5| $90000|       Bachelor's|       90000.0|           90000.0|
| 28|Female|         Master's|        Data Analyst|                3| $65000|         Master's|       65000.0|           65000.0|
| 45|  Male|              PhD|      Senior Manager|               15|$150000|              PhD|      150000.0|          150000.0|
| 27|  Male|             NULL|           Developer|                7|$100000|          unknown|      100000.0|          100000.0|
| 36|Female|       Bachelor's|     Sales Associate|                7| $60000|       Bachel

In [77]:
years_experience = mydata.select("YearsOfExperience").show()
years_experience

+-----------------+
|YearsOfExperience|
+-----------------+
|                5|
|                3|
|               15|
|                7|
|                7|
|               20|
|                2|
|               12|
|                4|
|             NULL|
|                1|
|               10|
|                3|
|               18|
|                8|
|                6|
|               14|
|                6|
|                2|
|               16|
+-----------------+
only showing top 20 rows



In [None]:
years_experience = years_experience.filter(years_experience.YearsOfExperience.isNotNull())
years_experience.show()

AttributeError: 'NoneType' object has no attribute 'filter'