In [1]:
#initializing pyspark
import findspark
findspark.init()

In [2]:
#building sparksession
import pyspark
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("local[*]").getOrCreate()
spark

In [3]:
#reading CSV with syntax_1
df = spark.read.format("csv").option("header", "true").load("original.csv")
df.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|
|  3|    Alvera|  Di Boldi|Female|           NULL|                NULL|$57576.52|39.9947462|116.3397725|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.6489954|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|
|  7|     Masha|    Divers|Female|         Dachun|     

In [4]:
#reading CSV with syntax_2
df1 = spark.read.csv("original.csv", header=True)
df1.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude|  Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+-----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.4967184|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572|103.5218199|
|  3|    Alvera|  Di Boldi|Female|           NULL|                NULL|$57576.52|39.9947462|116.3397725|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.1300171|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.6489954|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145| -6.1644997|
|  7|     Masha|    Divers|Female|         Dachun|     

In [5]:
#printing datatypes of a DataFrame
df1.dtypes

[('id', 'string'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('City', 'string'),
 ('JobTitle', 'string'),
 ('Salary', 'string'),
 ('Latitude', 'string'),
 ('Longitude', 'string')]

In [6]:
#customized schema
from pyspark.sql.types import *
myschema = StructType([
    StructField("id", IntegerType()),
    StructField("first_name", StringType()),
    StructField("last_name", StringType()),
    StructField("gender", StringType()),
    StructField("City", StringType()),
    StructField("JobTitle", StringType()),
    StructField("Salary", StringType()),
    StructField("Latitude", StringType()),
    StructField("Longitude", FloatType())
])
df2 = spark.read.csv("original.csv", header=True, schema=myschema)
df2.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|
|  3|    Alvera|  Di Boldi|Female|           NULL|                NULL|$57576.52|39.9947462|116.339775|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.648994|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145|-6.1644998|
|  7|     Masha|    Divers|Female|         Dachun|              

In [7]:
#printing datatypes of a DataFrame with customized schema
df2.dtypes

[('id', 'int'),
 ('first_name', 'string'),
 ('last_name', 'string'),
 ('gender', 'string'),
 ('City', 'string'),
 ('JobTitle', 'string'),
 ('Salary', 'string'),
 ('Latitude', 'string'),
 ('Longitude', 'float')]

In [8]:
#printing top 10 rows
df2.head(10)

[Row(id=1, first_name='Melinde', last_name='Shilburne', gender='Female', City='Nowa Ruda', JobTitle='Assistant Professor', Salary='$57438.18', Latitude='50.5774075', Longitude=16.49671745300293),
 Row(id=2, first_name='Kimberly', last_name='Von Welden', gender='Female', City='Bulgan', JobTitle='Programmer II', Salary='$62846.60', Latitude='48.8231572', Longitude=103.52182006835938),
 Row(id=3, first_name='Alvera', last_name='Di Boldi', gender='Female', City=None, JobTitle=None, Salary='$57576.52', Latitude='39.9947462', Longitude=116.33977508544922),
 Row(id=4, first_name='Shannon', last_name="O'Griffin", gender='Male', City='Divnomorskoye', JobTitle='Budget/Accounting Analyst II', Salary='$61489.23', Latitude='44.5047212', Longitude=38.1300163269043),
 Row(id=5, first_name='Sherwood', last_name='Macieja', gender='Male', City='Mytishchi', JobTitle='VP Sales', Salary='$63863.09', Latitude=None, Longitude=37.64899444580078),
 Row(id=6, first_name='Maris', last_name='Folk', gender='Female

In [9]:
#printing headers with schema
df2.first

<bound method DataFrame.first of DataFrame[id: int, first_name: string, last_name: string, gender: string, City: string, JobTitle: string, Salary: string, Latitude: string, Longitude: float]>

In [10]:
#printing basic statistical parameters of columns
df2.describe().show()

+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+-----------------+-----------------+
|summary|               id|first_name|last_name|gender|               City|           JobTitle|   Salary|         Latitude|        Longitude|
+-------+-----------------+----------+---------+------+-------------------+-------------------+---------+-----------------+-----------------+
|  count|             1000|      1000|     1000|  1000|                999|                998|     1000|              999|             1000|
|   mean|            500.5|      NULL|     NULL|  NULL|               NULL|               NULL|     NULL|25.43151724234234|43.33756460386515|
| stddev|288.8194360957494|      NULL|     NULL|  NULL|               NULL|               NULL|     NULL| 24.5790825486909| 69.4206453674681|
|    min|                1|   Abagail|    Abbay|Female|             Abéché|Account Coordinator|$10101.92|       -0.6256517|       -123.04196|
|    m

In [11]:
#printing columns
df2.columns

['id',
 'first_name',
 'last_name',
 'gender',
 'City',
 'JobTitle',
 'Salary',
 'Latitude',
 'Longitude']

In [12]:
#printing no. of rows
df2.count()

1000

In [13]:
#printing no. of distinct rows 
df2.distinct().count()

1000

In [14]:
#dropping 'NA' values
df2_dropped = df2.na.drop()
df2_dropped.show(5)

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145|-6.1644998|
|  8|   Goddart|     Flear|  Male|      Trélissac|Desktop Support T...|$46116.36|45.1905186| 0.7423124|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
only showing top 5 rows



In [15]:
#extracting data for which 'JobTitle' is NotNull
df2_null_jobs = df2.filter(df2.JobTitle.isNotNull())
df2_null_jobs.show(5)

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.648994|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil Engineer|$30101.16|53.4266145|-6.1644998|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+
only showing top 5 rows



In [16]:
#replacing NULL values with 'Unknown' keyword for 'City' column
from pyspark.sql.functions import *
df2_handled = df2.withColumn("clean_city", when(df2.City.isNull(), "Unknown").otherwise(df2.City))
df2_handled.show(5)

+---+----------+----------+------+-------------+--------------------+---------+----------+----------+-------------+
| id|first_name| last_name|gender|         City|            JobTitle|   Salary|  Latitude| Longitude|   clean_city|
+---+----------+----------+------+-------------+--------------------+---------+----------+----------+-------------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|    Nowa Ruda|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|       Bulgan|
|  3|    Alvera|  Di Boldi|Female|         NULL|                NULL|$57576.52|39.9947462|116.339775|      Unknown|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|Divnomorskoye|
|  5|  Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|      NULL| 37.648994|    Mytishchi|
+---+----------+----------+------+-------------+--------------------+---

In [17]:
#dropping duplicates
df2_no_duplicates = df2.dropDuplicates()
df2_no_duplicates.show(5)

+---+----------+---------+------+------------------+--------------------+---------+-----------+----------+
| id|first_name|last_name|gender|              City|            JobTitle|   Salary|   Latitude| Longitude|
+---+----------+---------+------+------------------+--------------------+---------+-----------+----------+
|159|Georgianne|Henriques|Female|            Jinsha|Environmental Spe...|$63954.52|  30.679359|104.011665|
|215|     Angel|  Robjant|  Male|   Bambous Virieux|Occupational Ther...|$96856.73|-20.3438619| 57.763683|
|377|     Noach|  Golling|  Male|          Yuanqiao| Executive Secretary|$72247.30|    32.5639| 120.39677|
|525|     Terry|   Layton|Female|San Pedro Masahuat|    Dental Hygienist|$10808.16| 13.5432995| -89.03824|
|833|    Damara|  Beaford|Female|           Połomia|Environmental Spe...|$10616.44| 49.9890993| 18.569973|
+---+----------+---------+------+------------------+--------------------+---------+-----------+----------+
only showing top 5 rows



In [18]:
#selecting specific columns
df2_select = df2.select("first_name", "last_name")
df2_select.show(5)

+----------+----------+
|first_name| last_name|
+----------+----------+
|   Melinde| Shilburne|
|  Kimberly|Von Welden|
|    Alvera|  Di Boldi|
|   Shannon| O'Griffin|
|  Sherwood|   Macieja|
+----------+----------+
only showing top 5 rows



In [19]:
#renaming column
df2_renamed = df2.withColumnRenamed("first_name", "fn")
df2_renamed.show(5)

+---+--------+----------+------+-------------+--------------------+---------+----------+----------+
| id|      fn| last_name|gender|         City|            JobTitle|   Salary|  Latitude| Longitude|
+---+--------+----------+------+-------------+--------------------+---------+----------+----------+
|  1| Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|
|  2|Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|
|  3|  Alvera|  Di Boldi|Female|         NULL|                NULL|$57576.52|39.9947462|116.339775|
|  4| Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|
|  5|Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|      NULL| 37.648994|
+---+--------+----------+------+-------------+--------------------+---------+----------+----------+
only showing top 5 rows



In [20]:
#filtering specific row w.r.t 'last_name'
df2_filter = df2.filter(df2.last_name == "Von Welden")
df2_filter.show()

+---+----------+----------+------+------+-------------+---------+----------+---------+
| id|first_name| last_name|gender|  City|     JobTitle|   Salary|  Latitude|Longitude|
+---+----------+----------+------+------+-------------+---------+----------+---------+
|  2|  Kimberly|Von Welden|Female|Bulgan|Programmer II|$62846.60|48.8231572|103.52182|
+---+----------+----------+------+------+-------------+---------+----------+---------+



In [21]:
#filtering rows w.r.t the presence of a phrase in 'first_name' column
# df2_filter = df2.filter(df2.first_name.like("%era")) #ending with 'era'
# df2_filter = df2.filter(df2.first_name.like("Kim%")) #starting with 'Kim'
df2_filter = df2.filter(df2.first_name.like("%era%")) #contains 'era'
df2_filter.show()

+---+----------+----------+------+-------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|   City|            JobTitle|   Salary|  Latitude| Longitude|
+---+----------+----------+------+-------+--------------------+---------+----------+----------+
|  3|    Alvera|  Di Boldi|Female|   NULL|                NULL|$57576.52|39.9947462|116.339775|
|177|   Gerardo|Pentercost|  Male|  Samal|    Technical Writer|$22930.28|14.6874444| 120.96388|
|328|   Veradis| Churchman|Female|Baalbek|       Social Worker|$59975.62|34.0046888|  36.21104|
|394|      Vera| Rosenbarg|Female|Šluknov|Research Assistan...|$49332.48|51.0031913| 14.442367|
|750|Fitzgerald|  Moorwood|  Male| Lamego|Human Resources A...|$64970.85|41.0953745|-7.8123803|
+---+----------+----------+------+-------+--------------------+---------+----------+----------+



In [22]:
#filtering rows for which 'first_name' column ends with 'din'
df2_filter = df2.filter(df2.first_name.endswith("din"))
df2_filter.show(5)

+---+----------+-------------+------+-----------+--------+---------+-----------+---------+
| id|first_name|    last_name|gender|       City|JobTitle|   Salary|   Latitude|Longitude|
+---+----------+-------------+------+-----------+--------+---------+-----------+---------+
|901|     Aldin|Matuszkiewicz|  Male|East London|Operator|$41468.83|-32.9549324|27.931913|
+---+----------+-------------+------+-----------+--------+---------+-----------+---------+



In [23]:
#filtering rows for which 'first_name' column starts with 'Alv'
df2_filter = df2.filter(df2.first_name.startswith("Alv"))
df2_filter.show(5)

+---+----------+---------+------+----------+--------------------+---------+----------+----------+
| id|first_name|last_name|gender|      City|            JobTitle|   Salary|  Latitude| Longitude|
+---+----------+---------+------+----------+--------------------+---------+----------+----------+
|  3|    Alvera| Di Boldi|Female|      NULL|                NULL|$57576.52|39.9947462|116.339775|
| 81|     Alvin|    Doman|  Male|      Niny|Research Assistant I|$53258.86|44.4868448| 43.940807|
|775|   Alverta| MacNulty|Female|Megalópoli| Geological Engineer|$17299.62| 37.401244| 22.136488|
+---+----------+---------+------+----------+--------------------+---------+----------+----------+



In [24]:
#filtering rows for which 'id' column contains values in-between 100 and 110
df2_filter = df2.filter(df2.id.between(100, 110))
df2_filter.show()

+---+----------+----------+------+----------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|            City|            JobTitle|   Salary|  Latitude| Longitude|
+---+----------+----------+------+----------------+--------------------+---------+----------+----------+
|100|      Aime|   Phlippi|Female|   Leshukonskoye|   Account Executive|$15428.49|64.8973466| 45.765175|
|101|     Alene|       Odd|Female|   María la Baja|Accounting Assist...|$37379.03| 9.9141597| -75.41116|
|102|    Olivia|  Tregidgo|Female|           Dahua| Assistant Professor|$28120.01| 23.736457|107.998146|
|103|Westbrooke|Itchingham|  Male|      Đông Thành|Structural Analys...|$23180.71|20.2639567| 105.97534|
|104|   Chantal| Darlaston|Female|          Kahama|Sales Representative|$27726.38| -3.837556| 32.593826|
|105|    Dorisa|  Screwton|Female|Kuala Terengganu| Clinical Specialist|$81003.76|  5.329576| 103.13691|
|106|      Kory|     Sayre|  Male|            Caen|  Of

In [25]:
#filtering rows for which 'first_name' column contains 'Dorisa'
df2_filter = df2.filter(df2.first_name.isin("Dorisa"))
df2_filter.show()

+---+----------+---------+------+----------------+-------------------+---------+--------+---------+
| id|first_name|last_name|gender|            City|           JobTitle|   Salary|Latitude|Longitude|
+---+----------+---------+------+----------------+-------------------+---------+--------+---------+
|105|    Dorisa| Screwton|Female|Kuala Terengganu|Clinical Specialist|$81003.76|5.329576|103.13691|
+---+----------+---------+------+----------------+-------------------+---------+--------+---------+



In [26]:
#extracting a substring from 'first_name' column
df2_substr = df2.select(df2.first_name, df2.first_name.substr(1, 5).alias("name"))
df2_substr.show()

+----------+-----+
|first_name| name|
+----------+-----+
|   Melinde|Melin|
|  Kimberly|Kimbe|
|    Alvera|Alver|
|   Shannon|Shann|
|  Sherwood|Sherw|
|     Maris|Maris|
|     Masha|Masha|
|   Goddart|Godda|
|      Roth| Roth|
|      Bran| Bran|
|    Kylynn|Kylyn|
|       Rey|  Rey|
|      Kerr| Kerr|
|    Mickie|Micki|
|    Kaspar|Kaspa|
|    Norbie|Norbi|
|    Claude|Claud|
|     Thain|Thain|
|  Tiffanie|Tiffa|
|    Ettore|Ettor|
+----------+-----+
only showing top 20 rows



In [27]:
#filtering with OR condition
# df2_filter = df2.filter(df2.first_name.isin("Alene", "Aime"))
# df2_filter = df2.filter(df2.City.like("%Pedra"))
df2_filter = df2.filter(df2.first_name.isin("Alene", "Aime") | (df2.City.like("%la Baja")))
df2_filter.show()

+---+----------+---------+------+-------------+--------------------+---------+----------+---------+
| id|first_name|last_name|gender|         City|            JobTitle|   Salary|  Latitude|Longitude|
+---+----------+---------+------+-------------+--------------------+---------+----------+---------+
|100|      Aime|  Phlippi|Female|Leshukonskoye|   Account Executive|$15428.49|64.8973466|45.765175|
|101|     Alene|      Odd|Female|María la Baja|Accounting Assist...|$37379.03| 9.9141597|-75.41116|
+---+----------+---------+------+-------------+--------------------+---------+----------+---------+



In [28]:
#filtering with 'and' condition
df2_filter = df2.filter((df2.id >= 10) & (df2.id <= 20))
df2_filter.show()

+---+----------+---------+------+--------------+--------------------+---------+----------+----------+
| id|first_name|last_name|gender|          City|            JobTitle|   Salary|  Latitude| Longitude|
+---+----------+---------+------+--------------+--------------------+---------+----------+----------+
| 10|      Bran|  Trahear|  Male|      Arbeláez|Mechanical System...|$68098.42|  4.272793|-74.416016|
| 11|    Kylynn|  Lockart|Female|      El Cardo|Nuclear Power Eng...|$13604.63|     -5.85| -79.88333|
| 12|       Rey|   Meharg|Female|   Wangqingtuo|Systems Administr...|$73423.70| 39.172378| 116.93161|
| 13|      Kerr|   Braden|  Male|     Sułkowice|Compensation Analyst|$33432.99|49.8151822| 19.377174|
| 14|    Mickie|Whanstall|  Male|   Springfield|Assistant Media P...|$50838.53|42.1014803|-72.576675|
| 15|    Kaspar|    Pally|  Male|        Chrást|  Analyst Programmer|$40163.03|49.7923299| 13.491532|
| 16|    Norbie|   Gwyllt|  Male|        Xijiao|              Editor|$32492.73|43.

In [29]:
#registering a temporary table
df2.registerTempTable("Original")



In [30]:
#SQL-query_1
query_1 = spark.sql(
    'select *\
    from original')
query_1.show(5)

+---+----------+----------+------+-------------+--------------------+---------+----------+----------+
| id|first_name| last_name|gender|         City|            JobTitle|   Salary|  Latitude| Longitude|
+---+----------+----------+------+-------------+--------------------+---------+----------+----------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|
|  3|    Alvera|  Di Boldi|Female|         NULL|                NULL|$57576.52|39.9947462|116.339775|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|
|  5|  Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|      NULL| 37.648994|
+---+----------+----------+------+-------------+--------------------+---------+----------+----------+
only showing top 5 rows



In [31]:
#SQL-query_2
query_2 = spark.sql(
    'select concat(first_name, " ", last_name) as full_name\
    from original\
    where gender = "Male"')
query_2.show()

+--------------------+
|           full_name|
+--------------------+
|   Shannon O'Griffin|
|    Sherwood Macieja|
|       Goddart Flear|
|     Roth O'Cannavan|
|        Bran Trahear|
|         Kerr Braden|
|    Mickie Whanstall|
|        Kaspar Pally|
|       Norbie Gwyllt|
|        Thain Habbon|
|     Ettore Gerriets|
|       Alon Chasteau|
|      Guthrey Johnke|
|       Jarvis Perone|
|Westleigh Belderf...|
|       Odell Morritt|
|      Base Summerlad|
|       Paddy Ashness|
|         Nicko Frays|
|      Hadrian Crumpe|
+--------------------+
only showing top 20 rows



In [32]:
#removing $-sign from 'Salary' column and creating a new column
df2_clean_salary = df2.withColumn("clean_salary", df2.Salary.substr(2, 100).cast('float'))
df2_clean_salary.show(5)

+---+----------+----------+------+-------------+--------------------+---------+----------+----------+------------+
| id|first_name| last_name|gender|         City|            JobTitle|   Salary|  Latitude| Longitude|clean_salary|
+---+----------+----------+------+-------------+--------------------+---------+----------+----------+------------+
|  1|   Melinde| Shilburne|Female|    Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|    57438.18|
|  2|  Kimberly|Von Welden|Female|       Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|     62846.6|
|  3|    Alvera|  Di Boldi|Female|         NULL|                NULL|$57576.52|39.9947462|116.339775|    57576.52|
|  4|   Shannon| O'Griffin|  Male|Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|    61489.23|
|  5|  Sherwood|   Macieja|  Male|    Mytishchi|            VP Sales|$63863.09|      NULL| 37.648994|    63863.09|
+---+----------+----------+------+-------------+--------------------+---------+-

In [33]:
#creating a new column for 'monthly_salary'
df2_monthly_salary = df2_clean_salary.withColumn('monthly_salary', df2_clean_salary.clean_salary/12)
df2_monthly_salary.show()

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+------------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|clean_salary|    monthly_salary|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+------------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|    57438.18| 4786.514973958333|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|     62846.6|    5237.216796875|
|  3|    Alvera|  Di Boldi|Female|           NULL|                NULL|$57576.52|39.9947462|116.339775|    57576.52| 4798.043294270833|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|    61489.23|   5124.1025390625|
|  5|  Sherwood|   Macieja|  Male|      Mytishch

In [34]:
#creating a new column 'are_they_female?'
df2_are_female = df2.withColumn("are_they_female?", when(df2.gender == "Female", "Yes").otherwise("No"))
df2_are_female.show(10)

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+----------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|are_they_female?|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+----------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|             Yes|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|             Yes|
|  3|    Alvera|  Di Boldi|Female|           NULL|                NULL|$57576.52|39.9947462|116.339775|             Yes|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|              No|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.648994|              No|
|  6|     Maris|      Folk|Femal

In [35]:
#creating a new column 'are_they_male?'
df2_are_male = df2.withColumn("are_they_male?", when(df2.gender == "Male", "Yes").otherwise("No"))
df2_are_male.show(10)

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+--------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|are_they_male?|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+--------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|            No|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|            No|
|  3|    Alvera|  Di Boldi|Female|           NULL|                NULL|$57576.52|39.9947462|116.339775|            No|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|           Yes|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.648994|           Yes|
|  6|     Maris|      Folk|Female|Kinsealy-Drina

In [36]:
#creating a new column 'clean_salary' by removing $-sign from 'Salary' column
df2 = df2.withColumn("clean_salary", df2.Salary.substr(2, 100).cast("float"))
df2.show(10)

+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+
| id|first_name| last_name|gender|           City|            JobTitle|   Salary|  Latitude| Longitude|clean_salary|
+---+----------+----------+------+---------------+--------------------+---------+----------+----------+------------+
|  1|   Melinde| Shilburne|Female|      Nowa Ruda| Assistant Professor|$57438.18|50.5774075| 16.496717|    57438.18|
|  2|  Kimberly|Von Welden|Female|         Bulgan|       Programmer II|$62846.60|48.8231572| 103.52182|     62846.6|
|  3|    Alvera|  Di Boldi|Female|           NULL|                NULL|$57576.52|39.9947462|116.339775|    57576.52|
|  4|   Shannon| O'Griffin|  Male|  Divnomorskoye|Budget/Accounting...|$61489.23|44.5047212| 38.130016|    61489.23|
|  5|  Sherwood|   Macieja|  Male|      Mytishchi|            VP Sales|$63863.09|      NULL| 37.648994|    63863.09|
|  6|     Maris|      Folk|Female|Kinsealy-Drinan|      Civil En

In [37]:
#applying 'groupby' condition for gender-wise total_salary
import pyspark.sql.functions as sqlfunc
df2_groupby = df2.groupby("gender").agg(sqlfunc.sum(df2.clean_salary).alias("total_salary"))
df2_groupby.show()

+------+--------------------+
|gender|        total_salary|
+------+--------------------+
|Female|2.7364519950195312E7|
|  Male|2.8123435678710938E7|
+------+--------------------+



In [38]:
#applying 'groupby' condition for gender-wise total_salary, average_salary, min_salary and max_salary
df2_groupby_additional = df2.groupby("gender").agg(sqlfunc.sum(df2.clean_salary).alias("total_salary"),
                                                   sqlfunc.avg(df2.clean_salary).alias("avg_salary"),
                                                   sqlfunc.min(df2.clean_salary).alias("min_salary"),
                                                   sqlfunc.max(df2.clean_salary).alias("max_salary"))
df2_groupby_additional.show()

+------+--------------------+-----------------+----------+----------+
|gender|        total_salary|       avg_salary|min_salary|max_salary|
+------+--------------------+-----------------+----------+----------+
|Female|2.7364519950195312E7|55618.94298820185|  10616.44|  99948.28|
|  Male|2.8123435678710938E7|55361.09385573019|  10101.92|  99942.92|
+------+--------------------+-----------------+----------+----------+



In [39]:
#applying 'groupby' condition for gender-wise and city-wise total_salary, average_salary, min_salary and max_salary
df2_groupby_additional_1 = df2.groupby("gender", "City").agg(sqlfunc.sum(df2.clean_salary).alias("total_salary"),
                                                   sqlfunc.avg(df2.clean_salary).alias("avg_salary"),
                                                   sqlfunc.min(df2.clean_salary).alias("min_salary"),
                                                   sqlfunc.max(df2.clean_salary).alias("max_salary"))
df2_groupby_additional_1.show()

+------+-----------------+----------------+----------------+----------+----------+
|gender|             City|    total_salary|      avg_salary|min_salary|max_salary|
+------+-----------------+----------------+----------------+----------+----------+
|Female|           Dachun| 25090.869140625| 25090.869140625|  25090.87|  25090.87|
|Female|      Trollhättan|106623.369140625|53311.6845703125|  26830.47|   79792.9|
|  Male|          Wenshao| 18941.509765625| 18941.509765625|  18941.51|  18941.51|
|Female|            Lanas| 13765.900390625| 13765.900390625|   13765.9|   13765.9|
|  Male|            Mörön|    77940.078125|    77940.078125|  77940.08|  77940.08|
|Female|             Same|   73369.7265625|   73369.7265625|  73369.73|  73369.73|
|Female|          Sawahan|  24608.83984375|  24608.83984375|  24608.84|  24608.84|
|  Male|Monte da Boavista|     98586.71875|     98586.71875|  98586.72|  98586.72|
|Female|         Nusajaya|    71637.921875|    71637.921875|  71637.92|  71637.92|
|Fem