In [4]:
import pyspark

import pandas as pd
import numpy as np

from pydataset import data

In [105]:
from pyspark.sql.functions import col, expr

In [5]:
# creating our spark object

spark = pyspark.sql.SparkSession.builder.getOrCreate()

1. Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe


In [16]:
language_df = spark.createDataFrame(pd.DataFrame({'language': ['python', 'R', 'java','javascript', 'scala', 'c++', 'sas', 'ruby'],
                                                  'score': [10,7,6,4,3,5,2,1]}))

In [17]:
language_df.show()

+----------+-----+
|  language|score|
+----------+-----+
|    python|   10|
|         R|    7|
|      java|    6|
|javascript|    4|
|     scala|    3|
|       c++|    5|
|       sas|    2|
|      ruby|    1|
+----------+-----+



In [18]:
language_df.printSchema()

root
 |-- language: string (nullable = true)
 |-- score: long (nullable = true)



In [20]:
# one way to get the shape of the data. We can see that there are
# 8 observations and 2 columns
language_df.describe().show()

+-------+--------+------------------+
|summary|language|             score|
+-------+--------+------------------+
|  count|       8|                 8|
|   mean|    null|              4.75|
| stddev|    null|2.9154759474226504|
|    min|       R|                 1|
|    max|   scala|                10|
+-------+--------+------------------+



In [21]:
# we can also use another function to get the length of the dataframe
language_df.count()

8

In [22]:
# lets only see the first five observations

language_df.show(5)

+----------+-----+
|  language|score|
+----------+-----+
|    python|   10|
|         R|    7|
|      java|    6|
|javascript|    4|
|     scala|    3|
+----------+-----+
only showing top 5 rows



2. Load the mpg dataset as a spark dataframe.

    - Create 1 column of output that contains a message like the one below:

        - The 1999 audi a4 has a 4 cylinder engine.
    - For each vehicle.

        - Transform the trans column so that it only contains either manual or auto.



In [23]:
mpg = spark.createDataFrame(data('mpg'))

In [85]:
from pyspark.sql.functions import when



In [75]:
from pyspark.sql.functions import lit
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean

In [86]:
# Create 1 column of output that contains a message like the one below:
# The 1999 audi a4 has a 4 cylinder engine.

mpg.select(concat(lit('The '), mpg.year, lit(' '), mpg.manufacturer,lit(' '), mpg.model,lit(' '), lit('has '), mpg.cyl, lit('cylinder engine'))).alias('Final').show(1)

+----------------------------------------------------------------------------+
|concat(The , year,  , manufacturer,  , model,  , has , cyl, cylinder engine)|
+----------------------------------------------------------------------------+
|                                                        The 1999 audi a4 ...|
+----------------------------------------------------------------------------+
only showing top 1 row



In [87]:
mpg.show()

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [88]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [95]:
#For each vehicle.

# Transform the trans column so that it only contains either manual or auto.

mpg.select( 'manufacturer', 'model', 'displ', 'year', 'cyl', 'trans', regexp_extract('trans', r"(\w+)", 1)\
           .alias('transmission'), 'drv', 'cty', 'hwy', 'fl', 'class').show()

+------------+------------------+-----+----+---+----------+------------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|transmission|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+------------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|        auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|      manual|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|      manual|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|        auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|        auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|      manual|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|        auto|  f| 18| 27|  p|compact|
|        audi|        a4 quatt

3. Load the tips dataset as a spark dataframe.

    - What percentage of observations are smokers?
    - Create a column that contains the tip percentage
    - Calculate the average tip percentage for each combination of sex and smoker.

In [96]:
tips = spark.createDataFrame(data('tips'))

In [98]:
tips.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [101]:
# percentage of smoker 

(tips.filter(tips.smoker == 'Yes').count()/tips.count()) * 100

38.114754098360656

In [110]:
# creating a separate column for tip percentage

tips.select(expr('(tip/total_bill)*100').alias('tip_percentage')).show()

+------------------+
|    tip_percentage|
+------------------+
|5.9446733372572105|
|16.054158607350097|
|16.658733936220845|
| 13.97804054054054|
|14.680764538430255|
| 18.62396204033215|
| 22.80501710376283|
|11.607142857142858|
|13.031914893617023|
|21.853856562922868|
| 16.65043816942551|
|14.180374361883155|
|10.181582360570687|
|16.277807921866522|
|20.364126770060686|
|18.164967562557923|
| 16.16650532429816|
|22.774708410067525|
|20.624631703005306|
|16.222760290556902|
+------------------+
only showing top 20 rows

