In [10]:
import pyspark
import pydataset

In [11]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [12]:
spark

In [13]:
import pandas as pd
import numpy as np

np.random.seed(456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [14]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: bigint, group: string]

In [15]:
df.show(5)

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
+---+-----+
only showing top 5 rows



In [18]:
df.describe()

DataFrame[summary: string, n: string, group: string]

In [9]:
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [19]:
from pydataset import data

mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [27]:
data()

Unnamed: 0,dataset_id,title
0,AirPassengers,Monthly Airline Passenger Numbers 1949-1960
1,BJsales,Sales Data with Leading Indicator
2,BOD,Biochemical Oxygen Demand
3,Formaldehyde,Determination of Formaldehyde
4,HairEyeColor,Hair and Eye Color of Statistics Students
5,InsectSprays,Effectiveness of Insect Sprays
6,JohnsonJohnson,Quarterly Earnings per Johnson & Johnson Share
7,LakeHuron,Level of Lake Huron 1875-1972
8,LifeCycleSavings,Intercountry Life-Cycle Savings Data
9,Nile,Flow of the River Nile


In [25]:
help(pydataset)

Help on package pydataset:

NAME
    pydataset

DESCRIPTION
    # __init__.py
    # main interface to pydataset module

PACKAGE CONTENTS
    datasets_handler
    dump_data
    locate_datasets
    support
    utils (package)

FUNCTIONS
    data(item=None, show_doc=False)
        loads a datasaet (from in-modules datasets) in a dataframe data structure.
        
        Args:
            item (str)      : name of the dataset to load.
            show_doc (bool) : to show the dataset's documentation.
        
        Examples:
        
        >>> iris = data('iris')
        
        
        >>> data('titanic', show_doc=True)
            : returns the dataset's documentation.
        
        >>> data()
            : like help(), returns a dataframe [Item, Title]
            for a list of the available datasets.

FILE
    /usr/local/anaconda3/lib/python3.7/site-packages/pydataset/__init__.py




In [20]:
mpg.hwy

Column<b'hwy'>

In [21]:
mpg.select(mpg.hwy, mpg.cty, mpg.model)

DataFrame[hwy: bigint, cty: bigint, model: string]

In [22]:
mpg.select(mpg.hwy, mpg.cty, mpg.model).show(10)

+---+---+----------+
|hwy|cty|     model|
+---+---+----------+
| 29| 18|        a4|
| 29| 21|        a4|
| 31| 20|        a4|
| 30| 21|        a4|
| 26| 16|        a4|
| 26| 18|        a4|
| 27| 18|        a4|
| 26| 18|a4 quattro|
| 25| 16|a4 quattro|
| 28| 20|a4 quattro|
+---+---+----------+
only showing top 10 rows



In [23]:
pydataset.data('mpg')

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact
6,audi,a4,2.8,1999,6,manual(m5),f,18,26,p,compact
7,audi,a4,3.1,2008,6,auto(av),f,18,27,p,compact
8,audi,a4 quattro,1.8,1999,4,manual(m5),4,18,26,p,compact
9,audi,a4 quattro,1.8,1999,4,auto(l5),4,16,25,p,compact
10,audi,a4 quattro,2.0,2008,4,manual(m6),4,20,28,p,compact


# Exercises

Create a directory named spark within your ds-methodologies repository. This is where you will do the exercises for this module.

Create a jupyter notebook or python script named spark101 for this exercise.

1. Create a spark data frame that contains your favorite programming languages.

    - The name of the column should be language
    - View the schema of the dataframe
    - Output the shape of the dataframe
    - Show the first 5 records in the dataframe


2. Load the mpg dataset as a spark dataframe.

    - Create 1 column of output that contains a message like the one below:
             
            The 1999 audi a4 has a 4 cylinder engine.
            For each vehicle.
            
    - Transform the trans column so that it only contains either manual or auto. 
  

3. Load the tips dataset as a spark dataframe.

    - What percentage of observations are smokers?
    - Create a column that contains the tip percentage
    - Calculate the average tip percentage for each combination of sex and smoker.


4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

   - Convert the temperatures to farenheight.
   - Which month has the most rain, on average?
   - Which year was the windiest?
   - What is the most frequent type of weather in January?
   - What is the average high and low tempurature on sunny days in July in 2013 and 2014?
   - What percentage of days were rainy in q3 of 2015?
   - For each year, find what percentage of days it rained (had non-zero precipitation).

In [16]:
import pyspark
import pandas as pd

In [17]:
spark = pyspark.sql.SparkSession.builder.getOrCreate()

1. Create a spark data frame that contains your favorite programming languages.

    - The name of the column should be language
    - View the schema of the dataframe
    - Output the shape of the dataframe
    - Show the first 5 records in the dataframe

In [37]:
data = ['python', 'java', 'jupyter', 'C++', 'C#', 'clojure']

In [46]:
pandas_dataframe = pd.DataFrame(pd.Series(data), columns=['language'])
pandas_dataframe

Unnamed: 0,language
0,python
1,java
2,jupyter
3,C++
4,C#
5,clojure


In [51]:
df = spark.createDataFrame(pandas_dataframe)
df.show(5)

+--------+
|language|
+--------+
|  python|
|    java|
| jupyter|
|     C++|
|      C#|
+--------+
only showing top 5 rows



2. Load the mpg dataset as a spark dataframe.

    - Create 1 column of output that contains a message like the one below:
             
            The 1999 audi a4 has a 4 cylinder engine.
            For each vehicle.
            
    - Transform the trans column so that it only contains either manual or auto. 

In [72]:
from pydataset import data

In [73]:
mpg = data('mpg')

In [74]:
mpg.head()

Unnamed: 0,manufacturer,model,displ,year,cyl,trans,drv,cty,hwy,fl,class
1,audi,a4,1.8,1999,4,auto(l5),f,18,29,p,compact
2,audi,a4,1.8,1999,4,manual(m5),f,21,29,p,compact
3,audi,a4,2.0,2008,4,manual(m6),f,20,31,p,compact
4,audi,a4,2.0,2008,4,auto(av),f,21,30,p,compact
5,audi,a4,2.8,1999,6,auto(l5),f,16,26,p,compact


In [75]:
mpg = spark.createDataFrame(mpg)

In [76]:
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [77]:
mpg.printSchema()

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)



In [78]:
mpg.count(), len(mpg.columns)

(234, 11)

In [81]:
mpg.select(concat(
    lit('The')
    col('year')
    lit(' ')
    col('manufacturer')
    lit(' ')
    col('model')
    lit(' has a ')
    col('cyl')
    lit(' cylinder engine'),
).alias('vehicle_cylinder_description')).show(truncate=False)

SyntaxError: invalid syntax (<ipython-input-81-9597f77f2d0b>, line 3)

3. Load the tips dataset as a spark dataframe.

    - What percentage of observations are smokers?
    - Create a column that contains the tip percentage
    - Calculate the average tip percentage for each combination of sex and smoker.

In [82]:
tips = spark.createDataFrame(pydataset.data('tips'))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

   - Convert the temperatures to farenheight.
   - Which month has the most rain, on average?
   - Which year was the windiest?
   - What is the most frequent type of weather in January?
   - What is the average high and low tempurature on sunny days in July in 2013 and 2014?
   - What percentage of days were rainy in q3 of 2015?
   - For each year, find what percentage of days it rained (had non-zero precipitation).

In [83]:
from vega_datasets import data

weather = data.seattle_weather().assign(date=lambda df: df.date.astype(str))
weather = spark.createDataFrame(weather)
weather.show(6)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
|2012-01-06|          2.5|     4.4|     2.2| 2.2|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 6 rows

