# Spark 101
Curriculum Notes

In [11]:
# standard python imports
import numpy as np
import pandas as pd

# importing pyspark (the package that provides SparkSession)
import pyspark

# data imports library
from pydataset import data

In [4]:
# get the already-running SparkSession if there is one, otherwise start a new one
spark = (
    pyspark.sql.SparkSession
    .builder
    .getOrCreate()
)

# seed NumPy's global random generator so the random groups below are reproducible
np.random.seed(456)

# build a 20-row pandas df: `n` counts 0..19, `group` is a random letter from a/b/c
df_pd = pd.DataFrame({
    'n': np.arange(20),
    'group': np.random.choice(['a', 'b', 'c'], size=20),
})

# display the df
df_pd

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [5]:
# converting the pandas df to a spark df
df_spark = spark.createDataFrame(df_pd)
# echoing the variable displays only the column names and types, not the rows
df_spark

DataFrame[n: bigint, group: string]

In [6]:
# viewing the rows of the spark df: .show() prints them as a text table
df_spark.show()

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
|  5|    c|
|  6|    a|
|  7|    b|
|  8|    a|
|  9|    b|
| 10|    b|
| 11|    a|
| 12|    b|
| 13|    a|
| 14|    b|
| 15|    b|
| 16|    c|
| 17|    c|
| 18|    a|
| 19|    c|
+---+-----+



In [8]:
# using .describe() to get summary stats
# the result is itself a spark DataFrame (with string-typed columns), so no numbers are printed yet
df_spark.describe()

DataFrame[summary: string, n: string, group: string]

In [10]:
# viewing results: .show() on the summary DataFrame prints count, mean, stddev, min and max
# mean and stddev come back null for the string column `group`
df_spark.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [12]:
# loading the `mpg` dataset with pydataset's data() and writing it to a spark df
spark_mpg = spark.createDataFrame(data('mpg'))
# previewing only the first 5 rows
spark_mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



### .select( ) allows us to use our df column series

In [14]:
# selecting a subset of columns; a column can be referenced with
# bracket syntax (spark_mpg['class']) or attribute syntax (spark_mpg.cty)
# `class` is a Python keyword, so it can only be reached with bracket syntax
# the result is a new DataFrame, so only its column names and types are displayed
spark_mpg.select(spark_mpg['class'], 
                spark_mpg.cty,
                spark_mpg.hwy)

DataFrame[class: string, cty: bigint, hwy: bigint]

In [15]:
# showing results: same three-column selection, with .show(5) printing its first 5 rows
spark_mpg.select(spark_mpg['class'], 
                spark_mpg.cty,
                spark_mpg.hwy).show(5)

+-------+---+---+
|  class|cty|hwy|
+-------+---+---+
|compact| 18| 29|
|compact| 21| 29|
|compact| 20| 31|
|compact| 21| 30|
|compact| 16| 26|
+-------+---+---+
only showing top 5 rows



In [26]:
# performing operations on certain columns: comparing a column to a value gives a
# boolean column expression that can be passed to .select()
# the resulting column is named after the expression itself, "(year > 2000)"
spark_mpg.select(spark_mpg.year > 2000).show()

+-------------+
|(year > 2000)|
+-------------+
|        false|
|        false|
|         true|
|         true|
|        false|
|        false|
|         true|
|        false|
|        false|
|         true|
|         true|
|        false|
|        false|
|         true|
|         true|
|        false|
|         true|
|         true|
|         true|
|         true|
+-------------+
only showing top 20 rows



In [24]:
# selecting only the `class` column
# NOTE(review): no .alias() is applied here, so the column keeps its original name `class`;
# renaming would need e.g. spark_mpg['class'].alias(<new name>) inside the select
spark_mpg.select(spark_mpg['class'])

DataFrame[class: string]