In [1]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
import multiprocessing
import pyspark

nprocs = multiprocessing.cpu_count()

spark = (pyspark.sql.SparkSession.builder
 .master('local')
 .config('spark.jars.packages', 'mysql:mysql-connector-java:8.0.16')
 .config('spark.driver.memory', '4G')
 .config('spark.driver.cores', nprocs)
 .config('spark.sql.shuffle.partitions', nprocs)
 .appName('MySparkApplication')
 .getOrCreate())

In [3]:
spark.range(10).show()

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
|  5|
|  6|
|  7|
|  8|
|  9|
+---+



In [4]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [5]:
import pandas as pd
import numpy as np

np.random.seed(456)

pandas_dataframe = pd.DataFrame(
    dict(n=np.arange(20), group=np.random.choice(list("abc"), 20))
)
pandas_dataframe

Unnamed: 0,n,group
0,0,b
1,1,b
2,2,c
3,3,a
4,4,c
5,5,c
6,6,a
7,7,b
8,8,a
9,9,b


In [6]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: bigint, group: string]

In [7]:
df.show(5)

+---+-----+
|  n|group|
+---+-----+
|  0|    b|
|  1|    b|
|  2|    c|
|  3|    a|
|  4|    c|
+---+-----+
only showing top 5 rows



In [9]:
df.describe().show()

+-------+-----------------+-----+
|summary|                n|group|
+-------+-----------------+-----+
|  count|               20|   20|
|   mean|              9.5| null|
| stddev|5.916079783099616| null|
|    min|                0|    a|
|    max|               19|    c|
+-------+-----------------+-----+



In [11]:
from pydataset import data

mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



### Spark AP Mini exercises

In [12]:
#Use the starter code above to create a pandas dataframe.
import pandas as pd
import numpy as np

np.random.seed(13)

pandas_dataframe = pd.DataFrame(
    {
        "n": np.random.randn(20),
        "group": np.random.choice(list("xyz"), 20),
        "abool": np.random.choice([True, False], 20),
    }
)

In [14]:
pandas_dataframe.head(
)

Unnamed: 0,n,group,abool
0,-0.712391,z,False
1,0.753766,x,False
2,-0.044503,z,False
3,0.451812,y,False
4,1.345102,z,False


In [15]:
df = spark.createDataFrame(pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [16]:
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [18]:
#Show the first 3 rows of the dataframe.
df.show(3)


+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



In [19]:
#Show the first 7 rows of the dataframe.
df.show(7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [21]:
#View a summary of the data using .describe.
df.describe().show()

+-------+-------------------+-----+
|summary|                  n|group|
+-------+-------------------+-----+
|  count|                 20|   20|
|   mean|0.36640264498852165| null|
| stddev| 0.8905322898155364| null|
|    min| -1.261605945319069|    x|
|    max| 2.1503829673811126|    z|
+-------+-------------------+-----+



In [22]:
#Use .select to create a new dataframe with just the n and abool columns. View the first 5 rows of this dataframe.
df.select(df.n, df.abool).show(5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [23]:
#Use .select to create a new dataframe with just the group and abool columns. View the first 5 rows of this dataframe.
df.select(df.group, df.abool).show(5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [24]:
#Use .select to create a new dataframe with the group column and the abool column renamed to a_boolean_value. Show the first 3 rows of this dataframe.
df.select(df.group, df.abool.alias("a_boolean_value")).show(3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



In [25]:
#Use .select to create a new dataframe with the group column and the n column renamed to a_numeric_value. Show the first 6 rows of this dataframe.
df.select(df.group, df.n.alias("a_numeric_value")).show(6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



In [26]:
#Column Manipulation
#Use the starter code above to re-create a spark dataframe. Store the spark dataframe in a varaible named df

#Use .select to add 4 to the n column. Show the results.

#Subtract 5 from the n column and view the results.
df.show(4)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
+--------------------+-----+-----+
only showing top 4 rows



In [28]:
df.select(df.n+4).show(5)

+------------------+
|           (n + 4)|
+------------------+
|3.2876093379494122|
| 4.753766378659703|
|3.9554969216619464|
|  4.45181233874579|
|5.3451017084510095|
+------------------+
only showing top 5 rows



In [30]:
df.select(df.n-5).show()

+-------------------+
|            (n - 5)|
+-------------------+
| -5.712390662050588|
| -4.246233621340297|
| -5.044503078338053|
|  -4.54818766125421|
|-3.6548982915489905|
| -4.467662111705454|
|-3.6498121002774733|
|  -4.13878862583068|
| -3.521314262564103|
| -6.045377130538534|
| -5.788989024951549|
| -6.261605945319069|
| -4.437153214718968|
| -5.243326251885563|
| -4.086259295140323|
| -4.682649077263664|
| -4.872696719793019|
|-2.8496170326188874|
| -4.393711343103702|
| -5.026771649986441|
+-------------------+



In [36]:
#Multiply the n column by 2. View the results along with the original numbers.

col1= df.n.alias("N")

col2 = (df.n*2).alias("NN")

df.select(col1, col2).show(5)

+--------------------+--------------------+
|                   N|                  NN|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
+--------------------+--------------------+
only showing top 5 rows



In [37]:
#Add a new column named n2 that is the n value multiplied by -1. Show the first 4 rows of your dataframe. You should see the original n value as well as n2.
col3= (df.n*-1).alias('n2')
df.select(col1, col2, col3).show(4)

+--------------------+--------------------+--------------------+
|                   N|                  NN|                  n2|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|   0.712390662050588|
|   0.753766378659703|   1.507532757319406|  -0.753766378659703|
|-0.04450307833805...|-0.08900615667610691|0.044503078338053455|
| 0.45181233874578974|  0.9036246774915795|-0.45181233874578974|
+--------------------+--------------------+--------------------+
only showing top 4 rows



In [40]:
#Add a new column named n3 that is the n value squared. Show the first 5 rows of your dataframe. You should see both n, n2, and n3.
col4= (df.n**2).alias("n3")
df.select(col1, col2, col3, col4).show(5)

+--------------------+--------------------+--------------------+--------------------+
|                   N|                  NN|                  n2|                  n3|
+--------------------+--------------------+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|   1.507532757319406|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|-0.08900615667610691|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|  0.9036246774915795|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097|  2.6902034169020195| -1.3451017084510097|  1.8092986060778251|
+--------------------+--------------------+--------------------+--------------------+
only showing top 5 rows



In [41]:
#What happens when you run the code below?
df.group + df.abool

Column<b'(group + abool)'>

In [42]:
#What happens when you run the code below? What is the difference between this and the previous code sample?
df.select(df.group + df.abool)

AnalysisException: "cannot resolve '(CAST(`group` AS DOUBLE) + `abool`)' due to data type mismatch: differing types in '(CAST(`group` AS DOUBLE) + `abool`)' (double and boolean).;;\n'Project [(cast(group#324 as double) + abool#325) AS (group + abool)#723]\n+- LogicalRDD [n#323, group#324, abool#325], false\n"

In [44]:
#Type casting
#Use the starter code above to re-create a spark dataframe.
df.show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [47]:
#Use .printSchema to view the datatypes in your dataframe.
df.printSchema

<bound method DataFrame.printSchema of DataFrame[n: double, group: string, abool: boolean]>

In [51]:
#Use .dtypes to view the datatypes in your dataframe.
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]