## Spark API mini exercises

In [4]:
import pandas as pd
import numpy as np
import pyspark
np.random.seed(13)

In [2]:
# Synthetic 20-row dataset: a standard-normal float column, a random x/y/z
# label, and a random boolean flag. The draw order (n, group, abool) matters
# because all three pull from NumPy's global random state.
row_count = 20
n_col = np.random.randn(row_count)
group_col = np.random.choice(list("xyz"), row_count)
abool_col = np.random.choice([True, False], row_count)

pandas_dataframe = pd.DataFrame(
    {"n": n_col, "group": group_col, "abool": abool_col}
)

In [6]:
# Obtain a SparkSession through the builder's getOrCreate().
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

# Convert the pandas frame to a Spark DataFrame; the bare name on the last
# line displays its column names and types.
df = spark.createDataFrame(data=pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

---

## 1.

In [7]:
# Print only the first three rows as a formatted text table.
df.show(n=3)

[Stage 0:>                                                          (0 + 1) / 1]

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
+--------------------+-----+-----+
only showing top 3 rows



                                                                                

In [8]:
# Print the first seven rows as a formatted text table.
df.show(n=7)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
|  0.5323378882945463|    y|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 7 rows



In [9]:
# Called without an argument, head() hands back one Row object, not a list.
df.head()

Row(n=-0.712390662050588, group='z', abool=False)

In [10]:
# With an explicit count, head() yields a Python list of Row objects.
df.head(n=3)

[Row(n=-0.712390662050588, group='z', abool=False),
 Row(n=0.753766378659703, group='x', abool=False),
 Row(n=-0.044503078338053455, group='z', abool=False)]

In [11]:
# First seven rows, returned to Python as a list of Row objects.
df.head(n=7)

[Row(n=-0.712390662050588, group='z', abool=False),
 Row(n=0.753766378659703, group='x', abool=False),
 Row(n=-0.044503078338053455, group='z', abool=False),
 Row(n=0.45181233874578974, group='y', abool=False),
 Row(n=1.3451017084510097, group='z', abool=False),
 Row(n=0.5323378882945463, group='y', abool=False),
 Row(n=1.3501878997225267, group='z', abool=False)]

- `.show(n)` prints the first `n` rows as a formatted table
- `.head()` with no argument returns a single `Row`; `.head(n)` returns a list of `Row` objects

In [14]:
# Summary statistics (count, mean, stddev, min, max). The output below covers
# `n` and `group` but has no column for the boolean `abool`.
df.describe().show()

+-------+------------------+-----+
|summary|                 n|group|
+-------+------------------+-----+
|  count|                20|   20|
|   mean|0.3664026449885216| null|
| stddev|0.8905322898155363| null|
|    min|-1.261605945319069|    x|
|    max|2.1503829673811126|    z|
+-------+------------------+-----+



In [15]:
# Keep only the numeric and boolean columns and preview five rows.
df.select(df.n, df.abool).show(n=5)

+--------------------+-----+
|                   n|abool|
+--------------------+-----+
|  -0.712390662050588|false|
|   0.753766378659703|false|
|-0.04450307833805...|false|
| 0.45181233874578974|false|
|  1.3451017084510097|false|
+--------------------+-----+
only showing top 5 rows



In [16]:
# Keep only the label and boolean columns and preview five rows.
df.select(df.group, df.abool).show(n=5)

+-----+-----+
|group|abool|
+-----+-----+
|    z|false|
|    x|false|
|    z|false|
|    y|false|
|    z|false|
+-----+-----+
only showing top 5 rows



In [19]:
# alias() renames `abool` in this projection only; `df` itself is unchanged.
df.select(
    df.group,
    df.abool.alias('a_boolean_value'),
).show(n=3)

+-----+---------------+
|group|a_boolean_value|
+-----+---------------+
|    z|          false|
|    x|          false|
|    z|          false|
+-----+---------------+
only showing top 3 rows



In [21]:
# alias() renames `n` in this projection only; `df` itself is unchanged.
df.select(
    df.group,
    df.n.alias('a_numeric_value'),
).show(n=6)

+-----+--------------------+
|group|     a_numeric_value|
+-----+--------------------+
|    z|  -0.712390662050588|
|    x|   0.753766378659703|
|    z|-0.04450307833805...|
|    y| 0.45181233874578974|
|    z|  1.3451017084510097|
|    y|  0.5323378882945463|
+-----+--------------------+
only showing top 6 rows



---

## 2.

In [22]:
# Rebuild the Spark DataFrame from the pandas data for this section and
# display its schema summary.
df = spark.createDataFrame(data=pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [24]:
# Arithmetic on a Column produces a new Column; Spark names it "(n + 4)".
df.select('n', df['n'] + 4).show(n=5)

+--------------------+------------------+
|                   n|           (n + 4)|
+--------------------+------------------+
|  -0.712390662050588|3.2876093379494122|
|   0.753766378659703| 4.753766378659703|
|-0.04450307833805...|3.9554969216619464|
| 0.45181233874578974|  4.45181233874579|
|  1.3451017084510097|5.3451017084510095|
+--------------------+------------------+
only showing top 5 rows



In [25]:
# Subtract a constant from every value; the derived column is named "(n - 5)".
df.select('n', df['n'] - 5).show(n=5)

+--------------------+-------------------+
|                   n|            (n - 5)|
+--------------------+-------------------+
|  -0.712390662050588| -5.712390662050588|
|   0.753766378659703| -4.246233621340297|
|-0.04450307833805...| -5.044503078338053|
| 0.45181233874578974|  -4.54818766125421|
|  1.3451017084510097|-3.6548982915489905|
+--------------------+-------------------+
only showing top 5 rows



In [26]:
# Double every value; the derived column is named "(n * 2)".
df.select('n', df['n'] * 2).show(n=5)

+--------------------+--------------------+
|                   n|             (n * 2)|
+--------------------+--------------------+
|  -0.712390662050588|  -1.424781324101176|
|   0.753766378659703|   1.507532757319406|
|-0.04450307833805...|-0.08900615667610691|
| 0.45181233874578974|  0.9036246774915795|
|  1.3451017084510097|  2.6902034169020195|
+--------------------+--------------------+
only showing top 5 rows



In [27]:
# n2 is n with its sign flipped (n * -1).
df.select(
    'n',
    (df['n'] * -1).alias('n2'),
).show(n=4)

+--------------------+--------------------+
|                   n|                  n2|
+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|
|   0.753766378659703|  -0.753766378659703|
|-0.04450307833805...|0.044503078338053455|
| 0.45181233874578974|-0.45181233874578974|
+--------------------+--------------------+
only showing top 4 rows



In [29]:
# n2 is n with its sign flipped; n3 is n squared (n * n), not n cubed.
df.select(
    'n',
    (df['n'] * -1).alias('n2'),
    (df['n'] * df['n']).alias('n3'),
).show(n=5)

+--------------------+--------------------+--------------------+
|                   n|                  n2|                  n3|
+--------------------+--------------------+--------------------+
|  -0.712390662050588|   0.712390662050588|   0.507500455376875|
|   0.753766378659703|  -0.753766378659703|  0.5681637535977627|
|-0.04450307833805...|0.044503078338053455|0.001980523981562...|
| 0.45181233874578974|-0.45181233874578974| 0.20413438944294027|
|  1.3451017084510097| -1.3451017084510097|  1.8092986060778251|
+--------------------+--------------------+--------------------+
only showing top 5 rows



In [30]:
# Adding the string column to the boolean column only builds a Column
# expression here (see the Column<...> repr below); no rows are processed.
df.group + df.abool

Column<'(group + abool)'>

In [32]:
# Left disabled on purpose: selecting the expression forces Spark to resolve
# string + boolean against the schema.
# NOTE(review): presumably this raises an AnalysisException (data type
# mismatch) — confirm by running it, and consider catching and printing the error.
#df.select(df.group + df.abool)

## 3.

In [33]:
# Rebuild the Spark DataFrame from the pandas data for this section and
# display its schema summary.
df = spark.createDataFrame(data=pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [37]:
# Print the schema as a tree: column name, Spark type, and nullability.
df.printSchema()

root
 |-- n: double (nullable = true)
 |-- group: string (nullable = true)
 |-- abool: boolean (nullable = true)



In [38]:
# Column types as a list of (column name, type string) pairs.
df.dtypes

[('n', 'double'), ('group', 'string'), ('abool', 'boolean')]

In [39]:
# cast() on its own only describes the conversion; the result is a Column
# expression, not converted data.
df['abool'].cast('int')

Column<'CAST(abool AS INT)'>

In [41]:
# Boolean to int: the false values in the first five rows come out as 0.
df.select(df['abool'].cast('int')).show(n=5)

+-----+
|abool|
+-----+
|    0|
|    0|
|    0|
|    0|
|    0|
+-----+
only showing top 5 rows



In [46]:
# String to int on the letter labels: every value in the output below is null.
df.select(df['group'].cast('int')).show(n=5)

+-----+
|group|
+-----+
| null|
| null|
| null|
| null|
| null|
+-----+
only showing top 5 rows



In [48]:
# Double to int drops the fractional part in the rows shown (-0.71 -> 0,
# 1.35 -> 1). Both output columns end up named `n`.
df.select('n', df['n'].cast('int')).show(n=5)

+--------------------+---+
|                   n|  n|
+--------------------+---+
|  -0.712390662050588|  0|
|   0.753766378659703|  0|
|-0.04450307833805...|  0|
| 0.45181233874578974|  0|
|  1.3451017084510097|  1|
+--------------------+---+
only showing top 5 rows



In [50]:
# Boolean to string: the values print the same, but the second column holds
# text. Both output columns end up named `abool`.
df.select('abool', df['abool'].cast('string')).show(n=5)

+-----+-----+
|abool|abool|
+-----+-----+
|false|false|
|false|false|
|false|false|
|false|false|
|false|false|
+-----+-----+
only showing top 5 rows



## 4.

In [51]:
# Rebuild the Spark DataFrame from the pandas data for this section and
# display its schema summary.
df = spark.createDataFrame(data=pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [52]:
import pyspark.sql.functions as f

In [55]:
# Largest value of n, returned as a one-row, one-column DataFrame.
df.select(f.max('n')).show()

+------------------+
|            max(n)|
+------------------+
|2.1503829673811126|
+------------------+



In [56]:
# Smallest value of n, returned as a one-row, one-column DataFrame.
df.select(f.min('n')).show()

+------------------+
|            min(n)|
+------------------+
|-1.261605945319069|
+------------------+



In [58]:
# Mean of n, returned as a one-row, one-column DataFrame.
df.select(f.avg('n')).show()

+------------------+
|            avg(n)|
+------------------+
|0.3664026449885216|
+------------------+



In [65]:
from pyspark.sql.functions import when

# Map each group letter to a label with a chained CASE WHEN. There is no
# otherwise(), so a value outside x/y/z would come back as null.
group_label = (
    when(df.group == 'x', 'group x')
    .when(df.group == 'y', 'group y')
    .when(df.group == 'z', 'group z')
    .alias('group name')
)
df.select(df.group, group_label).show(5)



+-----+----------+
|group|group name|
+-----+----------+
|    z|   group z|
|    x|   group x|
|    z|   group z|
|    y|   group y|
|    z|   group z|
+-----+----------+
only showing top 5 rows



In [64]:
from pyspark.sql.functions import lit

# Join group and n with a ':' separator. The separator is wrapped in lit()
# so it is treated as a literal value rather than as a column name.
df.select(
    f.concat(df.group, lit(':'), df.n),
).show(5)

+--------------------+
| concat(group, :, n)|
+--------------------+
|z:-0.712390662050588|
| x:0.753766378659703|
|z:-0.044503078338...|
|y:0.4518123387457...|
|z:1.3451017084510097|
+--------------------+
only showing top 5 rows



## 5.

In [66]:
# Rebuild the Spark DataFrame from the pandas data for this section and
# display its schema summary.
df = spark.createDataFrame(data=pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [67]:
# Label each row by its boolean value. `abool` is already a boolean column, so
# it is used directly as the condition instead of being compared with the
# strings 'True'/'False', which only worked through an implicit
# string-to-boolean cast. No otherwise() is used, so a null `abool` still
# yields a null label, exactly as before.
# The alias replaces the auto-generated "CASE WHEN ..." header, so the saved
# output of this cell must be regenerated after the change.
df.select(
    df.abool,
    when(df.abool, 'It is True')
    .when(~df.abool, 'It is False')
    .alias('abool_label'),
).show(5)

+-----+----------------------------------------------------------------------------------+
|abool|CASE WHEN (abool = True) THEN It is True WHEN (abool = False) THEN It is False END|
+-----+----------------------------------------------------------------------------------+
|false|                                                                       It is False|
|false|                                                                       It is False|
|false|                                                                       It is False|
|false|                                                                       It is False|
|false|                                                                       It is False|
+-----+----------------------------------------------------------------------------------+
only showing top 5 rows



In [68]:
# Clamp negatives to zero: rows with n < 0 become 0, all others keep n.
df.select(
    'n',
    when(df['n'] < 0, 0).otherwise(df['n']),
).show(n=5)

+--------------------+-----------------------------------+
|                   n|CASE WHEN (n < 0) THEN 0 ELSE n END|
+--------------------+-----------------------------------+
|  -0.712390662050588|                                0.0|
|   0.753766378659703|                  0.753766378659703|
|-0.04450307833805...|                                0.0|
| 0.45181233874578974|                0.45181233874578974|
|  1.3451017084510097|                 1.3451017084510097|
+--------------------+-----------------------------------+
only showing top 5 rows



## 6.

In [69]:
# Rebuild the Spark DataFrame from the pandas data for this section and
# display its schema summary.
df = spark.createDataFrame(data=pandas_dataframe)
df

DataFrame[n: double, group: string, abool: boolean]

In [70]:
# Rows belonging to group 'y' only.
df.where(df['group'] == 'y').show(n=5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|0.45181233874578974|    y|false|
| 0.5323378882945463|    y|false|
|-1.0453771305385342|    y| true|
| -1.261605945319069|    y|false|
| 0.5628467852810314|    y| true|
+-------------------+-----+-----+
only showing top 5 rows



In [71]:
# Keep rows whose flag is false. `abool` is a boolean column, so negate it
# directly rather than comparing it with the string 'false', which depended on
# an implicit string-to-boolean cast. Rows where `abool` is null are excluded
# either way.
df.filter(~df.abool).show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
| 0.45181233874578974|    y|false|
|  1.3451017084510097|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [72]:
# Every row except those in group 'y'.
df.where(df['group'] != 'y').show(n=5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|   0.753766378659703|    x|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
+--------------------+-----+-----+
only showing top 5 rows



In [73]:
# Rows with a strictly positive n.
df.where(df['n'] > 0).show(n=5)

+-------------------+-----+-----+
|                  n|group|abool|
+-------------------+-----+-----+
|  0.753766378659703|    x|false|
|0.45181233874578974|    y|false|
| 1.3451017084510097|    z|false|
| 0.5323378882945463|    y|false|
| 1.3501878997225267|    z|false|
+-------------------+-----+-----+
only showing top 5 rows



In [75]:
# Rows in group 'z' whose flag is false, expressed as one predicate.
# `abool` is boolean, so it is negated directly instead of being compared with
# the string 'false' (which relied on an implicit cast). The comparison is
# parenthesised because `&` binds tighter than `==` in Python.
df.filter((~df.abool) & (df.group == 'z')).show(5)

+--------------------+-----+-----+
|                   n|group|abool|
+--------------------+-----+-----+
|  -0.712390662050588|    z|false|
|-0.04450307833805...|    z|false|
|  1.3451017084510097|    z|false|
|  1.3501878997225267|    z|false|
| 0.12730328020698067|    z|false|
+--------------------+-----+-----+

