# Spark API

In [1]:
import pyspark

spark = pyspark.sql.SparkSession.builder.getOrCreate()

In [2]:
spark

For demonstration, we'll create a spark dataframe from a pandas dataframe.

In [3]:
import numpy as np
import pandas as pd
import pydataset

In [4]:
tips = pydataset.data('tips')
df = spark.createDataFrame(tips)
df.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

## DataFrame Basics

In [None]:
df.show(5)

In [None]:
df.head(5)

In [None]:
# Don't do this!
# just use .show to view df contents
df2 = df.show(10)

In [None]:
type(df2)

### Selecting Columns

In [None]:
df.select('total_bill', 'tip', 'size', 'day').show()

In [None]:
df.select('*')

In [None]:
df.select(df.tip / df.total_bill).show(5)

In [None]:
col = df.tip / df.total_bill
col

In [None]:
df.select('*', col.alias('tip_pct')).show(5)

In [None]:
df_with_tip_pct = df.select('*', col.alias('tip_pct'))

In [None]:
df_with_tip_pct.show(5)

### Selecting w/ Built In Functions

In [None]:
from pyspark.sql.functions import sum, mean, concat, lit, regexp_extract, regexp_replace, when

In [None]:
df.select(mean(df.tip), sum(df.total_bill)).show()

In [None]:
df.select(concat('day', lit(' '), 'time')).show(5)

In [None]:
df.select(df.time.cast('int')).show(5)

In [None]:
df = df.select(
    '*',
    (df.tip / df.total_bill).alias('tip_pct')
)

### When / Otherwise

In [None]:
df.select(
    'tip_pct',
    (when(df.tip_pct > .2, 'good tip')
     .otherwise('not good tip')
     .alias('tip_desc'))
).show(25)

### Regex

In [None]:
df.select(
    'time',
    regexp_extract('time', r'(\w).*', 1).alias('first_letter'),
    regexp_replace('time', r'[aeiou]', 'X')
).show(5)

## Transforming Rows

In [None]:
df.show()

### Sorting

In [None]:
df.orderBy(df.total_bill).show()

In [None]:
df.sort(df.day, df.size).show()

In [None]:
from pyspark.sql.functions import asc, desc, col

In [None]:
df.sort(df.day, asc('time'), desc('size')).show()

In [None]:
col('size').asc()

In [None]:
df.sort(col('size').desc(), col('time')).show()

### Filtering

In [None]:
df.where(df.tip < 4).show()

In [None]:
mask = df.tip < 4
df.where(mask).show()

In [None]:
df.filter((df.time == "Dinner") | (df.tip <= 2)).sort('tip').show()

In [None]:
df.where(df.smoker == "Yes").where(df.day == "Sat").show()

## Aggregating

In [None]:
from pyspark.sql.functions import mean, min, max

In [None]:
df.show(5)

In [None]:
df.groupBy('time').agg(mean('tip')).show()

In [None]:
df.groupBy('time').agg(min('tip'), mean('tip'), max('tip')).show()

In [None]:
df.groupBy('time').agg(mean('tip').alias('avg_tip')).show()

In [None]:
df.groupBy('time', 'day').agg(mean('total_bill')).show()

In [None]:
df.crosstab('time', 'day').show()

In [None]:
df.groupBy('time').pivot('day').agg(mean('total_bill')).show()

`.crosstab` is just for counts, for other methods of summarizing groups, use `.groupBy` (maybe in combination with `.pivot`) + `.agg`.

## Additional Features

### Spark SQL

In [None]:
df.createOrReplaceTempView('tips')

In [None]:
spark.sql('''
SELECT *
FROM tips
''').show()

In [None]:
# find the tip, total_bill, and day with the highest overall sales for that day
spark.sql('''
SELECT tip, total_bill, day
FROM tips
WHERE day = (
    SELECT day
    FROM tips
    GROUP BY day
    ORDER BY sum(total_bill) DESC
    LIMIT 1
)    
''').show()

### More Spark Dataframe Manipulation

In [None]:
df.where(
    df.time == 'Dinner'
).select(
    '*',
    (df.tip / df.total_bill).alias('tip_pct'),
).explain()

In [None]:
df.select(
    '*',
    (df.tip / df.total_bill).alias('tip_pct'),
).where(
    df.time == 'Dinner'
).explain()

### Mixing in SQL Expressions

In [None]:
from pyspark.sql.functions import expr

Expr lets us mix in parts of SQL into our dataframes

In [None]:
df.select(
    '*',
    expr('tip / total_bill as tip_pct')
).where(
    expr('day = "Sun" AND time = "Dinner"')
).show()

## Joins

In [28]:
df1 = spark.createDataFrame(pd.DataFrame({
    'id': np.arange(100) + 1,
    'x': np.random.randn(100).round(3),
    'group_id': np.random.choice(range(1, 7), 100),
}))
df2 = spark.createDataFrame(pd.DataFrame({
    'id': range(1, 7),
    'group': list('abcdef')
}))
df1.show(5)
df2.show()

+---+------+--------+
| id|     x|group_id|
+---+------+--------+
|  1| 0.069|       4|
|  2| 1.043|       2|
|  3|-1.496|       1|
|  4| 0.282|       1|
|  5|  0.77|       5|
+---+------+--------+
only showing top 5 rows

+---+-----+
| id|group|
+---+-----+
|  1|    a|
|  2|    b|
|  3|    c|
|  4|    d|
|  5|    e|
|  6|    f|
+---+-----+



In [21]:
df_merged = df1.join(df2, df1.group_id == df2.id)
df_merged.show(5)

+---+------+--------+---+-----+
| id|     x|group_id| id|group|
+---+------+--------+---+-----+
|  6| 0.031|       6|  6|    f|
|  8| 0.019|       6|  6|    f|
| 13| 0.972|       6|  6|    f|
| 19|-0.787|       6|  6|    f|
| 25| -0.17|       6|  6|    f|
+---+------+--------+---+-----+
only showing top 5 rows



In [22]:
df1.join(df2.withColumnRenamed('id', 'group_id'), 'group_id').show(5)

+--------+---+------+-----+
|group_id| id|     x|group|
+--------+---+------+-----+
|       6|  6| 0.031|    f|
|       6|  8| 0.019|    f|
|       6| 13| 0.972|    f|
|       6| 19|-0.787|    f|
|       6| 25| -0.17|    f|
+--------+---+------+-----+
only showing top 5 rows

