### Spark 101 - Spark API Exercises

In [21]:
import pandas as pd
import numpy as np
import pyspark
import pyspark.sql.functions as f
from pyspark.sql.functions import when
from pyspark.sql.functions import lit
from pydataset import data

### 1.

In [9]:
# One-column pandas frame listing a handful of programming languages.
languages = {
    'language': ['python', 'sql', 'spark', 'java', 'javascript'],
}
df = pd.DataFrame(languages)
df

Unnamed: 0,language
0,python
1,sql
2,spark
3,java
4,javascript


In [13]:
# Get (or start) the Spark session used by every later cell.
spark = pyspark.sql.SparkSession.builder.getOrCreate()

# Build the Spark frame from the `languages` dict rather than from `df` itself.
# The original `df = spark.createDataFrame(df)` rebound `df` to a Spark frame,
# so running the cell a second time handed createDataFrame a Spark DataFrame
# instead of a pandas one and failed. This form is safe to re-run.
df = spark.createDataFrame(pd.DataFrame(data=languages))
df

DataFrame[language: string]

In [14]:
# Preview the first five rows of the Spark frame.
df.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+----------+
|  language|
+----------+
|    python|
|       sql|
|     spark|
|      java|
|javascript|
+----------+



                                                                                

In [18]:
# Summary statistics (count / mean / stddev / min / max) for the frame.
language_summary = df.describe()
language_summary.show()

+-------+--------+
|summary|language|
+-------+--------+
|  count|       5|
|   mean|    null|
| stddev|    null|
|    min|    java|
|    max|     sql|
+-------+--------+



In [19]:
# Print the frame's column names, types and nullability as a tree.
df.printSchema()

root
 |-- language: string (nullable = true)



---

### 2.

In [22]:
# Load the "mpg" dataset into a Spark frame and preview the first five rows.
mpg_source = data("mpg")
mpg = spark.createDataFrame(mpg_source)
mpg.show(n=5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [34]:
# Assemble one sentence per vehicle from literal text and column values.
# The numeric year and cyl columns come out as text in the concatenated result.
statement_parts = [
    lit('The '), mpg.year, lit(' '),
    mpg.manufacturer, lit(' '), mpg.model, lit(' '), lit('has a '),
    mpg.cyl, lit(' '), lit('cylinder engine'),
]
vehicle_statement = f.concat(*statement_parts).alias('vehicle statement')

# truncate=False keeps the full sentence visible instead of clipping it.
mpg.select(vehicle_statement).show(10, truncate=False)

+------------------------------------------------+
|vehicle statement                               |
+------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine        |
|The 1999 audi a4 has a 4 cylinder engine        |
|The 2008 audi a4 has a 4 cylinder engine        |
|The 2008 audi a4 has a 4 cylinder engine        |
|The 1999 audi a4 has a 6 cylinder engine        |
|The 1999 audi a4 has a 6 cylinder engine        |
|The 2008 audi a4 has a 6 cylinder engine        |
|The 1999 audi a4 quattro has a 4 cylinder engine|
|The 1999 audi a4 quattro has a 4 cylinder engine|
|The 2008 audi a4 quattro has a 4 cylinder engine|
+------------------------------------------------+
only showing top 10 rows



In [65]:
# Keep only the text before the first '(' in `trans`, e.g. 'auto(l5)' -> 'auto'.
trans_type = f.substring_index(mpg.trans, '(', 1).alias('trans_type')
mpg.select(trans_type).show(5)

+----------+
|trans_type|
+----------+
|      auto|
|    manual|
|    manual|
|      auto|
|      auto|
+----------+
only showing top 5 rows



---

### 3.

In [66]:
# Load the "tips" dataset into a Spark frame and preview the first five rows.
tips_source = data("tips")
tips = spark.createDataFrame(tips_source)
tips.show(n=5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [96]:
# Total number of rows in `tips`, used as the denominator for the smoker shares.
# DataFrame.count() counts every row, matching what groupby(...).count() tallies
# (null groups included). The previous f.count(tips.smoker) skipped null values,
# which would make the shares sum to more than 1 if `smoker` ever had nulls.
total = tips.count()
total

244

In [99]:
# Rows per smoker category, plus each category's share of all rows
# (group count divided by the `total` computed in the previous cell).
smoker_counts = tips.groupby(tips.smoker).count()
smoker_counts.withColumn('percentage', f.col('count') / total).show()

+------+-----+-------------------+
|smoker|count|         percentage|
+------+-----+-------------------+
|    No|  151| 0.6188524590163934|
|   Yes|   93|0.38114754098360654|
+------+-----+-------------------+



In [101]:
# Append the tip as a fraction of the total bill to every row.
tip_share = tips.tip / tips.total_bill
tips.withColumn('tip_percentage', tip_share).show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|     tip_percentage|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows

