In [166]:
import pyspark
import pandas as pd
import numpy as np
import pydataset
import re
from pyspark.sql.functions import regexp_replace, col, udf, exp, lit
from pyspark.sql.types import IntegerType

In [97]:
# Obtain a Spark session: getOrCreate() hands back the existing session when
# there is one and builds a new one otherwise.
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()
# Bare last expression so the notebook renders the session summary.
spark

In [98]:
# Build a single-column pandas frame of language names, then convert it to a
# Spark DataFrame.
language_names = ['python', 'DOS', 'colbolt', 'moo', 'batch', 'c']
languages_pdf = pd.DataFrame({'languages': language_names})
data = spark.createDataFrame(languages_pdf)

In [99]:
# Preview the rows, print the schema, then report the shape as (rows, columns).
data.show()
data.printSchema()
row_count = data.count()
column_count = len(data.columns)
print((row_count, column_count))

+---------+
|languages|
+---------+
|   python|
|      DOS|
|  colbolt|
|      moo|
|    batch|
|        c|
+---------+

root
 |-- languages: string (nullable = true)

(6, 1)


In [100]:
# Limit the preview to the first five rows.
data.show(n=5)

+---------+
|languages|
+---------+
|   python|
|      DOS|
|  colbolt|
|      moo|
|    batch|
+---------+
only showing top 5 rows



In [101]:
# Load the 'mpg' sample dataset via pydataset and convert it to a Spark
# DataFrame.
mpg_pdf = pydataset.data('mpg')
data = spark.createDataFrame(mpg_pdf)

In [102]:
# Preview the current frame; 20 rows is show()'s default, spelled out here.
data.show(n=20)

+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual(m5)|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto(av)|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual(m5)|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto(l5)|  4| 16| 25|  p|c

In [103]:
# Remove the parenthesised suffix from `trans` (e.g. 'auto(l5)' -> 'auto').
# withColumn already returns a new DataFrame, so the result is assigned
# directly instead of collect()-ing every row to the driver and rebuilding a
# DataFrame from the resulting Row list.
data = data.withColumn('trans', regexp_replace(data['trans'], r'\([^)]*\)', ''))

In [104]:
# Display the frame again to check the `trans` column; 20 rows is the default.
data.show(n=20)

+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto|  4| 16| 25|  p|compact|
|        audi|        a4 quattro|  2.0|2

In [107]:
# Load the 'tips' sample dataset via pydataset and convert it to a Spark
# DataFrame.
tips_pdf = pydataset.data('tips')
data = spark.createDataFrame(tips_pdf)

In [109]:
# Preview the current frame; 20 rows is show()'s default, spelled out here.
data.show(n=20)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [138]:
# Percentage of rows whose `smoker` value is 'Yes', truncated to a whole number.
smoker_rows = data.where(data.smoker == 'Yes').count()
total_rows = data.count()
smoker_pct = int((smoker_rows / total_rows) * 100)
# The percent sign follows the number ("38% ..."), not precedes it.
print(str(smoker_pct) + '% are smokers')

%38 are smokers


In [168]:
# Tip expressed as a percentage of the total bill. Arithmetic on columns
# already produces a Column expression, so it is passed to withColumn directly;
# lit() is meant for Python literals and adds nothing here.
data = data.withColumn("percent_tip", (data.tip / data.total_bill) * 100)
data.show()

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|       percent_tip|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|5.9446733372572105|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|16.054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|16.658733936220845|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 13.97804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|14.680764538430255|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4| 18.62396204033215|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2| 22.80501710376283|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|11.607142857142858|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|13.031914893617023|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|21.853856562922868|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2| 16.65043816942551|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|14.180374361883

In [179]:
# Mean tip percentage for every (sex, smoker) combination.
group = data.groupBy('sex', 'smoker')
mean_tip_by_group = group.agg({'percent_tip': 'mean'})
mean_tip_by_group.show()

+------+------+------------------+
|   sex|smoker|  avg(percent_tip)|
+------+------+------------------+
|  Male|    No| 16.06687151291298|
|  Male|   Yes|15.277117520248513|
|Female|    No|15.692097076918358|
|Female|   Yes| 18.21503526994103|
+------+------+------------------+

