In [12]:
import pyspark
import pandas as pd
import numpy as np
# create spark session
spark = pyspark.sql.SparkSession.builder.getOrCreate()
from pyspark.sql.functions import *

### Create a spark data frame that contains your favorite programming languages.
- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [13]:
# create pandas dataframe

pd_df = pd.DataFrame({'language': ['python', 'julia', 'ruby', 'R', 'C++', 'Javascript', 'Typescript', 'Swift', 'Rust']})

In [14]:
# create spark dataframe from panadas dataframe
sp_df = spark.createDataFrame(pd_df)
sp_df

DataFrame[language: string]

In [15]:
# View the shema of the dataframe
sp_df.printSchema()

root
 |-- language: string (nullable = true)



In [16]:
# another way to print schema
sp_df.dtypes

[('language', 'string')]

In [17]:
#output the shape of the dataframe
sp_df.describe().show()
print("DataFrame shape: ", sp_df.count(), " x ", len(sp_df.columns))

+-------+--------+
|summary|language|
+-------+--------+
|  count|       9|
|   mean|    null|
| stddev|    null|
|    min|     C++|
|    max|    ruby|
+-------+--------+

DataFrame shape:  9  x  1


In [18]:
sp_df.show(5)

+--------+
|language|
+--------+
|  python|
|   julia|
|    ruby|
|       R|
|     C++|
+--------+
only showing top 5 rows



### Load the mpg dataset as a spark dataframe.
- Create 1 column of output that contains a message like the one below:
>- The 1999 audi a4 has a 4 cylinder engine.
>- For each vehicle.
>- Transform the trans column so that it only contains either manual or auto.

In [26]:
#Spark dataframe
import pydataset

mpg = spark.createDataFrame(data("mpg"))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [20]:
mpg.select(
    mpg.year.alias("year"), col("hwy").alias("highway_mileage"),
    mpg.cty.alias("city_mileage"), mpg.cyl.alias("cylinders"))

DataFrame[year: bigint, highway_mileage: bigint, city_mileage: bigint, cylinders: bigint]

In [21]:
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean

In [22]:
# concatenate string columns to create the desired output column
description = mpg.select(concat(lit("The "), mpg.year, (lit(" ")), mpg.manufacturer, (lit(" ")), mpg.model, (lit(" has a")), mpg.cyl, (lit(" engine"))))
description.show(20, False)

+-------------------------------------------------------------------+
|concat(The , year,  , manufacturer,  , model,  has a, cyl,  engine)|
+-------------------------------------------------------------------+
|The 1999 audi a4 has a4 engine                                     |
|The 1999 audi a4 has a4 engine                                     |
|The 2008 audi a4 has a4 engine                                     |
|The 2008 audi a4 has a4 engine                                     |
|The 1999 audi a4 has a6 engine                                     |
|The 1999 audi a4 has a6 engine                                     |
|The 2008 audi a4 has a6 engine                                     |
|The 1999 audi a4 quattro has a4 engine                             |
|The 1999 audi a4 quattro has a4 engine                             |
|The 2008 audi a4 quattro has a4 engine                             |
|The 2008 audi a4 quattro has a4 engine                             |
|The 1999 audi a4 qu

In [24]:
#Transform the trans column so that it only contains either manual or auto.
mpg.select(
    regexp_extract('trans', r'^(\w+)\(', 1).alias('trans_extract'),
    regexp_replace('trans', r'\(.+$', '').alias('trans_replace'),
    when(mpg.trans.like('auto%'), 'auto').otherwise('manual').alias('trans_when')
).show()

+-------------+-------------+----------+
|trans_extract|trans_replace|trans_when|
+-------------+-------------+----------+
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
+-------------+-------------+----------+
only showing top

### Load the tips dataset as a spark dataframe.
- What percentage of observations are smokers?
-  Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [27]:
# Load the tips dataset

tips = spark.createDataFrame(pydataset.data('tips'))
tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [None]:
mpg.hwy + 1

In [None]:
mpg.select(mpg.hwy, mpg.hwy +1).show(10)

In [None]:
mpg.select(mpg.hwy.alias('highway_milage'), mpg.hwy +1).alias('highway_milage_plus1').show(10)

In [None]:
from pyspark.sql.functions import col, expr

In [None]:
col('hwy')

In [None]:
avg_col = (col('hwy')+col('cty'))/2
mpg.select(
    col('hwy').alias("highway_milage"),
    mpg.cty.alias('city_milage'),
    avg_col.alias('avg_milage'),).show(5)

In [None]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [None]:
textdf = spark.createDataFrame(
    pd.DataFrame(
        {
            "address": [
                "600 Navarro St ste 600, San Antonio, TX 78205",
                "3130 Broadway St, San Antonio, TX 78209",
                "303 Pearl Pkwy, San Antonio, TX 78215",
                "1255 SW Loop 410, San Antonio, TX 78227",
            ]
        }
    )
)

textdf.show(truncate=False)

In [None]:
textdf.select(
    "address",
    regexp_extract("address", r"^(\d+)", 1).alias("street_no"),
    regexp_extract("address", r"^\d+\s([\w\s]+?),", 1).alias("street"),
).show(truncate=False)

In [None]:
textdf.select(
    "address",
    regexp_replace("address", r"^.*?,\s*", "").alias("city_state_zip"),
).show(truncate=False)

In [None]:
mpg.filter(mpg.cyl == 4).where(mpg["class"] == "subcompact").show()

In [None]:
from pyspark.sql.functions import when

In [None]:
mpg.select(mpg.hwy, when(mpg.hwy > 25, "good_mileage").alias("mpg_desc")).show(
    12
)

In [None]:
mpg.select(
    mpg.hwy,
    when(mpg.hwy > 25, "good_mileage")
    .otherwise("bad_mileage")
    .alias("mpg_desc"),
).show(20)

In [None]:
from pyspark.sql.functions import asc, desc
from pyspark.sql.functions import when
from pyspark.sql.functions import concat, sum, avg, min, max, count, mean
from pyspark.sql.functions import lit

In [None]:
mpg.groupBy(mpg.cyl)
mpg.groupBy(col("cyl"))
mpg.groupBy("cyl")

In [None]:
mpg.groupBy(mpg.cyl).agg(avg(mpg.cty), avg(mpg.hwy)).show()

In [None]:
mpg.groupBy("cyl", "class").agg(avg(mpg.cty), avg(mpg.hwy)).show()

In [None]:
mpg.rollup("cyl").count().sort("cyl").show()

In [None]:
mpg.rollup("cyl").agg(expr("avg(hwy)")).sort("cyl").show()

In [None]:
mpg.rollup("cyl", "class").mean("hwy").sort(col("cyl"), col("class")).show()