In [12]:
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession

1. Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [16]:
# Create a SparkSession
spark = SparkSession.builder.getOrCreate()

# Define the data
data = [("Python",), ("Java",), ("JavaScript",), ("C++",), ("Ruby",)]

# Create the DataFrame
df = spark.createDataFrame(data, ["language"])

# View the schema
df.printSchema()

# Output the shape
print("Number of rows: ", df.count())
print("Number of columns: ", len(df.columns))

# Show the first 5 records
df.show(5)


root
 |-- language: string (nullable = true)

Number of rows:  5
Number of columns:  1
+----------+
|  language|
+----------+
|    Python|
|      Java|
|JavaScript|
|       C++|
|      Ruby|
+----------+



2. Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below:


The 1999 audi a4 has a 4 cylinder engine.
For each vehicle.

- Transform the trans column so that it only contains either manual or auto.



In [84]:
from pydataset import data

In [116]:
df = spark.createDataFrame(data('mpg'))
df.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [117]:
from pyspark.sql.functions import concat, lit

In [118]:
# create 1 column
df.withColumn(
    "message",
    concat(lit("The "),
           df.year,
           lit(' '), 
           df.manufacturer, 
           lit(' '), 
           df.model, 
           lit(' has a '), 
           df.cyl, 
           lit(" cylinder engine."))
).show(5)



+------------+-----+-----+----+---+----------+---+---+---+---+-------+--------------------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|             message|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+--------------------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|The 1999 audi a4 ...|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|The 1999 audi a4 ...|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|The 2008 audi a4 ...|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|The 2008 audi a4 ...|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|The 1999 audi a4 ...|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+--------------------+
only showing top 5 rows



In [119]:
from pyspark.sql.functions import regexp_extract, regexp_replace

In [120]:
# using regexp extract the manual and auto
df = df.withColumn("trans", regexp_extract(df.trans, r"(\b\w+\b)", 0))

In [121]:
df.show()

+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|manufacturer|             model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|
+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|        audi|                a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|
|        audi|                a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|
|        audi|                a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|
|        audi|                a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|
|        audi|                a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|
|        audi|                a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|
|        audi|                a4|  3.1|2008|  6|  auto|  f| 18| 27|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|manual|  4| 18| 26|  p|compact|
|        audi|        a4 quattro|  1.8|1999|  4|  auto|  4| 16| 25|  p|compact|
|        audi|        a4 quattro|  2.0|2