In [91]:
# imports

import pyspark

import pandas as pd
import numpy as np

from pydataset import data

from pyspark.sql.functions import col, expr, concat_ws

from pyspark.sql.functions import concat, sum, avg, min, max, count, mean, isnan

from pyspark.sql.functions import lit

from pyspark.sql.functions import regexp_extract, regexp_replace

from pyspark.sql.functions import when

from pyspark.sql.functions import asc, desc

from pyspark.sql.functions import month, year, quarter



## 1 / Create a spark data frame that contains your favorite programming languages.

The name of the column should be ```language```  
View the schema of the dataframe  
Output the shape of the dataframe  
Show the first 5 records in the dataframe  



In [3]:
# create spark object (using the SQL-based commands)

spark = pyspark.sql.SparkSession.builder.getOrCreate()


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/03/08 09:13:56 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [37]:
columns = ['language']
data = ['Python', '蟒蛇', 'SQL', 'Spark', 'Ruby', 'Rust', 'R', 'HTML', 'C++']

# make pandas df
p = pd.DataFrame({'language': data})

# make spark df
langs_df = spark.createDataFrame(p)



# df = spark.createDataFrame(
#     pd.DataFrame(
#         {"x": [1, 2, np.nan, 4, 5, np.nan], "y": [np.nan, 0, 0, 3, 1, np.nan]}
#     )
# )
# df.show()

langs_df.show(), langs_df.show(5)

+--------+
|language|
+--------+
|  Python|
|    蟒蛇|
|     SQL|
|   Spark|
|    Ruby|
|    Rust|
|       R|
|    HTML|
|     C++|
+--------+

+--------+
|language|
+--------+
|  Python|
|    蟒蛇|
|     SQL|
|   Spark|
|    Ruby|
+--------+
only showing top 5 rows



(None, None)

In [35]:
# size
print((langs_df.count(), len(langs_df.columns)))

print()

# schema
langs_df.printSchema()

(9, 1)

root
 |-- language: string (nullable = true)



## 2 / Load the mpg dataset as a spark dataframe.

Create 1 column of output that contains a message like :

```The 1999 audi a4 has a 4-cylinder engine.``` for each vehicle.

Transform the trans column so that it only contains either manual or auto.


In [41]:
# load mpg dataset

mpg = spark.createDataFrame(data("mpg"))


In [42]:
# show df

mpg.show(3)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 3 rows



In [51]:
# create phrase

a = 'The '
b = 'has a '
c = '-cylinder engine.'

mpg.select(concat(lit(f'{a}'), mpg.year, lit(f' '), mpg.manufacturer, lit(f' '),
                  mpg.model, lit(f' '), lit(f'{b}'), mpg.cyl, 
                  lit(f'{c}')).alias('phrase')).show(7, truncate = False)

+-----------------------------------------+
|phrase                                   |
+-----------------------------------------+
|The 1999 audi a4 has a 4-cylinder engine.|
|The 1999 audi a4 has a 4-cylinder engine.|
|The 2008 audi a4 has a 4-cylinder engine.|
|The 2008 audi a4 has a 4-cylinder engine.|
|The 1999 audi a4 has a 6-cylinder engine.|
|The 1999 audi a4 has a 6-cylinder engine.|
|The 2008 audi a4 has a 6-cylinder engine.|
+-----------------------------------------+
only showing top 7 rows



In [95]:
# / # Concatenates multiple input string columns together 
# into a single string column, using the given separator.


mpg.select(concat_ws('-*-', mpg.year, mpg.cyl).alias('s')).show(3)

# NB : adding .collect() to the end of a phrase instead of .show() shows all


+--------+
|       s|
+--------+
|1999-*-4|
|1999-*-4|
|2008-*-4|
+--------+
only showing top 3 rows



In [56]:
# look at 'trans' col

mpg.select(mpg.trans).show(3)


+----------+
|     trans|
+----------+
|  auto(l5)|
|manual(m5)|
|manual(m6)|
+----------+
only showing top 3 rows



In [85]:

mpg.select([count(when(col(c).isNotNull() , c)).alias(c) for c in mpg.columns]
   ).show()

+------------+-----+-----+----+---+-----+---+---+---+---+-----+
|manufacturer|model|displ|year|cyl|trans|drv|cty|hwy| fl|class|
+------------+-----+-----+----+---+-----+---+---+---+---+-----+
|         234|  234|  234| 234|234|  234|234|234|234|234|  234|
+------------+-----+-----+----+---+-----+---+---+---+---+-----+



In [84]:

mpg.select([count(when(col("trans").isNotNull() , c)).alias(c) for c in mpg.columns]
   ).show()

+------------+-----+-----+----+---+-----+---+---+---+---+-----+
|manufacturer|model|displ|year|cyl|trans|drv|cty|hwy| fl|class|
+------------+-----+-----+----+---+-----+---+---+---+---+-----+
|         234|  234|  234| 234|234|  234|234|234|234|234|  234|
+------------+-----+-----+----+---+-----+---+---+---+---+-----+

