In [99]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.functions import col, concat, lit, when, regexp_replace, regexp_extract, countDistinct, sum
from pyspark.sql import SparkSession
import pandas as pd
import numpy as np
from pydataset import data as pydata
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
import seaborn as sns

## Create a spark data frame that contains your favorite programming languages.

In [56]:
spark = SparkSession.builder \
    .appName("FavoriteProgrammingLanguages") \
    .getOrCreate()


data = [("Python",),
        ("Java",),
        ("JavaScript",),
        ("C++",),
        ("Ruby",),
        ("Swift",),
        ("Go",)]

columns = ["language"]
df = spark.createDataFrame(data, columns)

df.printSchema()

num_rows = df.count()
num_cols = len(df.columns)
print(f"Shape of the DataFrame: {num_rows} rows, {num_cols} columns")

df.show(5)

root
 |-- language: string (nullable = true)

Shape of the DataFrame: 7 rows, 1 columns
+----------+
|  language|
+----------+
|    Python|
|      Java|
|JavaScript|
|       C++|
|      Ruby|
+----------+
only showing top 5 rows



## The name of the column should be language

In [57]:
df.select('language').show()

+----------+
|  language|
+----------+
|    Python|
|      Java|
|JavaScript|
|       C++|
|      Ruby|
|     Swift|
|        Go|
+----------+



## View the schema of the dataframe

In [58]:
df.printSchema()


root
 |-- language: string (nullable = true)



## Output the shape of the dataframe

In [59]:
num_rows = df.count()
num_cols = len(df.columns)

In [60]:
print(f'({num_cols},{num_rows})')

(1,7)


## Show the first 5 records in the dataframe

In [61]:
df.select('language').show(5)

+----------+
|  language|
+----------+
|    Python|
|      Java|
|JavaScript|
|       C++|
|      Ruby|
+----------+
only showing top 5 rows



## Load the mpg dataset as a spark dataframe.

In [62]:
mpg = pydata('mpg')

## Create 1 column of output that contains a message like the one below:

#### "The 1999 audi a4 has a 4 cylinder engine."

### For each vehicle.

In [63]:
spark_mpg_df = spark.createDataFrame(mpg)

spark_mpg_df = spark_mpg_df.withColumn(
    "message",
    concat(
        lit("The "),
        col("manufacturer"), lit(" "),
        col("model"), lit(" has a "),
        col("cyl"), lit(" cylinder engine.")
    )
)

spark_mpg_df.show(truncate=False)




+------------+------------------+-----+----+---+----------+---+---+---+---+-------+---------------------------------------------------------+
|manufacturer|model             |displ|year|cyl|trans     |drv|cty|hwy|fl |class  |message                                                  |
+------------+------------------+-----+----+---+----------+---+---+---+---+-------+---------------------------------------------------------+
|audi        |a4                |1.8  |1999|4  |auto(l5)  |f  |18 |29 |p  |compact|The audi a4 has a 4 cylinder engine.                     |
|audi        |a4                |1.8  |1999|4  |manual(m5)|f  |21 |29 |p  |compact|The audi a4 has a 4 cylinder engine.                     |
|audi        |a4                |2.0  |2008|4  |manual(m6)|f  |20 |31 |p  |compact|The audi a4 has a 4 cylinder engine.                     |
|audi        |a4                |2.0  |2008|4  |auto(av)  |f  |21 |30 |p  |compact|The audi a4 has a 4 cylinder engine.                     |
|audi 

## Transform the trans column so that it only contains either manual or auto.

In [66]:
transformed_df = spark_mpg_df.withColumn(
    "trans",
    regexp_extract(col("trans"), r"(\w+)", 1)
)


In [67]:
transformed_df.show()

+------------+------------------+-----+----+---+------+---+---+---+---+-------+--------------------+
|manufacturer|             model|displ|year|cyl| trans|drv|cty|hwy| fl|  class|             message|
+------------+------------------+-----+----+---+------+---+---+---+---+-------+--------------------+
|        audi|                a4|  1.8|1999|  4|  auto|  f| 18| 29|  p|compact|The audi a4 has a...|
|        audi|                a4|  1.8|1999|  4|manual|  f| 21| 29|  p|compact|The audi a4 has a...|
|        audi|                a4|  2.0|2008|  4|manual|  f| 20| 31|  p|compact|The audi a4 has a...|
|        audi|                a4|  2.0|2008|  4|  auto|  f| 21| 30|  p|compact|The audi a4 has a...|
|        audi|                a4|  2.8|1999|  6|  auto|  f| 16| 26|  p|compact|The audi a4 has a...|
|        audi|                a4|  2.8|1999|  6|manual|  f| 18| 26|  p|compact|The audi a4 has a...|
|        audi|                a4|  3.1|2008|  6|  auto|  f| 18| 27|  p|compact|The audi a4 

## Load the tips dataset as a spark dataframe.

In [68]:
tips = pydata('tips')

## What percentage of observations are smokers?

In [70]:
tips_df = spark.createDataFrame(tips)

In [71]:
tips_df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [78]:
smoker_counts = tips_df.groupBy('smoker').count().show()

+------+-----+
|smoker|count|
+------+-----+
|    No|  151|
|   Yes|   93|
+------+-----+



In [85]:
total_count = tips_df.count()

In [89]:
total_count

244

In [100]:
smoker_count = tips_df.filter(col("smoker") == "Yes").count()

smoker_percentage = (smoker_count / total_count) * 100

In [101]:
smoker_percentage

38.114754098360656

## Create a column that contains the tip percentage

In [113]:
tips_df = tips_df.withColumn(
     "tip_percentage",
    .agg(round((col("tip") / col("total_bill")) * 100,2))
)



SyntaxError: invalid syntax (310396625.py, line 3)

In [111]:
tips_df.show(5)

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|    tip_percentage|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|5.9446733372572105|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|16.054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|16.658733936220845|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 13.97804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|14.680764538430255|
+----------+----+------+------+---+------+----+------------------+
only showing top 5 rows



## Calculate the average tip percentage for each combination of sex and smoker.

## Use the seattle weather dataset referenced in the lesson to answer the questions below.

## Convert the temperatures to fahrenheit.

## Which month has the most rain, on average?

## Which year was the windiest?

## What is the most frequent type of weather in January?

## What is the average high and low temperature on sunny days in July in 2013 and 2014?

## What percentage of days were rainy in q3 of 2015?

## For each year, find what percentage of days it rained (had non-zero precipitation).