In [0]:
from pyspark.sql import SparkSession

## Spark Session

In [0]:
# Get the active SparkSession, or build one with the application name "MyApp".
spark = (
    SparkSession.builder
    .appName("MyApp")
    .getOrCreate()
)

In [0]:
# One-column DataFrame of the integers 0..999; toDF renames the column to "number".
myRange = spark.range(0, 1000).toDF("number")

In [0]:
# Printing the DataFrame object itself (rather than calling .show()) displays its
# repr, not its rows — typically a schema summary; confirm on your runtime.
print(myRange)

In [0]:
# Display the first 5 rows.
myRange.show(n=5)

In [0]:
# Keep the rows whose "number" is even (where() and filter() are aliases).
divBy2 = myRange.filter("number%2=0")

In [0]:
# Display the first 5 even numbers.
divBy2.show(n=5)

## Action

In [0]:
# count() returns the number of rows in divBy2 (the even values of myRange).
divBy2.count()

## Read Flight Data (2015 summary table)

In [0]:
# NOTE(review): despite the name, this holds a table identifier rather than a CSV
# file path — it is the table name that the next cell loads with spark.table().
csv_path = "workspace.default.2015_summary"

In [0]:
# Load the flight-summary table named by csv_path (defined in the previous cell)
# instead of repeating the identifier literal, then preview the first 3 rows.
flightData2015 = spark.table(csv_path)
flightData2015.show(3)

In [0]:
# Fetch the first 3 rows to the driver as a list of Row objects.
flightData2015.take(num=3)


In [0]:
# Print the plan for sorting by "count" (orderBy is an alias of sort).
flightData2015.orderBy("count").explain()

## SQL and DataFrames

In [0]:
# Register the DataFrame as a temporary view so it can be queried with spark.sql().
flightData2015.createOrReplaceTempView(name="flight_data_2015")

In [0]:
# Count rows per destination country using SQL against the temp view.
destCountQuery = """
SELECT DEST_COUNTRY_NAME, count(1)
FROM flight_data_2015
GROUP BY DEST_COUNTRY_NAME
"""
sqlWay = spark.sql(destCountQuery)

In [0]:
# Row count per destination country, expressed with the DataFrame API.
dataFrameWay = (
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .count()
)

In [0]:
# Print both plans so the SQL and DataFrame versions can be compared.
for plannedFrame in (sqlWay, dataFrameWay):
    plannedFrame.explain()

In [0]:
# MAX without GROUP BY yields exactly one row, so take(1) fetches the whole result.
spark.sql("""
          SELECT MAX(count)
          FROM flight_data_2015
          """).take(1)

In [0]:
from pyspark.sql.functions import max

In [0]:
# `max` here is pyspark.sql.functions.max (imported in the previous cell), not the
# Python builtin; it builds a column expression for the largest "count" value.
(
    flightData2015
    .select(max("count"))
    .take(1)
)

In [0]:
# Top 5 destination countries by summed "count", highest first.
topDestinationsQuery = """
          SELECT DEST_COUNTRY_NAME, sum(count) as  destination_total
          FROM flight_data_2015
          GROUP BY DEST_COUNTRY_NAME
          ORDER BY sum(count) DESC
          LIMIT 5
          """
maxSql = spark.sql(topDestinationsQuery)

In [0]:
# Display the query result (LIMIT 5 in the SQL caps it at five rows).
maxSql.show()

In [0]:
from pyspark.sql.functions import desc

In [0]:
# Top 5 destinations via the DataFrame API: sum "count" per destination, rename
# the generated "sum(count)" column, sort descending, keep 5 rows, and display.
(
    flightData2015
    .groupBy("DEST_COUNTRY_NAME")
    .sum("count")
    .withColumnRenamed("sum(count)", "destination_total")
    .sort(desc("destination_total"))
    .limit(5)
    .show()
)
