In [1]:
import jedi

In [2]:
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session from a single configuration object.
# Everything goes through `getOrCreate()` so this cell can be re-run safely:
# instantiating `SparkContext(conf=conf)` directly fails with "Cannot run
# multiple SparkContexts at once" the second time the cell is executed in the
# same kernel.
conf = SparkConf().setAppName("Spark SQL Course")

spark = (
    SparkSession
    .builder
    .config(conf=conf)
    .getOrCreate()
)

# The session owns the underlying SparkContext; expose it as `sc` for the
# RDD-level examples further down.
sc = spark.sparkContext

# `DataFrame`

In [None]:
from pyspark.sql import Row

# Three sample records; each Row carries named fields `name` and `age`.
people = [("John", 21), ("James", 32), ("Jane", 18)]
row1, row2, row3 = (Row(name=name, age=age) for name, age in people)

# Row fields can be read by key, like a dictionary.
row1['name']

In [None]:
# Build a DataFrame from the three Row objects; no schema is passed, so
# column names and types are inferred from the Row fields.
df = spark.createDataFrame([row1, row2, row3])

In [None]:
# Print the inferred schema (column names, types, nullability) as a tree.
df.printSchema()

In [None]:
# Render the DataFrame's rows as a text table.
df.show()

In [None]:
# `toDebugString()` returns bytes describing the lineage of the RDD that
# backs the DataFrame; decode it before printing so newlines render properly.
lineage = df.rdd.toDebugString()
print(lineage.decode("utf-8"))

In [None]:
# Number of partitions of the RDD underlying the DataFrame.
df.rdd.getNumPartitions()

## Creating DataFrames

In [None]:
# Describe each person as a plain dict, then unpack it into a Row with `**`:
# `Row(**record)` is equivalent to spelling the fields out as keyword arguments.
records = [
    {'name': "John", 'age': 21, 'gender': "male"},
    {'name': "James", 'age': 25, 'gender': "female"},
    {'name': "Albert", 'age': 46, 'gender': "male"},
    {'name': "Caesar", 'age': 56, 'gender': 'male'},
]
rows = [Row(**record) for record in records]

df = spark.createDataFrame(rows)
df.show()

In [None]:
# Plain Python lists work as rows too; since they carry no field names,
# the column names are supplied separately as the schema argument.
rows = [
    ["John", 21, "male"],
    ["James", 25, "female"],
    ["Albert", 46, "male"],
    ["Jane", 33, None],  # a missing gender is given as None
]
column_names = ["name", "age", "gender"]

df = spark.createDataFrame(data=rows, schema=column_names)
df.show()

In [None]:
# Inspect the schema Spark inferred for the list-based DataFrame.
df.printSchema()

In [None]:
column_names = ["name", "age", "gender"]

# Distribute a local collection of tuples as an RDD ...
people = [
    ("John", 21, "male"),
    ("James", 25, "female"),
    ("Albert", 46, "male"),
]
rdd = sc.parallelize(people)

# ... and turn that RDD into a DataFrame by naming its columns.
df = spark.createDataFrame(data=rdd, schema=column_names)
df.show()

In [None]:
# Display the RDD object's repr (this does not collect or print its contents).
rdd

# Schema

In [None]:
# The schema is available as a Python object, not only as printed text.
df.schema

In [None]:
# Check which class represents a schema (expected: pyspark.sql.types.StructType).
type(df.schema)

In [None]:
from pyspark.sql.types import *

# Declare the schema explicitly instead of relying on inference: each `add`
# appends one field as (column name, data type, nullable).
schema = (
    StructType()
    .add("name", StringType(), True)
    .add("age", IntegerType(), True)
    .add("gender", StringType(), True)
)

rows = [("John", 21, "male")]
df = spark.createDataFrame(data=rows, schema=schema)

df.printSchema()
df.show()

In [None]:
from pathlib import Path
import os  # not used in this cell any more; kept in case other cells rely on it

# Resolve the working directory in Python rather than shelling out to `pwd`,
# which is not available on every platform and only duplicates this value.
dirpath = Path.cwd()
print(dirpath)

# Path of the compressed CSV file, expected to sit next to the notebook.
data_path = dirpath / 'gro.csv.gz'
data_path

In [None]:
# Read the gzip-compressed CSV file: the first line holds the column names
# and fields are separated by semicolons.
reader = (
    spark.read
    .format('csv')
    .options(header='true', sep=";")
)
df = reader.load('gro.csv.gz')

In [None]:
# Inspect the schema of the CSV-backed DataFrame. No schema or `inferSchema`
# option was given to the reader, so the columns are presumably all strings
# (confirm against the printed output).
df.printSchema()

In [None]:
from datetime import date

# Show the constructor summary for `date` using plain Python. The IPython-only
# `?date` form opens the help pager and does not survive export to a script.
print(date.__doc__)

In [None]:

# Product catalogue, keyed by `prod_id`.
products = spark.createDataFrame(
    data=[
        ('1', 'mouse', 'microsoft', 39.99),
        ('2', 'keyboard', 'logitech', 59.99),
    ],
    schema=['prod_id', 'prod_cat', 'prod_brand', 'prod_value'],
)

# Purchase log; each purchase refers to a product through `prod_id`.
purchases = spark.createDataFrame(
    data=[
        (date(2017, 11, 1), 2, '1'),
        (date(2017, 11, 2), 1, '1'),
        (date(2017, 11, 5), 1, '2'),
    ],
    schema=['date', 'quantity', 'prod_id'],
)

# No join type is given, so the default ("INNER") is used.
purchases_with_products = purchases.join(products, on='prod_id')
purchases_with_products.show()

In [None]:
# Register both DataFrames as temporary views so they can be queried by name in SQL.
for view_name, frame in (("products", products), ("purchases", purchases)):
    frame.createOrReplaceTempView(view_name)

# The same inner join as with the DataFrame API, expressed in SQL.
query = """
SELECT * FROM
(purchases AS prc INNER JOIN products AS prd 
on prc.prod_id = prd.prod_id)
"""
sql_result = spark.sql(query)
sql_result.show()


In [None]:
# Purchases whose key column has a different name (`prod_id_x`) than in
# `products`; product '3' does not exist in the catalogue.
new_purchases = spark.createDataFrame(
    data=[
        (date(2017, 11, 1), 2, '1'),
        (date(2017, 11, 2), 1, '3'),
    ],
    schema=['date', 'quantity', 'prod_id_x'],
)

# With differently named keys, the join condition is an explicit column
# expression. The join type is given explicitly as 'left' (not the default
# inner join): every row of `new_purchases` is kept, and the product columns
# are null where no product matches.
join_rule = new_purchases['prod_id_x'] == products['prod_id']
new_purchases.join(products, on=join_rule, how='left').show()


In [None]:
# NOTE(review): this cell repeats the left join of the previous cell with the
# same data and the same join type; confirm whether a different join type was
# meant to be demonstrated here, or whether the cell can be dropped.
purchase_records = [
    (date(2017, 11, 1), 2, '1'),
    (date(2017, 11, 2), 1, '3'),
]
new_purchases = spark.createDataFrame(
    purchase_records, schema=['date', 'quantity', 'prod_id_x']
)

# Explicit left join on an equality between the two differently named key columns.
join_rule = new_purchases.prod_id_x == products.prod_id
left_joined = new_purchases.join(products, on=join_rule, how='left')
left_joined.show()

# Various types of joins

In [None]:

# Two small tables sharing an `id` column. Ids 3 and 4 appear on both sides
# (4 twice on the right), 1 and 2 only on the left, 5 and 6 only on the right.
left = spark.createDataFrame(
    data=[(1, "A1"), (2, "A2"), (3, "A3"), (4, "A4")],
    schema=["id", "value"],
)
right = spark.createDataFrame(
    data=[(3, "A3"), (4, "A4"), (4, "A4_1"), (5, "A5"), (6, "A6")],
    schema=["id", "value"],
)

# Show both inputs, sorted by id, under a label.
for label, frame in (("LEFT", left), ("RIGHT", right)):
    print(label)
    frame.orderBy("id").show()

# Join types compared in the next cell.
join_types = ["inner", "outer", "left", "right", "leftsemi", "leftanti"]

In [None]:
# Run the same join on `id` once per join type and print each result,
# sorted by id, under the name of the join type.
for join_type in join_types:
    print(join_type)
    joined = left.join(right, on="id", how=join_type)
    joined.orderBy("id").show()