## Schemas

<https://www.kaggle.com/c/tmdb-box-office-prediction/data>

In [None]:
import pandas as pd

from pyspark.sql import SparkSession

# Get the current Spark session, or create one, under the application name
# "PySparkShell".
spark = (
    SparkSession.builder
    .appName("PySparkShell")
    .getOrCreate()
)

In [None]:
# Evaluate the session object so the notebook shows it as the cell output.
spark

## Create DataFrames from RDDs and local data

In [None]:
# Build the RDD through the session's own SparkContext: this notebook only
# creates `spark`, so a bare `sc` exists only when the kernel was started by
# the pyspark shell.
cities_rdd = spark.sparkContext.parallelize([
    ("MAD", "Madrid", "ES", 40.4165, -3.70256),
    ("BCN", "Barcelona", "ES", 41.297078, 2.078464),
    ("PAR", "Paris", "FR", 48.85341, 2.3488),
    ("ROM", "Rome", "IT", 41.89193, 12.51133),
])

# Name the tuple fields positionally; no column types are given explicitly.
cities_df = cities_rdd.toDF([
    "city_code", "city_name", "country_code", "latitude", "longitude",
])

cities_df.show()

In [None]:
# Same city table, built straight from a local list of tuples (no RDD step).
cities_df = spark.createDataFrame(
    data=[
        ("MAD", "Madrid", "ES", 40.4165, -3.70256),
        ("BCN", "Barcelona", "ES", 41.297078, 2.078464),
        ("PAR", "Paris", "FR", 48.85341, 2.3488),
        ("ROM", "Rome", "IT", 41.89193, 12.51133),
    ],
    schema=["city_code", "city_name", "country_code", "latitude", "longitude"],
)

cities_df.show()

In [None]:
# Print the column names and types of cities_df (none were declared explicitly).
cities_df.printSchema()

In [None]:
# Keep only the rows whose country code is "ES" and display them.
cities_df.where(cities_df["country_code"] == "ES").show()

In [None]:
# Project two of the columns and display the result.
cities_df.select(["city_code", "country_code"]).show()

In [None]:
import pyspark.sql.functions as F 
# Count the cities of each country and sort the result by country code.
agg_df = (
    cities_df
    .groupBy("country_code")
    .agg(F.count(F.col("city_code")))
    .orderBy("country_code")
)
agg_df.show()

In [None]:
# Print the execution plan Spark built for the aggregation.
agg_df.explain()

In [None]:
# toDebugString() hands back bytes, hence the decode() before printing the
# lineage of the RDD underlying agg_df.
print(
    agg_df.rdd
    .toDebugString()
    .decode()
)

In [None]:
# Show Spark's summary statistics for cities_df.
cities_df.describe().show()

### Array and Struct types

In [None]:
# One row per country: its code, an array of languages and a (code, name)
# tuple for the capital. The RDD is created through spark.sparkContext because
# this notebook never defines a bare `sc`.
countries = (
    spark.sparkContext
    .parallelize([
        ("ES", ["Spanish", "Catalan", "Basque"], ("MAD", "Madrid")),
        ("FR", ["French", "Alsacien", "Breton"], ("PAR", "Paris")),
        ("IT", ["Italian", "French"], ("ROM", "Rome")),
        ("US", ["English", "Spanish"], ("WAS", "Washington")),
    ])
    .toDF(["country_code", "languages", "capital"])
    # The tuple's fields are addressed as _1/_2; rebuild the struct with
    # meaningful field names.
    .withColumn(
        "capital",
        F.struct(
            F.col("capital._1").alias("code"),
            F.col("capital._2").alias("name"),
        ),
    )
)
countries.show(truncate=False)

In [None]:
# Print the schema, including the `languages` and `capital` columns built above.
countries.printSchema()

In [None]:
# Pick the first language of each country and the `name` field of the capital.
countries.select(
    "country_code",
    F.col("languages")[0],
    "capital.name",
).show()

## Read csv data

In [None]:
#!wget https://raw.githubusercontent.com/JulienCojan/pyspark_kschool/master/data/tmdb-box-office-prediction/train.csv -P data/tmdb-box-office-prediction

In [None]:
# Load the TMDB training set; the first line of the file holds the column names.
films_sdf = spark.read.csv(
    path="data/tmdb-box-office-prediction/train.csv",
    header=True,
)

In [None]:
# Print the schema of the freshly loaded CSV.
films_sdf.printSchema()

In [None]:
# Display the first rows of the films DataFrame.
films_sdf.show()

In [None]:
# Bring ten `cast` values to the driver as pandas and print the second one in full.
print(films_sdf.select("cast").limit(10).toPandas()["cast"].iloc[1])

In [None]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F

# Convert the `id` column to an integer type, then check the resulting schema.
films_sdf = films_sdf.withColumn("id", films_sdf["id"].cast(IntegerType()))

films_sdf.printSchema()

In [None]:
# Show five raw `genres` values without truncating them.
films_sdf.select("genres").show(n=5, truncate=False)

In [None]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

# Target type for `genres`: an array of structs with an integer `id` and a
# string `name`.
schema = ArrayType(
    StructType()
    .add("id", IntegerType())
    .add("name", StringType())
)

# Parse the textual `genres` column into that array-of-structs type.
films_sdf = films_sdf.withColumn("genres", F.from_json("genres", schema))

In [None]:
# Print the schema again to inspect the type of the parsed `genres` column.
films_sdf.printSchema()

In [None]:
# Show five parsed `genres` values without truncating them.
films_sdf.select("genres").show(n=5, truncate=False)

In [None]:
# Project only the `name` field of the genre structs.
films_sdf.select("genres.name").show(n=5, truncate=False)

In [None]:
# Print the schema of the `genres.name` projection.
films_sdf.select("genres.name").printSchema()

## String manipulation

In [None]:
# Reload the raw CSV so `genres` is available again as plain text.
films_sdf2 = (
    spark.read
    .option("header", True)
    .csv("data/tmdb-box-office-prediction/train.csv")
)

In [None]:
# Show the raw `genres` strings without truncating them.
films_sdf2.select("genres").show(truncate=False)

In [None]:
# Strip the first and last character of each `genres` string (presumably the
# enclosing square brackets): start at position 2 and keep length - 2 characters.
genres_col = films_sdf2.select(
    "id",
    F.col("genres")
    .substr(F.lit(2), F.length("genres") - 2)
    .alias("genres"),
)

genres_col.show(n=3, truncate=False)

In [None]:
# Split on every ", " that directly follows a closing brace, then return the
# first two rows.
(
    genres_col
    .withColumn("genres_array", F.split("genres", '(?<=}), '))
    .head(2)
)

In [None]:
# One output row per (film id, genre entry): split as above, then explode the
# resulting array.
genre_sdf = genres_col.select(
    "id",
    F.explode(F.split("genres", '(?<=}), ')).alias("genre"),
)

genre_sdf.show(truncate=False)

In [None]:
# Count films per genre name, most frequent first.
# The name is everything after "'name': '" up to the next single quote, so
# multi-word or all-caps names (e.g. "Science Fiction", "TV Movie") are kept
# whole. The pattern is a raw string so Python does not try to interpret any
# regex escapes.
(
    genre_sdf
    .select(
        F.col("id"),
        F.regexp_extract(F.col("genre"), r"(?<='name': ')[^']+", 0).alias("genre_name"),
    )
    .groupBy("genre_name")
    .count()
    .orderBy(F.desc("count"))
    .show()
)

## Saving data

In [None]:
# Print the schema of the DataFrame about to be written.
films_sdf.printSchema()

In [None]:
# Write the films as Parquet from two partitions, replacing any earlier output.
films_sdf.repartition(2).write.parquet("data/tmdb1", mode="overwrite")

In [None]:
# List the files in the Parquet output directory.
!ls data/tmdb1

In [None]:
# Read the Parquet directory back into a DataFrame.
new_films_sdf = spark.read.format("parquet").load("data/tmdb1")

In [None]:
# Print the schema of the data read back from Parquet.
new_films_sdf.printSchema()

## Reading with an explicit schema

In [None]:
# Import every type this cell uses so it does not rely on imports made in an
# earlier section of the notebook.
from pyspark.sql.types import (
    ArrayType,
    IntegerType,
    LongType,
    StringType,
    StructField,
    StructType,
)

# Explicit schema with two columns: an `id` declared non-nullable, and `genres`
# as an array of (id, name) structs.
schema = StructType([
    StructField("id", IntegerType(), False),
    StructField("genres", ArrayType(
        StructType([
            StructField("id", IntegerType()),
            StructField("name", StringType()),
        ]))),
])

In [None]:
# Read the same Parquet directory, this time applying the explicit schema.
new_films_sdf = spark.read.schema(schema).parquet("data/tmdb1")

In [None]:
# Print the schema obtained when reading with the explicit schema.
new_films_sdf.printSchema()

## Partition

In [None]:
#!wget https://raw.githubusercontent.com/JulienCojan/pyspark_kschool/master/data/competitive-data-science-predict-future-sales/sales_train.csv.gz -P data/competitive-data-science-predict-future-sales/

In [None]:
# Load the gzipped sales CSV; the first line holds the column names.
sales_sdf = spark.read.csv(
    "data/competitive-data-science-predict-future-sales/sales_train.csv.gz",
    header=True,
)

In [None]:
import pyspark.sql.functions as F
# Re-encode `date` (given as dd.MM.yyyy) through a Unix timestamp, then derive
# year, month and day-of-month columns from it.
sales_with_iso_dates = (
    sales_sdf
    .withColumn("date", F.from_unixtime(F.unix_timestamp(F.col("date"), 'dd.MM.yyyy')))
    .withColumn("year", F.year("date"))
    .withColumn("month", F.month("date"))
    .withColumn("day", F.dayofmonth("date"))
)

In [None]:
# Display the first rows, including the derived year/month/day columns.
sales_with_iso_dates.show()

In [None]:
# Write the sales as Parquet partitioned by year/month/day, replacing any
# earlier output.
sales_with_iso_dates.write.parquet(
    "data/tmdb2",
    mode="overwrite",
    partitionBy=["year", "month", "day"],
)

In [None]:
# List the top level of the partitioned output directory.
!ls data/tmdb2

In [None]:
# List the files stored for one single day (2013-03-19).
!ls data/tmdb2/year=2013/month=3/day=19

In [None]:
# Read only the year=2013 sub-directory of the partitioned output.
# NOTE(review): despite the `films_` name (and the `tmdb2` path), this is the
# sales data written from sales_with_iso_dates; the assignment also replaces the
# films_sdf2 defined earlier in the notebook.
# NOTE(review): no basePath is set here, so `year` is presumably not exposed as
# a column in this DataFrame; confirm against the Spark partition discovery docs.
films_sdf2 = spark.read.parquet("data/tmdb2/year=2013")

In [None]:
# Display the first rows read from the year=2013 partition.
films_sdf2.show()

In [None]:
# Read two month directories of 2014 while declaring the dataset root as
# basePath; `year` and `month` are used as columns by the following cell.
films_sdf3 = (
    spark.read
    .option("basePath", "data/tmdb2")
    .parquet(
        "data/tmdb2/year=2014/month=3",
        "data/tmdb2/year=2014/month=4",
    )
)

In [None]:
# Count the rows loaded for each (year, month) pair.
films_sdf3.groupBy("year", "month").count().show()

## Handling NA

In [None]:
from pyspark.sql import Row

In [None]:
# Three sample rows: a regular number, a missing value (None) and a
# floating-point NaN.
df1 = spark.createDataFrame(
    data=[
        Row(id=1, value=15.0),
        Row(id=2, value=None),
        Row(id=3, value=float('NaN')),
    ],
)
df1.show()

In [None]:
# Second sample frame: a NaN, a regular number and a missing value (None).
df2 = spark.createDataFrame(
    data=[
        Row(id=1, value=float('NaN')),
        Row(id=2, value=42.0),
        Row(id=3, value=None),
    ],
)
df2.show()

In [None]:
# Join the two frames with the ordinary equality operator on `value`.
df1.join(df2, on=(df1["value"] == df2["value"])).show()

In [None]:
# Same join, but comparing `value` with the null-safe equality operator.
df1.join(df2, on=df1["value"].eqNullSafe(df2["value"])).show()

In [None]:
# Null-safe comparison of each df2 value against None, NaN and 42.0.
df2.select(
    df2.value.eqNullSafe(None),
    df2.value.eqNullSafe(float('NaN')),
    df2.value.eqNullSafe(42.0),
).show()

In [None]:
# Collect both Spark frames to the driver as pandas DataFrames.
pdf1, pdf2 = df1.toPandas(), df2.toPandas()

In [None]:
# Show the pandas dtype of each column after the conversion.
pdf2.dtypes

In [None]:
# Inner-join the two pandas frames on their `value` columns.
# merge() is used rather than DataFrame.join(on=...), because join matches the
# caller's column against the *index* of the other frame, not its `value`
# column. pandas pairs NaN keys with each other in a merge. The left frame's
# overlapping `id` column gets the "_1" suffix.
pdf1.merge(pdf2, on="value", how="inner", suffixes=("_1", ""))