## Schemas

<https://www.kaggle.com/c/tmdb-box-office-prediction/data>

In [None]:
import pandas as pd
import pyspark.sql.functions as F 

from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("PySparkShell") \
    .getOrCreate()

In [4]:
spark

## Create Dataframes from RDD

In [None]:
cities_rdd = sc.parallelize([
    ("MAD", "Madrid", "ES", 40.4165, -3.70256),
    ("BCN", "Barcelona", "ES", 41.297078, 2.078464),
    ("PAR", "Paris", "FR", 48.85341, 2.3488),
    ("ROM", "Rome", "IT", 41.89193, 12.51133)])

cities_df = cities_rdd.toDF([
    "city_code","city_name","country_code","latitude","longitude"])

cities_df.show()

In [None]:
cities_df = spark.createDataFrame([
    ("MAD", "Madrid", "ES", 40.4165, -3.70256),
    ("BCN", "Barcelona", "ES", 41.297078, 2.078464),
    ("PAR", "Paris", "FR", 48.85341, 2.3488),
    ("ROM", "Rome", "IT", 41.89193, 12.51133)],
    ["city_code","city_name","country_code","latitude","longitude"])

cities_df.show()

In [None]:
cities_df.printSchema()

In [None]:
cities_df.filter(cities_df.country_code=="ES").show()

In [None]:
cities_df.select('city_code','country_code').show()

In [None]:
import pyspark.sql.functions as F 
agg_df = cities_df\
    .groupBy('country_code')\
    .agg(F.count(F.col('city_code')))\
    .orderBy(F.col('country_code'))
agg_df.show()

In [None]:
agg_df.explain()

In [None]:
print(agg_df.rdd.toDebugString().decode())

In [None]:
cities_df.describe().show()

### Array and Struct types

In [None]:
countries = sc.parallelize([
    ("ES", ["Spanish","Catalan","Basque"], ("MAD", "Madrid")),
    ("FR", ["French","Alsacien","Breton"], ("PAR", "Paris")),
    ("IT", ["Italian","French"], ("ROM", "Rome")),
    ("US", ["English", "Spanish"], ("WAS", "Washington"))]).toDF(["country_code", "languages", "capital"])\
    .withColumn('capital', F.struct(F.col("capital._1").alias("code"), F.col("capital._2").alias("name")))
countries.show(truncate=False)

In [None]:
countries.printSchema()

In [None]:
countries.select('country_code',F.col('languages').getItem(0),'capital.name').show()

## Read csv data

In [None]:
#!wget https://raw.githubusercontent.com/JulienCojan/pyspark_kschool/master/data/tmdb-box-office-prediction/train.csv -P data/tmdb-box-office-prediction

In [1]:
films_sdf = spark\
    .read\
    .csv("data/tmdb-box-office-prediction/train.csv", header=True)

In [2]:
films_sdf.printSchema()

root
 |-- id: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- crew: string (nullable = true)
 |-- revenue: string (nullable = true)



In [3]:
films_sdf.show()

+---+---------------------+---------+--------------------+--------------------+---------+-----------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+
| id|belongs_to_collection|   budget|              genres|            homepage|  imdb_id|original_language|      original_title|            overview|          popularity|         poster_path|production_companies|production_countries|        release_date|             runtime|    spoken_languages|  status|             tagline|               title|            Keywords|                cast|                crew|             revenue|
+---+---------------------+---------+--------------------+--------------------+---------+-----------------+--------------------+--------

In [4]:
print(films_sdf.select("cast").limit(10).toPandas().iloc[1]["cast"])

"[{'cast_id': 1, 'character': 'Mia Thermopolis', 'credit_id': '52fe43fe9251416c7502561f', 'gender': 1, 'id': 1813, 'name': 'Anne Hathaway', 'order': 0, 'profile_path': '/jUMOKwSUBnTcMeN1HfhutiY49Ad.jpg'}, {'cast_id': 2, 'character': 'Queen Clarisse Renaldi', 'credit_id': '52fe43fe9251416c75025623', 'gender': 1, 'id': 5823, 'name': 'Julie Andrews', 'order': 1, 'profile_path': '/6t61jkmfSA6nbYRCKR9s97CgUN6.jpg'}, {'cast_id': 3, 'character': 'Joe', 'credit_id': '52fe43fe9251416c75025627', 'gender': 2, 'id': 1210, 'name': 'H√©ctor Elizondo', 'order': 2, 'profile_path': '/48UNfVFZVr0jyMIlLPhzm8IIM7f.jpg'}, {'cast_id': 4, 'character': 'Viscount Mabrey', 'credit_id': '52fe43fe9251416c7502562b', 'gender': 2, 'id': 655, 'name': 'John Rhys-Davies', 'order': 3, 'profile_path': '/zZ67PuoFfik9QlZyfaEsFBC1yVJ.jpg'}, {'cast_id': 5, 'character': 'Lilly Moscovitz', 'credit_id': '52fe43fe9251416c7502562f', 'gender': 1, 'id': 33656, 'name': 'Heather Matarazzo', 'order': 4, 'profile_path': '/xcwR8aPuSkUCD

In [5]:
from pyspark.sql.types import IntegerType
import pyspark.sql.functions as F

films_sdf = films_sdf\
    .withColumn("id", films_sdf.id.cast(IntegerType()))
    
    
films_sdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- cast: string (nullable = true)
 |-- crew: string (nullable = true)
 |-- revenue: string (nullable = true)



In [6]:
films_sdf.select("genres").show(5, truncate=False)

+------------------------------------------------------------------------------------------------------------------------------+
|genres                                                                                                                        |
+------------------------------------------------------------------------------------------------------------------------------+
|[{'id': 35, 'name': 'Comedy'}]                                                                                                |
|[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'name': 'Drama'}, {'id': 10751, 'name': 'Family'}, {'id': 10749, 'name': 'Romance'}]|
|[{'id': 18, 'name': 'Drama'}]                                                                                                 |
|[{'id': 53, 'name': 'Thriller'}, {'id': 18, 'name': 'Drama'}]                                                                 |
|[{'id': 28, 'name': 'Action'}, {'id': 53, 'name': 'Thriller'}]                                  

In [7]:
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType

schema = ArrayType(
    StructType([
        StructField("id", IntegerType()),
        StructField("name", StringType())
    ])
)

films_sdf = films_sdf\
    .withColumn("genres", F.from_json(F.col("genres"), schema))

In [8]:
films_sdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- 

In [9]:
films_sdf.select("genres").show(5, truncate=False)

+--------------------------------------------------------------+
|genres                                                        |
+--------------------------------------------------------------+
|[[35, Comedy]]                                                |
|[[35, Comedy], [18, Drama], [10751, Family], [10749, Romance]]|
|[[18, Drama]]                                                 |
|[[53, Thriller], [18, Drama]]                                 |
|[[28, Action], [53, Thriller]]                                |
+--------------------------------------------------------------+
only showing top 5 rows



In [10]:
films_sdf.select("genres.name").show(5, truncate=False)

+--------------------------------+
|name                            |
+--------------------------------+
|[Comedy]                        |
|[Comedy, Drama, Family, Romance]|
|[Drama]                         |
|[Thriller, Drama]               |
|[Action, Thriller]              |
+--------------------------------+
only showing top 5 rows



In [None]:
films_sdf.select("genres.name").printSchema()

## String manipulation

In [None]:
films_sdf2 = spark.read.csv("data/tmdb-box-office-prediction/train.csv", header=True)

In [None]:
films_sdf2.select("genres").show(truncate=False)

In [None]:
genres_col = films_sdf2\
    .select(
    "id", 
    F.col("genres").substr(F.lit(2), F.length(F.col("genres"))-2).alias("genres"))

genres_col.show(3, truncate=False)

In [None]:
genres_col\
     .withColumn("genres_array", F.split(F.col("genres"), '(?<=}), '))\
     .head(2)

In [None]:
genre_sdf = genres_col\
    .select("id", F.explode(F.split(F.col("genres"), '(?<=}), ')).alias("genre"))

genre_sdf.show(truncate=False)

In [None]:
genre_sdf.select(
    F.col('id'),
    F.regexp_extract(F.col("genre"), "(?<='name':\ ')[A-Z][a-z]+", 0).alias("genre_name")
    ).groupBy("genre_name")\
    .count().orderBy(F.desc("count")).show()

## Saving data

In [11]:
films_sdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- 

In [17]:
films_sdf.repartition(1).write.mode('overwrite').parquet("data/tmdb1")
#films_sdf.write.mode('overwrite').parquet("data/tmdb1")

In [18]:
!ls data/tmdb1

_SUCCESS  part-00000-4e153c39-0824-4688-a72b-c9fdd2162008-c000.snappy.parquet


In [15]:
new_films_sdf = spark.read.parquet("data/tmdb1")

In [16]:
new_films_sdf.printSchema()

root
 |-- id: integer (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- id: integer (nullable = true)
 |    |    |-- name: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- Keywords: string (nullable = true)
 |-- 

## Schema

In [None]:
from pyspark.sql.types import StructField, StructType, IntegerType, LongType

schema = \
StructType([
    StructField("id", IntegerType(), False),
    StructField("genres", ArrayType(
        StructType([
            StructField("id", IntegerType()),
            StructField("name", StringType())
        ])))
    ])

In [None]:
new_films_sdf = spark.read.schema(schema=schema).parquet("data/tmdb1")

In [None]:
new_films_sdf.printSchema()

## Partition

In [None]:
#!wget https://raw.githubusercontent.com/JulienCojan/pyspark_kschool/master/data/competitive-data-science-predict-future-sales/sales_train.csv.gz -P data/competitive-data-science-predict-future-sales/

In [15]:
#sales_sdf = spark.read.option("header", "true").csv("data/competitive-data-science-predict-future-sales/sales_train.csv.gz")
sales_sdf = spark.read.option("header", "true").csv("data/competitive-data-science-predict-future-sales/sample.csv")

In [16]:
import pyspark.sql.functions as F
sales_with_iso_dates = sales_sdf\
    .withColumn("date", F.from_unixtime(F.unix_timestamp(F.col("date"), 'dd.MM.yyyy')))\
    .withColumn("year", F.year("date"))\
    .withColumn("month", F.month("date"))\
    .withColumn("day", F.dayofmonth("date"))

In [17]:
sales_with_iso_dates.show()

+-------------------+--------------+-------+-------+----------+------------+----+-----+---+
|               date|date_block_num|shop_id|item_id|item_price|item_cnt_day|year|month|day|
+-------------------+--------------+-------+-------+----------+------------+----+-----+---+
|2013-01-02 00:00:00|             0|     59|  22154|     999.0|         1.0|2013|    1|  2|
|2013-01-03 00:00:00|             0|     25|   2552|     899.0|         1.0|2013|    1|  3|
|2013-01-05 00:00:00|             0|     25|   2552|     899.0|        -1.0|2013|    1|  5|
|2013-01-06 00:00:00|             0|     25|   2554|   1709.05|         1.0|2013|    1|  6|
|2013-01-15 00:00:00|             0|     25|   2555|    1099.0|         1.0|2013|    1| 15|
|2013-01-10 00:00:00|             0|     25|   2564|     349.0|         1.0|2013|    1| 10|
|2013-01-02 00:00:00|             0|     25|   2565|     549.0|         1.0|2013|    1|  2|
|2013-01-04 00:00:00|             0|     25|   2572|     239.0|         1.0|2013

In [18]:
#sales_with_iso_dates.write.partitionBy("year", "month", "day").mode('overwrite').parquet("data/tmdb2")
sales_with_iso_dates.write.partitionBy("year", "month", "day").mode('overwrite').csv("data/sales")

In [None]:
!ls data/tmdb2

In [None]:
!ls data/tmdb2/year=2013/month=3/day=19

In [20]:
#films_sdf2 = spark.read.parquet("data/tmdb2/year=2013")
films_sdf2 = spark.read.csv("data/sales/year=2013")

In [21]:
films_sdf2.show()

+-------------------+---+---+-----+------+---+-----+---+
|                _c0|_c1|_c2|  _c3|   _c4|_c5|month|day|
+-------------------+---+---+-----+------+---+-----+---+
|2013-01-02 00:00:00|  0| 59|22154| 999.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2565| 549.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2546| 299.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2715| 899.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2716| 149.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2719|2699.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2480|  58.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2731| 599.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2833| 599.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2808| 999.0|2.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2090| 449.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2389| 999.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2441|1199.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2678| 999.0|1.0|    1|  2|
|2013-01-02 00:00:00|  0| 25| 2

In [8]:
films_sdf3 = spark\
    .read\
    .option("basePath", "data/tmdb2")\
    .parquet(
        "data/tmdb2/year=2014/month=3",
        "data/tmdb2/year=2014/month=4"
    )

In [13]:
films_sdf3 = spark\
    .read\
    .option("basePath", "data/tmdb2")\
    .parquet(
        "data/tmdb2/year=2014/month=3"
    )

In [14]:
films_sdf3.groupBy("year", "month").count().show()

+----+-----+-----+
|year|month|count|
+----+-----+-----+
|2014|    3|92733|
+----+-----+-----+



## Handling NA

In [None]:
from pyspark.sql import Row

In [None]:
df1 = spark.createDataFrame([
    Row(id=1, value=15.0),
    Row(id=2, value=None),
    Row(id=3, value=float('NaN')),
])
df1.show()

In [None]:
df2 = spark.createDataFrame([
    Row(id=1, value=float('NaN')),
    Row(id=2, value=42.0),
    Row(id=3, value=None)
])
df2.show()

In [None]:
df1.join(df2, df1["value"] == df2["value"]).show()

In [None]:
df1.join(df2, df1["value"].eqNullSafe(df2["value"])).show()

In [None]:
df2.select(
    df2['value'].eqNullSafe(None),
    df2['value'].eqNullSafe(float('NaN')),
    df2['value'].eqNullSafe(42.0)
).show()

In [None]:
pdf1 = df1.toPandas()
pdf2 = df2.toPandas()

In [None]:
pdf2.dtypes

In [None]:
pdf1.join(pdf2, on="value", lsuffix="_1", how='inner')