In [0]:
# imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.types as t

In [0]:
# Obtain the Spark session used by every cell below (YARN master, app name "MovieAnalysis");
# getOrCreate() hands back an already-running session when there is one.
spark = (
    SparkSession.builder
    .master("yarn")
    .appName("MovieAnalysis")
    .getOrCreate()
)

In [0]:
# schemas to be defined
from pyspark.sql.types import *
from pyspark.sql.functions import *

# Explicit schemas for the six CSV inputs. Every field is declared nullable.
# Types are referenced through the `t` alias (pyspark.sql.types) from the imports cell.

# Spoken languages: language name plus an integer id.
lang_schema = t.StructType([
    t.StructField('name', t.StringType(), True),
    t.StructField('id', t.IntegerType(), True),
])

# Cast: actor name, played character, gender code and an integer id.
cast_schema = t.StructType([
    t.StructField('name', t.StringType(), True),
    t.StructField('character', t.StringType(), True),
    t.StructField('gender', t.IntegerType(), True),
    t.StructField('id', t.IntegerType(), True),
])

# Genres: genre name plus an integer id.
genre_schema = t.StructType([
    t.StructField('name', t.StringType(), True),
    t.StructField('id', t.IntegerType(), True),
])

# Movies.
movie_schema = t.StructType([
    t.StructField('index', t.IntegerType(), True),
    # Titles are text (e.g. "Avatar"), so this must be a string column, not an integer.
    t.StructField('title', t.StringType(), True),
    t.StructField('release_date', t.DateType(), True),
    t.StructField('runtime', t.FloatType(), True),
    # Revenue values such as 2787966824 exceed the 32-bit IntegerType maximum
    # (2147483647); use 64-bit LongType for both money columns.
    t.StructField('revenue', t.LongType(), True),
    t.StructField('budget', t.LongType(), True),
    t.StructField('popularity', t.FloatType(), True),
    t.StructField('id', t.IntegerType(), True),
])

# Crew: person name, job title, gender code and an integer id.
# NOTE(review): the crew ids shown in the query output look like "32856.0"; an
# IntegerType column may fail to parse that text if this schema is applied when
# reading the CSV - confirm against the raw file before using it.
crew_schema = t.StructType([
    t.StructField('name', t.StringType(), True),
    t.StructField('job', t.StringType(), True),
    t.StructField('gender', t.IntegerType(), True),
    t.StructField('id', t.IntegerType(), True),
])

# Recommendations: one vote per (movie_id, user_id) row, preceded by a row index.
recom_schema = t.StructType([
    t.StructField('index', t.IntegerType(), True),
    t.StructField('movie_id', t.IntegerType(), True),
    t.StructField('user_id', t.IntegerType(), True),
    t.StructField('vote', t.IntegerType(), True),
])

In [0]:
# Load each CSV export into its own DataFrame. The first line of every file is used as
# the header; no schema is supplied, so Spark reads every column as a string.
# TODO: consider passing the explicit schemas when reading the CSV files.
def _load_csv(path):
    """Read a single CSV file, taking the column names from its first line."""
    return spark.read.format("csv").option("header", "true").load(path)


df_lang = _load_csv("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/spoken_languages-1.csv")
df_cast = _load_csv("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/cast-3.csv")
df_genre = _load_csv("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/genres-2.csv")
df_movie = _load_csv("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/movies-3.csv")
df_crew = _load_csv("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/crew-2.csv")
df_recom = _load_csv("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/recom.csv")

In [0]:
# The first column of the movie CSV (read as "_c0") only carries the row index:
# remove it, then print the remaining columns and their types.
movie_index_column = "_c0"
df_movie = df_movie.drop(movie_index_column)
df_movie.printSchema()

root
 |-- title: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- id: string (nullable = true)



In [0]:
# Render the movie DataFrame as a table to check the loaded values by eye.
df_movie.display()

title,release_date,runtime,revenue,budget,popularity,id
Avatar,2009-12-10,162.0,2787966824,236972821,150.437577,19995
Pirates of the Caribbean: At World's End,2007-05-19,169.0,961022070,299983825,139.082615,285
Spectre,2015-10-26,148.0,880719915,244983844,107.376788,206647
The Dark Knight Rises,2012-07-16,165.0,1084987218,249951643,112.31295,49026
John Carter,2012-03-07,132.0,284133596,260013038,43.926995,49529
Spider-Man 3,2007-05-01,139.0,890863843,258008881,115.699814,559
Tangled,2010-11-24,100.0,591814703,260023415,48.681969,38757
Avengers: Age of Ultron,2015-04-22,141.0,1405364340,279979533,134.279229,99861
Harry Potter and the Half-Blood Prince,2009-07-07,153.0,933974783,250021729,98.885637,767
Batman v Superman: Dawn of Justice,2016-03-23,151.0,873305820,249958606,155.790452,209112


In [0]:
# The first column of the recom CSV (read as "_c0") only carries the row index:
# remove it, then print the remaining columns and their types.
recom_index_column = "_c0"
df_recom = df_recom.drop(recom_index_column)
df_recom.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- vote: string (nullable = true)



In [0]:
# Persist every DataFrame as Parquet; "overwrite" replaces whatever an earlier run
# left at the same path.
parquet_targets = [
    (df_lang, "output/lang.parquet"),
    (df_cast, "output/cast.parquet"),
    (df_genre, "output/genre.parquet"),
    (df_movie, "output/movie.parquet"),
    (df_crew, "output/crew.parquet"),
    (df_recom, "output/recom.parquet"),
]
for frame, target in parquet_targets:
    frame.write.format("parquet").mode("overwrite").save(target)

In [0]:
# Load the Parquet copies back into DataFrames (pf_* = Parquet-backed frames).
# The paths are the same relative "output/..." locations that the write cell saves to.
# The earlier absolute "/output/..." form only pointed at the same data when the
# filesystem's working directory happened to be the root, so a run on a cluster with a
# per-user working directory would read from a different (possibly missing) location.
pf_lang = spark.read.parquet("output/lang.parquet")
pf_cast = spark.read.parquet("output/cast.parquet")
pf_genre = spark.read.parquet("output/genre.parquet")
pf_movie = spark.read.parquet("output/movie.parquet")
pf_crew = spark.read.parquet("output/crew.parquet")
pf_recom = spark.read.parquet("output/recom.parquet")

<b> Question: </b>\
How many movies were written by a female writer?\
Explain what data storage structure you used to store the information and why. When storing the information how can you speed up the information retrieval if you know you are interested in looking at the gender of the writer? Why does it speed up the information retrieval when you store the data differently?

In [0]:
# Number of distinct movies that have at least one crew entry with job "Writer" and
# gender code 1 (taken to mean female - an assumption about the data set's encoding).
(pf_movie
     # crew ids are cast to integer before being compared with the movie id
     .join(pf_crew, on=pf_movie.id==pf_crew.id.astype(t.IntegerType()))
     .where(
         (pf_crew.job=="Writer")
         & (pf_crew.gender==1))
     # The join yields one row per (movie, writer) pair, so a movie with several
     # matching writers would be counted more than once. De-duplicate on the movie id
     # to count movies rather than writer credits; no sort is needed for a count.
     .select(pf_movie.id)
     .distinct()
     .count())

Out[36]: 151

In [0]:
# List every (movie, writer) pair whose crew entry has job "Writer" and gender code 1,
# sorted by the crew member's name.
same_movie = pf_movie.id == pf_crew.id.cast(t.IntegerType())
is_female_writer = (pf_crew.job == "Writer") & (pf_crew.gender == 1)

(pf_movie
     .join(pf_crew, on=same_movie)
     .filter(is_female_writer)
     .sort("name")
     .display())

title,release_date,runtime,revenue,budget,popularity,id,name,job,gender,id.1
Valentine's Day,2010-02-10,125.0,216533382,52017397,25.41929,32856,Abby Kohn,Writer,1,32856.0
He's Just Not That Into You,2009-02-06,129.0,177253008,39970675,26.253357,10184,Abby Kohn,Writer,1,10184.0
After.Life,2010-04-09,104.0,3651621,4478061,14.570359,36419,Agnieszka WojtowiczVosloo,Writer,1,36419.0
Beloved,1998-10-16,172.0,8547,48177,1.453765,39437,Akosua Busia,Writer,1,39437.0
Goddess of Love,2015-08-31,96.0,-31219,-20384,1.273773,347764,Alexis Kendra,Writer,1,347764.0
Saving Face,2004-09-12,91.0,-44222,-5633,2.487255,19316,Alice Wu,Writer,1,19316.0
Four Rooms,1995-12-09,98.0,4253036,4047190,22.87623,5,Allison Anders,Writer,1,5.0
Sugar Town,1999-09-17,92.0,-2946,-46804,0.375423,142132,Allison Anders,Writer,1,142132.0
Ask Me Anything,2014-04-19,100.0,-7182,941597,8.976128,271185,Allison Burnett,Writer,1,271185.0
The Betrayed,2008-09-27,98.0,6148,3539607,1.348114,20055,Amanda Gusack,Writer,1,20055.0


<b> Answer: </b>\
Assuming that '1' in the gender column stands for female (2 for male, and 0 where the gender is not specified), 151 movies were written by a female writer.