In [0]:
# imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.types as t

In [0]:
spark = SparkSession.builder.master("yarn").appName("MovieAnalysis").getOrCreate()

In [0]:
# schemes to be defined
from pyspark.sql.types import *
from pyspark.sql.functions import *

lang_schema = StructType([StructField('name', StringType(), True),
                          StructField('id', IntegerType(), True)])

cast_schema = StructType([StructField('name', StringType(), True),
                          StructField('character', StringType(), True),
                          StructField('gender', IntegerType(), True),
                          StructField('id', IntegerType(), True)])

genre_schema = StructType([StructField('name', StringType(), True),
                          StructField('id', IntegerType(), True)])

movie_schema = StructType([StructField('index', IntegerType(), True),
                          StructField('title', IntegerType(), True),
                          StructField('release_date', DateType(), True),
                          StructField('runtime', FloatType(), True),
                          StructField('revenue', IntegerType(), True),
                          StructField('budget', IntegerType(), True),
                          StructField('popularity', FloatType(), True),
                          StructField('id', IntegerType(), True)])

crew_schema = StructType([StructField('name', StringType(), True),
                          StructField('job', StringType(), True),
                          StructField('gender', IntegerType(), True),
                          StructField('id', IntegerType(), True)])

recom_schema = StructType([StructField('index', IntegerType(), True),
                          StructField('movie_id', IntegerType(), True),
                          StructField('user_id', IntegerType(), True),
                          StructField('vote', IntegerType(), True)])

In [0]:
# maybe add schemes when reading the csv files
df_lang = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/spoken_languages-1.csv")
df_cast = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/cast-3.csv")
df_genre = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/genres-2.csv")
df_movie = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/movies-3.csv")
df_crew = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/crew-2.csv")
df_recom = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/recom.csv")

In [0]:
df_movie.display()

_c0,title,release_date,runtime,revenue,budget,popularity,id
0,Avatar,2009-12-10,162.0,2787966824,236972821,150.437577,19995
1,Pirates of the Caribbean: At World's End,2007-05-19,169.0,961022070,299983825,139.082615,285
2,Spectre,2015-10-26,148.0,880719915,244983844,107.376788,206647
3,The Dark Knight Rises,2012-07-16,165.0,1084987218,249951643,112.31295,49026
4,John Carter,2012-03-07,132.0,284133596,260013038,43.926995,49529
5,Spider-Man 3,2007-05-01,139.0,890863843,258008881,115.699814,559
6,Tangled,2010-11-24,100.0,591814703,260023415,48.681969,38757
7,Avengers: Age of Ultron,2015-04-22,141.0,1405364340,279979533,134.279229,99861
8,Harry Potter and the Half-Blood Prince,2009-07-07,153.0,933974783,250021729,98.885637,767
9,Batman v Superman: Dawn of Justice,2016-03-23,151.0,873305820,249958606,155.790452,209112


In [0]:
# dropping first column of movie since it's only the index and printing the schema
df_movie = df_movie.drop("_c0")
df_movie.printSchema()

root
 |-- title: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- runtime: string (nullable = true)
 |-- revenue: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- id: string (nullable = true)



In [0]:
df_recom.display()

_c0,movie_id,user_id,vote
0,508,77338,5
1,133694,83675,1
2,14048,78439,5
3,8338,66699,5
4,11624,56765,3
5,83860,9167,1
6,284293,88524,3
7,20770,69865,3
8,9383,15759,3
9,6687,55291,2


In [0]:
# dropping first column of recom since it's only the index and printing the schema
df_recom = df_recom.drop("_c0")
df_recom.printSchema()

root
 |-- movie_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- vote: string (nullable = true)



In [0]:
# writing the dataframes to parquet
df_lang.write.format("parquet").mode("overwrite").save("output/lang.parquet")
df_cast.write.format("parquet").mode("overwrite").save("output/cast.parquet")
df_genre.write.format("parquet").mode("overwrite").save("output/genre.parquet")
df_movie.write.format("parquet").mode("overwrite").save("output/movie.parquet")
df_crew.write.format("parquet").mode("overwrite").save("output/crew.parquet")
df_recom.write.format("parquet").mode("overwrite").save("output/recom.parquet")

In [0]:
# reading the data from the parquet files in dataframes
pf_lang = spark.read.parquet("/output/lang.parquet")
pf_cast = spark.read.parquet("/output/cast.parquet")
pf_genre = spark.read.parquet("/output/genre.parquet")
pf_movie = spark.read.parquet("/output/movie.parquet")
pf_crew = spark.read.parquet("/output/crew.parquet")
pf_recom = spark.read.parquet("/output/recom.parquet")

## Group Task

<b> Aufgabe 1: </b> \
*Which movie genres have the most movies with a runtime over 120 minutes and how many movies? Please list the top three movie genres.*

In [0]:
(pf_genre
    .join(pf_movie, on=pf_genre.id==pf_movie.id.astype(t.FloatType()))
    .where(f.col("runtime")>120)
    .groupBy("name")
    .count()
    .orderBy("count", ascending=False)
    .limit(3)
    .display())

name,count
Drama,672
Action,298
Thriller,246


=> The movie genres "Drama", "Action" and "Thriller" have the most movies with a runtime over 120 minutes.

<b> Aufgabe 2: </b> \
*In how many movies did the actor Johnny Depp take part in as an actor?*

In [0]:
(pf_cast
    .where(f.col("name")=="Johnny Depp")
    .count())

Out[12]: 40

=> The actor Johnny Depp did take part in 40 movies as an actor.

*In how many of those did he also act as a producer?*

In [0]:
(pf_cast
     .join(pf_crew, on=pf_cast.id==pf_crew.id)
     .where(
         (pf_cast.name=="Johnny Depp")
          & (pf_crew.name=="Johnny Depp")
          & ((pf_crew.job=="Producer") | (pf_crew.job=="Executive Producer")))
     .count())

Out[13]: 3

=> Johnny Depp acted in 3 movies as a Producer or Executive Producer.

<b> Aufgabe 3: </b> \
*List the names and the revenue of the ten movies with the most revenue which were released before 2015.*

In [0]:
(pf_movie
    .where(
        f.year(f.to_date("release_date", "yyyy-MM-dd"))<2015)
     .withColumn("revenue", pf_movie.revenue.astype(t.FloatType()))
     .orderBy("revenue", ascending=False)
     .select("title", "revenue")
     .limit(10)
     .display()
)

title,revenue
Avatar,2787966720.0
Titanic,1845063940.0
The Avengers,1519574780.0
Frozen,1274198780.0
Iron Man 3,1215448450.0
Transformers: Dark of the Moon,1123772160.0
The Lord of the Rings: The Return of the King,1118902530.0
Skyfall,1108547970.0
Transformers: Age of Extinction,1091412100.0
The Dark Knight Rises,1084987260.0


=> The ten movies listed above are the movies, which were released before 2015 and have the most revenue. The one with most revenue is 'Avatar'.

*What are the ten financially most succesful movies when comparing the revenue to the budget?*

In [0]:
(pf_movie
    .select("title", f.col("revenue")/f.col("budget"), "revenue", "budget")
    .orderBy("(revenue / budget)", ascending=False)
    .limit(10)
    .display())

title,(revenue / budget),revenue,budget
House of Flying Daggers,128791.59223300972,92858738,721
The Odd Life of Timothy Green,11654.044933722758,51872154,4451
Shaolin Soccer,11611.181423139598,42752370,3682
Chernobyl Diaries,7810.454506252695,18112444,2319
Girl with a Pearl Earring,7544.356371490281,31437333,4167
Crocodile Dundee II,6990.519738569719,239586083,34273
Garfield: A Tail of Two Kitties,6833.255353009259,141694383,20736
Sympathy for Lady Vengeance,5636.224644549763,23784868,4220
Scooby-Doo 2: Monsters Unleashed,5249.511611765046,181512363,34577
The Blair Witch Project,4872.991316306483,248035258,50900


=> When comparing the revenue to the budget the ten movies listed above are the financially most succesful. The movie 'House of Flying Daggers' is in this respect the best.

<b> Aufgabe 4: </b> \
*What is the movie genre that has a median rating of at least 3 (over all movies with at least ten recommendations) with the lowest average production budget considering all movies?*

In [0]:
(pf_recom
     .groupBy("movie_id")
     .agg(f.count("vote").alias("n_ratings"))
     .where(f.col("n_ratings")>10)
     .join(pf_genre, on=pf_recom.movie_id==pf_genre.id.astype(t.IntegerType()))
     .join(pf_movie, on=pf_recom.movie_id==pf_movie.id)
     .groupBy("name")
     .agg(f.mean("budget").alias("mean_budget"))
     .orderBy("mean_budget")
     .display())

name,mean_budget
Foreign,657476.5294117647
TV Movie,1163797.75
Documentary,2653978.0454545454
Horror,14575251.179190751
Music,15911043.75135135
Romance,20312007.863534678
Drama,20679360.096647803
Comedy,25313660.76132404
Western,27074936.19512195
Crime,27850306.416666668


=> The movie genre 'Foreign' is the genre, whcih has a median rating of at least 3 with at least 10 recommendations and the lowest average production budget considering all movies.

## Individual Tasks

me:

In [0]:
(pf_movie
     .join(pf_crew, on=pf_movie.id==pf_crew.id.astype(t.IntegerType()))
     .where(
         (pf_crew.job=="Writer") 
         & (pf_crew.gender==1))
     .orderBy("name")
     .count())

Out[42]: 151

<b> Tom Zehle: </b>\
Who is the actor (regardless of gender) who played in the most movies where languages other than English were spoken?\
Could this analysis also have been done with MapReduce? What are the benefits and drawbacks of using MapReduce in comparison to using Spark?

In [0]:
non_english_movies = df_lang.where(df_lang.name!="English").select("id").distinct()
(df_cast
     .join(non_english_movies, on=df_cast.id==non_english_movies.id)
     .groupBy("name")
     .count()
     .orderBy("count", ascending=False)
     .display())