In [0]:
# imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
import pyspark.sql.types as t

In [0]:
spark = SparkSession.builder.master("yarn").appName("MovieAnalysis").getOrCreate()

In [0]:
df_lang = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/spoken_languages.csv")
df_cast = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/cast-2.csv")
df_genre = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/genres-1.csv")
df_movie = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/movies-2.csv")
df_crew = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/crew-1.csv")
df_recom = spark.read.format("csv").option("header", "true").load("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/recommendations.csv")

## Gruppenarbeit

<b> Aufgabe 1: </b> \
*Which movie genres have the most movies with a runtime over 120 minutes and how many movies? Please list the top three movie genres.*

In [0]:
# Aufgabe 1
(df_genre
    .join(df_movie, on=df_genre.id==df_movie.id.astype(t.FloatType()))
    .where(f.col("runtime")>120)
    .groupBy("name")
    .count()
    .orderBy("count", ascending=False)
    .limit(3)
    .display())

name,count
Drama,459
Action,264
Adventure,203


=>

<b> Aufgabe 2: </b> \
*In how many movies did the actor Johnny Depp take part in as an actor?*

In [0]:
(df_cast
    .where(f.col("name")=="Johnny Depp")
    .count())

Out[6]: 40

=> In 40 movies the actor Johnny Depp did take part as an actor

*In how many of those did he also act as a producer?*

In [0]:
(df_cast
     .join(df_crew, on=df_cast.id==df_crew.id)
     .where(
         (df_cast.name=="Johnny Depp")
          & (df_crew.name=="Johnny Depp")
          & ((df_crew.job=="Producer") | (df_crew.job=="Executive Producer")))
     .count())

Out[7]: 3

=> Johnny Depp acted in 3 movies as a producer.

<b> Aufgabe 3: </b> \
*List the names and the revenue of the ten movies with the most revenue which were released before 2015.*

In [0]:
(df_movie
    .where(
        f.year(f.to_date("release_date", "yyyy-MM-dd"))<2015)
     .orderBy("revenue", ascending=False)
     .select("title", "revenue")
     .display()
)

title,revenue
Spice World,99988373
Godzilla 2000,9998656
Project X,99975956
Conan the Destroyer,99969399
55 Days at Peking,9980616
Kissing Jessica Stein,9972763
What the #$*! Do We (K)now!?,9966568
Very Bad Things,9946690
The Lucky One,99388630
Street Fighter,99382450


=>

*What are the ten financially most succesful movies when comparing the revenue to the budget?*

In [0]:
(df_movie
    .select("title", f.col("revenue")/f.col("budget"), "revenue", "budget")
    .orderBy("(revenue / budget)", ascending=False)
     .limit(10)
    .display())

title,(revenue / budget),revenue,budget
House of Flying Daggers,128791.59223300972,92858738,721
The Odd Life of Timothy Green,11654.044933722758,51872154,4451
Shaolin Soccer,11611.181423139598,42752370,3682
Chernobyl Diaries,7810.454506252695,18112444,2319
Girl with a Pearl Earring,7544.356371490281,31437333,4167
Crocodile Dundee II,6990.519738569719,239586083,34273
Garfield: A Tail of Two Kitties,6833.255353009259,141694383,20736
Sympathy for Lady Vengeance,5636.224644549763,23784868,4220
Scooby-Doo 2: Monsters Unleashed,5249.511611765046,181512363,34577
The Blair Witch Project,4872.991316306483,248035258,50900


=>

<b> Aufgabe 4: </b> \
*What is the movie genre that has a median rating of at least 3 (over all movies with at least ten recommendations) with the lowest average production budget considering all movies?*

In [0]:
(df_recom
     .groupBy("movie_id")
     .agg(f.count("vote").alias("n_ratings")
     .where(f.col("n_ratings")>10)
     .join(df_genre, on=df_recom.movie_id==df_genre.id.astype(t.IntegerType()))
     .join(df_movie, on=df_recom.movie_id==df_movie.id)
     .groupBy("name")
     .agg(f.mean("budget").alias("mean_budget"))
     .orderBy("mean_budget")
     .display())

name,mean_budget
TV Movie,257661.0
Foreign,522717.2727272727
Documentary,3167593.3076923075
Horror,16006915.069252076
Music,16512565.98780488
Drama,22956108.41383899
Romance,24189479.5
Comedy,25661861.15276631
Crime,27143488.51777778
Mystery,28885104.17142857


## Individual-Aufgaben

<b> Tom Zehle: </b>\
Who is the actor (regardless of gender) who played in the most movies where languages other than English were spoken?\
Could this analysis also have been done with MapReduce? What are the benefits and drawbacks of using MapReduce in comparison to using Spark?

In [0]:
non_english_movies = df_lang.where(df_lang.name!="English").select("id").distinct()
(df_cast
     .join(non_english_movies, on=df_cast.id==non_english_movies.id)
     .groupBy("name")
     .count()
     .orderBy("count", ascending=False)
     .display())