In [None]:
# imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import StringType, IntegerType, FloatType, DateType
import pyspark.sql.types as t
import pandas as pd

In [None]:
# creating a spark session
spark = SparkSession.builder.master("yarn").appName("MovieAnalysis").getOrCreate()

In [None]:
# schemes to be defined

lang_schema = StructType([StructField('name', StringType(), True),
                          StructField('id', IntegerType(), True)])

cast_schema = StructType([StructField('name', StringType(), True),
                          StructField('character', StringType(), True),
                          StructField('gender', IntegerType(), True),
                          StructField('id', IntegerType(), True)])

genre_schema = StructType([StructField('name', StringType(), True),
                          StructField('id', IntegerType(), True)])

movie_schema = StructType([StructField('index', IntegerType(), True),
                          StructField('title', IntegerType(), True),
                          StructField('release_date', DateType(), True),
                          StructField('runtime', FloatType(), True),
                          StructField('revenue', IntegerType(), True), #Longtype
                          StructField('budget', IntegerType(), True),
                          StructField('popularity', FloatType(), True),
                          StructField('id', IntegerType(), True)])

crew_schema = StructType([StructField('name', StringType(), True),
                          StructField('job', StringType(), True),
                          StructField('gender', IntegerType(), True),
                          StructField('id', IntegerType(), True)])

recom_schema = StructType([StructField('index', IntegerType(), True),
                          StructField('movie_id', IntegerType(), True),
                          StructField('user_id', IntegerType(), True),
                          StructField('vote', IntegerType(), True)])

In [None]:
# reading the data from the stored parquet files in dataframes
pf_lang = spark.read.parquet("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/spoken_languages.parquet")
pf_cast = spark.read.parquet("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/cast.parquet")
pf_genre = spark.read.parquet("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/genres.parquet")
pf_movie = spark.read.parquet("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/movies.parquet")
pf_crew = spark.read.parquet("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/crew.parquet")
pf_recom = spark.read.parquet("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/recom.parquet")
pf_pcompanies = spark.read.parquet("dbfs:/FileStore/shared_uploads/ibele@stud.dhbw-ravensburg.de/production_companies.parquet")

## Group Task

<b> Task 1: </b> \
*Which movie genres have the most movies with a runtime over 120 minutes and how many movies? Please list the top three movie genres.*

In [None]:
(pf_genre
    .join(pf_movie, on=pf_genre.id==pf_movie.id.astype(t.FloatType()))
    .where(f.col("runtime")>120)
    .groupBy("name")
    .count()
    .orderBy("count", ascending=False)
    .limit(3)
    .display())

name,count
Drama,672
Action,298
Thriller,246


<b> Answer: </b> \
The movie genres "Drama", "Action" and "Thriller" have the most movies with a runtime over 120 minutes.

<b> Task 2: </b> \
*In how many movies did the actor Johnny Depp take part in as an actor?*

In [None]:
(pf_cast
    .where(f.col("name")=="Johnny Depp")
    .count())

Out[14]: 40

=> The actor Johnny Depp did take part in 40 movies as an actor.

*In how many of those did he also act as a producer?*

In [None]:
(pf_cast
     .join(pf_crew, on=pf_cast.id==pf_crew.id)
     .where(
         (pf_cast.name=="Johnny Depp")
          & (pf_crew.name=="Johnny Depp")
          & ((pf_crew.job=="Producer") | (pf_crew.job=="Executive Producer")))
     .count())

Out[15]: 3

<b> Answer: </b> \
Johnny Depp acted in 3 movies as a Producer or Executive Producer.

<b> Task 3: </b> \
*List the names and the revenue of the ten movies with the most revenue which were released before 2015.*

In [None]:
(pf_movie
    .where(
        f.year(f.to_date("release_date", "yyyy-MM-dd"))<2015)
     .withColumn("revenue", pf_movie.revenue.astype(t.FloatType()))
     .orderBy("revenue", ascending=False)
     .select("title", "revenue")
     .limit(10)
     .display()
)

title,revenue
Avatar,2787966720.0
Titanic,1845063940.0
The Avengers,1519574780.0
Frozen,1274198780.0
Iron Man 3,1215448450.0
Transformers: Dark of the Moon,1123772160.0
The Lord of the Rings: The Return of the King,1118902530.0
Skyfall,1108547970.0
Transformers: Age of Extinction,1091412100.0
The Dark Knight Rises,1084987260.0


<b> Answer: </b> \
The ten movies listed above are the movies, which were released before 2015 and have the most revenue. The one with most revenue is 'Avatar'.

*What are the ten financially most succesful movies when comparing the revenue to the budget?*

In [None]:
(pf_movie
    .select("title", f.col("revenue")/f.col("budget"), "revenue", "budget")
    .orderBy("(revenue / budget)", ascending=False)
    .limit(10)
    .display())

title,(revenue / budget),revenue,budget
House of Flying Daggers,128791.59223300972,92858738,721
The Odd Life of Timothy Green,11654.044933722758,51872154,4451
Shaolin Soccer,11611.181423139598,42752370,3682
Chernobyl Diaries,7810.454506252695,18112444,2319
Girl with a Pearl Earring,7544.356371490281,31437333,4167
Crocodile Dundee II,6990.519738569719,239586083,34273
Garfield: A Tail of Two Kitties,6833.255353009259,141694383,20736
Sympathy for Lady Vengeance,5636.224644549763,23784868,4220
Scooby-Doo 2: Monsters Unleashed,5249.511611765046,181512363,34577
The Blair Witch Project,4872.991316306483,248035258,50900


<b> Answer: </b> \
When comparing the revenue to the budget the ten movies listed above are the financially most succesful. The movie 'House of Flying Daggers' is in this respect the best.

<b> Task 4: </b> \
*What is the movie genre that has a median rating of at least 3 (over all movies with at least ten recommendations) with the lowest average production budget considering all movies?*

In [None]:
(pf_recom
     .groupBy("movie_id")
     .agg(f.mean("vote").alias("mean_votings"), f.count("vote").alias("n_ratings"))
     .where(f.col("n_ratings")>=10)
     .join(pf_genre, on=pf_recom.movie_id==pf_genre.id.astype(t.IntegerType()))
     .join(pf_movie, on=pf_recom.movie_id==pf_movie.id)
     .groupBy("name")
     .agg(f.expr("percentile_approx(mean_votings, 0.5)").alias("median_votings"), f.mean("budget").alias("mean_budget"))
     .where(f.col("median_votings")>=3)
     .orderBy("mean_budget")
     .select("name", "mean_budget")
     .orderBy("mean_budget")
     .display())

name,mean_budget
Foreign,657476.5294117647
Horror,14575251.179190751
Music,15911043.75135135
Drama,20679360.096647803
Western,27074936.19512195
Crime,27850306.416666668
History,29901663.005076144
Mystery,30747275.45689655
Thriller,31969503.72841444
Family,50719601.48732944


<b> Answer: </b> \
The movie genre 'Foreign' is the genre, which has a median rating of at least 3 (which is not displayed because of the specific select statement) with at least 10 recommendations and the lowest average production budget considering all movies.