## IMDb Datasets Analysis

* **Collaborators:** Mingrui Zhang
* **ID:** 20985422

In [50]:
# Java, Spark install
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.3/spark-3.2.3-bin-hadoop2.7.tgz
!tar xf spark-3.2.3-bin-hadoop2.7.tgz
!pip install -q findspark

# Download IMDb Datasets and unzip it as tsv files
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.crew.tsv.gz
!wget -q https://datasets.imdbws.com/title.episode.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.ratings.tsv.gz

# nameBasics
!gzip -d name.basics.tsv.gz
!mv name.basics.tsv nameBasics.tsv

# titleAkas
!gzip -d title.akas.tsv.gz
!mv title.akas.tsv titleAkas.tsv

# titleBasics
!gzip -d title.basics.tsv.gz
!mv title.basics.tsv titleBasics.tsv

# titleCrew
!gzip -d title.crew.tsv.gz
!mv title.crew.tsv titleCrew.tsv

# titleEpisode
!gzip -d title.episode.tsv.gz
!mv title.episode.tsv titleEpisode.tsv

# titlePrincipals
!gzip -d title.principals.tsv.gz
!mv title.principals.tsv titlePrincipals.tsv

# titleRatings
!gzip -d title.ratings.tsv.gz
!mv title.ratings.tsv titleRatings.tsv

In [52]:
# Download boxOffice csv
!wget -q https://raw.githubusercontent.com/MingruiZhangW/IMDbBigData/d47ceeecd1436f7d8936aef176514efe316bbf7b/boxoffice.csv

In [47]:
import os
import random

# Tell the tooling where the JDK and the unpacked Spark distribution live.
# Both paths match what the install cell puts in place on a Colab machine.
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.2.3-bin-hadoop2.7",
})

import findspark
findspark.init()

# pyspark is deliberately imported only after findspark.init() has run.
from pyspark.sql import SparkSession

# Local session with two worker threads; the UI port is drawn at random
# from [4000, 5000).
spark = (
    SparkSession.builder
    .appName("YourTest")
    .master("local[2]")
    .config('spark.ui.port', random.randrange(4000, 5000))
    .getOrCreate()
)

In [60]:
def _load_imdb_table(path, view_name):
    """Read one IMDb TSV file, cache it and register it as a temp view.

    Every IMDb table is loaded with the same reader options, so they are
    kept in a single place here instead of being repeated per table.

    Args:
        path: Path of the tab-separated file to read.
        view_name: Name under which the DataFrame is registered for SQL use.

    Returns:
        The cached DataFrame. No schema is supplied or inferred, so the
        columns keep the reader's default type.
    """
    table = (
        spark.read
        .option("header", "true")
        .option("sep", "\t")
        .option("multiLine", "true")
        .option("quote", "\"")
        .option("escape", "\"")
        .option("ignoreTrailingWhiteSpace", True)
        .csv(path)
        .cache()
    )
    table.createOrReplaceTempView(view_name)
    return table


# One DataFrame plus one SQL temp view per IMDb file.
nameBasicRaw = _load_imdb_table("nameBasics.tsv", "nameBasics")
titleAkasRaw = _load_imdb_table("titleAkas.tsv", "titleAkas")
titleBasicsRaw = _load_imdb_table("titleBasics.tsv", "titleBasics")
titleCrewRaw = _load_imdb_table("titleCrew.tsv", "titleCrew")
titleEpisodeRaw = _load_imdb_table("titleEpisode.tsv", "titleEpisode")
titlePrincipalsRaw = _load_imdb_table("titlePrincipals.tsv", "titlePrincipals")
titleRatingsRaw = _load_imdb_table("titleRatings.tsv", "titleRatings")

In [65]:
# boxOffice: first line of the file is the header; everything else uses the
# reader's defaults. The frame is cached and exposed to SQL as "boxOffice".
boxOfficeRaw = (
    spark.read
    .option("header", "true")
    .csv("boxoffice.csv")
    .cache()
)
boxOfficeRaw.createOrReplaceTempView("boxOffice")

> To preview the first rows of each table together with its column types, run the following cell.

In [66]:
# Preview every loaded table: its name, the first rows, the column types,
# then a blank separator — in the same order the tables were loaded.
for label, frame in (
    ("nameBasicRaw", nameBasicRaw),
    ("titleAkasRaw", titleAkasRaw),
    ("titleBasicsRaw", titleBasicsRaw),
    ("titleCrewRaw", titleCrewRaw),
    ("titleEpisodeRaw", titleEpisodeRaw),
    ("titlePrincipalsRaw", titlePrincipalsRaw),
    ("titleRatingsRaw", titleRatingsRaw),
    ("boxOfficeRaw", boxOfficeRaw),
):
    print(label)
    frame.show()
    print(frame.dtypes)
    print("\n")

nameBasicRaw
+---------+-------------------+---------+---------+--------------------+--------------------+
|   nconst|        primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+-------------------+---------+---------+--------------------+--------------------+
|nm0000001|       Fred Astaire|     1899|     1987|soundtrack,actor,...|tt0072308,tt00504...|
|nm0000002|      Lauren Bacall|     1924|     2014|  actress,soundtrack|tt0037382,tt00383...|
|nm0000003|    Brigitte Bardot|     1934|       \N|actress,soundtrac...|tt0054452,tt00491...|
|nm0000004|       John Belushi|     1949|     1982|actor,soundtrack,...|tt0072562,tt00787...|
|nm0000005|     Ingmar Bergman|     1918|     2007|writer,director,a...|tt0050986,tt00509...|
|nm0000006|     Ingrid Bergman|     1915|     1982|actress,soundtrac...|tt0038109,tt00387...|
|nm0000007|    Humphrey Bogart|     1899|     1957|actor,soundtrack,...|tt0034583,tt00432...|
|nm0000008|      Marlon Brando|     1924|     2

## Movie Production Team Selection - Ratings 

- Input:
  1. Genres
  2. IsAdult
  3. Maximum Age

- Output:
  1. Possible Crews
  2. Possible Actors

In [41]:
import datetime

def movieProductionToolRatings(language, genres, isAdult, maxAge, region="CN"):
  """Show the original-title rows of ``titleAkasRaw`` for one region.

  Work in progress: only the region / original-title lookup is implemented.

  Args:
    language: Accepted for the planned interface; not used yet.
    genres: Accepted for the planned interface; not used yet.
    isAdult: Accepted for the planned interface; not used yet.
    maxAge: Accepted for the planned interface; not used yet (see the
      drafted filter below).
    region: Value matched against the ``region`` column. Defaults to "CN",
      the value that used to be hard-coded, so existing calls behave the same.

  Returns:
    None. The matching rows are printed with ``DataFrame.show()``.
  """
  # TODO: drafted candidate filter, not enabled yet.
  # # Filter out the people that are passed away and older than maxAge
  # thisYear = datetime.datetime.now().year
  # birthYearToFilter = thisYear - maxAge

  # nameBasicCandidates = nameBasicRaw.filter(nameBasicRaw.deathYear == "\\N").filter(nameBasicRaw.birthYear >= birthYearToFilter)
  # nameBasicCandidates.show()

  # isOriginalTitle is compared against the string "1".
  originalTitles = (
      titleAkasRaw
      .filter(titleAkasRaw.region == region)
      .filter(titleAkasRaw.isOriginalTitle == "1")
  )
  originalTitles.show()


###################################################################################################################
#  the user interface
###################################################################################################################

# while True:
#     q = input("Input 1 or 2 space-separated tokens (return to quit): ")
#     if len(q) == 0:
#         break
#     q_tokens = simple_tokenize(q)
#     if len(q_tokens) == 1:
#         threshold = 0
#         while threshold <= 0:
#             try:
#                 threshold = int(input("Input a positive integer frequency threshold: "))
#             except ValueError:
#                 print("Threshold must be a positive integer!")
#                 continue
#         result = oneTokenQueries(q_tokens[0], threshold)

#         # Output result
#         print("  n({0}) = {1}".format(q_tokens[0], result[0]))

#         if len(result) > 1 :
#           print("  high PMI tokens with respect to {0} (threshold: {1}):".format(q_tokens[0],threshold))
#         else:
#           print("  No occurrence")
#         for i in range(1, len(result)):
#           print("    n({0},{1}) = {2},  PMI({0},{1}) = {3}".format(q_tokens[0], result[i][1], result[i][0], result[i][2]))
#     elif len(q_tokens) == 2:
#         # Put code here to answer a Two-Token Query with tokens q_tokens[0] and q_tokens[1]
#         # As was the case for the One-Token query, the print statements below show the desired output format
#         # Replace them with your own output code
#         result = twoTokenQueries(q_tokens[0], q_tokens[1])

#         if not len(result):
#           print("  No occurrence")
#         else:
#           print("  n({0},{1}) = {2}".format(q_tokens[0], q_tokens[1], result[0]))
#           print("  PMI({0},{1}) = {2}".format(q_tokens[0], q_tokens[1], result[1]))
#     else:
#         print("Input must consist of 1 or 2 space-separated tokens!")

In [42]:
# Smoke run of the tool; arguments are spelled out by name for readability.
movieProductionToolRatings(language=True, genres="", isAdult="", maxAge=40)

+---------+--------+-------+------+--------+--------+----------+---------------+
|  titleId|ordering|  title|region|language|   types|attributes|isOriginalTitle|
+---------+--------+-------+------+--------+--------+----------+---------------+
|tt4537986|       2|Zhui bu|    CN|     cmn|original|        \N|              1|
+---------+--------+-------+------+--------+--------+----------+---------------+

