## IMDb Datasets Analysis

* **Collaborators:** Mingrui Zhang
* **ID:** 20985422

In [4]:
# Java, Spark install
!apt-get update -qq > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://dlcdn.apache.org/spark/spark-3.2.3/spark-3.2.3-bin-hadoop2.7.tgz
!tar xf spark-3.2.3-bin-hadoop2.7.tgz
!pip install -q findspark

# Download IMDb Datasets and unzip it as tsv files
!wget -q https://datasets.imdbws.com/name.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.akas.tsv.gz
!wget -q https://datasets.imdbws.com/title.basics.tsv.gz
!wget -q https://datasets.imdbws.com/title.crew.tsv.gz
!wget -q https://datasets.imdbws.com/title.episode.tsv.gz
!wget -q https://datasets.imdbws.com/title.principals.tsv.gz
!wget -q https://datasets.imdbws.com/title.ratings.tsv.gz

# nameBasics
!gzip -d name.basics.tsv.gz
!mv name.basics.tsv nameBasics.tsv

# titleAkas
!gzip -d title.akas.tsv.gz
!mv title.akas.tsv titleAkas.tsv

# titleBasics
!gzip -d title.basics.tsv.gz
!mv title.basics.tsv titleBasics.tsv

# titleCrew
!gzip -d title.crew.tsv.gz
!mv title.crew.tsv titleCrew.tsv

# titleEpisode
!gzip -d title.episode.tsv.gz
!mv title.episode.tsv titleEpisode.tsv

# titlePrincipals
!gzip -d title.principals.tsv.gz
!mv title.principals.tsv titlePrincipals.tsv

# titleRatings
!gzip -d title.ratings.tsv.gz
!mv title.ratings.tsv titleRatings.tsv

In [5]:
import os

# Tell the Spark tooling where Java and the unpacked Spark distribution live.
# The SPARK_HOME directory name matches the tarball unpacked by the install
# cell; the /content prefix presumably is the Colab working directory
# (TODO confirm when running elsewhere).
os.environ.update({
    "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
    "SPARK_HOME": "/content/spark-3.2.3-bin-hadoop2.7",
})

# findspark.init() is deliberately called before pyspark is imported; keep
# this order.
import findspark
findspark.init()

import random
from pyspark.sql import SparkSession

# Local two-thread session; the UI port is drawn at random from 4000-4999.
spark = (
    SparkSession.builder
    .appName("YourTest")
    .master("local[2]")
    .config('spark.ui.port', random.randrange(4000, 5000))
    .getOrCreate()
)

In [9]:
def read_imdb_tsv(path):
    """Load one IMDb TSV file into a Spark DataFrame.

    Every table in this notebook is read with the same reader settings, so
    they are kept in one place instead of being repeated per file.

    Args:
        path: Path of the tab-separated file to read (first line is a header).

    Returns:
        The DataFrame produced by ``spark.read...csv(path)``. No schema and no
        ``inferSchema`` option is supplied here.

    NOTE(review): no ``nullValue`` option is set, so the ``\\N`` placeholders
    visible in the previewed rows are kept as literal strings.
    """
    reader = spark.read.options(
        header="true",
        sep="\t",
        multiLine="true",
        quote="\"",
        escape="\"",
        ignoreTrailingWhiteSpace=True,
    )
    return reader.csv(path)


# nameBasics
nameBasicRaw = read_imdb_tsv("nameBasics.tsv")

# titleAkas
titleAkasRaw = read_imdb_tsv("titleAkas.tsv")

# titleBasics
titleBasicsRaw = read_imdb_tsv("titleBasics.tsv")

# titleCrew
titleCrewRaw = read_imdb_tsv("titleCrew.tsv")

# titleEpisode
titleEpisodeRaw = read_imdb_tsv("titleEpisode.tsv")

# titlePrincipals
titlePrincipalsRaw = read_imdb_tsv("titlePrincipals.tsv")

# titleRatings
titleRatingsRaw = read_imdb_tsv("titleRatings.tsv")

# Preview each raw table in turn: its variable name, the rows printed by
# show(), the (column, dtype) pairs, and a blank separator.
for label, frame in (
    ("nameBasicRaw", nameBasicRaw),
    ("titleAkasRaw", titleAkasRaw),
    ("titleBasicsRaw", titleBasicsRaw),
    ("titleCrewRaw", titleCrewRaw),
    ("titleEpisodeRaw", titleEpisodeRaw),
    ("titlePrincipalsRaw", titlePrincipalsRaw),
    ("titleRatingsRaw", titleRatingsRaw),
):
    print(label)
    frame.show()
    print(frame.dtypes)
    print("\n")

nameBasicRaw
+---------+-------------------+---------+---------+--------------------+--------------------+
|   nconst|        primaryName|birthYear|deathYear|   primaryProfession|      knownForTitles|
+---------+-------------------+---------+---------+--------------------+--------------------+
|nm0000001|       Fred Astaire|     1899|     1987|soundtrack,actor,...|tt0072308,tt00504...|
|nm0000002|      Lauren Bacall|     1924|     2014|  actress,soundtrack|tt0037382,tt00383...|
|nm0000003|    Brigitte Bardot|     1934|       \N|actress,soundtrac...|tt0054452,tt00491...|
|nm0000004|       John Belushi|     1949|     1982|actor,soundtrack,...|tt0072562,tt00787...|
|nm0000005|     Ingmar Bergman|     1918|     2007|writer,director,a...|tt0050986,tt00509...|
|nm0000006|     Ingrid Bergman|     1915|     1982|actress,soundtrac...|tt0038109,tt00387...|
|nm0000007|    Humphrey Bogart|     1899|     1957|actor,soundtrack,...|tt0034583,tt00432...|
|nm0000008|      Marlon Brando|     1924|     2