# Data Exploration

In [1]:
# Imports and notebook-wide configuration for the exploration cells below.
import pandas as pd
import numpy as np
import databricks.koalas as ks
import matplotlib.pyplot as plt
# NOTE(review): `px` is the conventional alias for `plotly.express`, but here it
# is bound to the top-level `plotly` package — confirm this is intended.
import plotly as px
import seaborn as sns

import findspark
from pyspark.sql import SparkSession
# NOTE(review): wildcard import — pyspark.sql.functions exports names such as
# `sum`, `min`, `max`, `abs` and `round`, which shadow the Python builtins in
# this namespace. Prefer `from pyspark.sql import functions as F` once callers
# are updated.
from pyspark.sql.functions import *

# Turn on the Koalas option `compute.ops_on_diff_frames` (per the Koalas options
# documentation, this permits operations that combine different frames).
ks.set_option('compute.ops_on_diff_frames', True)

# Install "ignore" filters so Python warnings are suppressed from here on.
# NOTE(review): the two calls below look redundant with each other; this also
# hides deprecation warnings from the libraries above.
import warnings
warnings.filterwarnings("ignore")
warnings.simplefilter('ignore')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/07 17:28:22 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Launch Spark session

In [2]:
# Run findspark's initialisation before the session is requested.
findspark.init()

# Get (or create) a session on a local master using all available cores,
# then raise the log threshold so only errors are reported.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
spark.sparkContext.setLogLevel("ERROR")

### Import Data

In [3]:
# Read the full ratings file with Spark (header row, inferred column types,
# double-quote as both quote and escape character, multi-line records allowed).
ratings = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/ratings.csv")
)

# Show the inferred schema and the number of rows before converting.
ratings.printSchema()
print(ratings.count())

# From here on `ratings` is a Koalas DataFrame rather than a Spark DataFrame.
ratings = ratings.to_koalas()

                                                                                

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)



                                                                                

26024289


In [4]:
# Load the small ratings sample with the same CSV options as the other
# datasets, then report its inferred schema and row count.
ratings_small = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/ratings_small.csv")
)
ratings_small.printSchema()
print(ratings_small.count())

# Fix: the converted frame used to be bound to `movies`, which left
# `ratings_small` as a Spark DataFrame and stored ratings data under the wrong
# name. Rebind it to `ratings_small`, matching every other loading cell.
ratings_small = ratings_small.to_koalas()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: integer (nullable = true)

100004


In [3]:
# Read the movie metadata with Spark (header row, inferred column types,
# double-quote as both quote and escape character, multi-line records allowed).
movies = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/movies_metadata.csv")
)

# Show the inferred schema and the number of rows before converting.
# NOTE(review): the printed schema reports `budget`, `popularity`, `id` and
# `release_date` as strings, so they need casting before numeric/date use.
movies.printSchema()
print(movies.count())

# From here on `movies` is a Koalas DataFrame rather than a Spark DataFrame.
movies = movies.to_koalas()

                                                                                

root
 |-- adult: string (nullable = true)
 |-- belongs_to_collection: string (nullable = true)
 |-- budget: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- homepage: string (nullable = true)
 |-- id: string (nullable = true)
 |-- imdb_id: string (nullable = true)
 |-- original_language: string (nullable = true)
 |-- original_title: string (nullable = true)
 |-- overview: string (nullable = true)
 |-- popularity: string (nullable = true)
 |-- poster_path: string (nullable = true)
 |-- production_companies: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- release_date: string (nullable = true)
 |-- revenue: long (nullable = true)
 |-- runtime: double (nullable = true)
 |-- spoken_languages: string (nullable = true)
 |-- status: string (nullable = true)
 |-- tagline: string (nullable = true)
 |-- title: string (nullable = true)
 |-- video: boolean (nullable = true)
 |-- vote_average: double (nullable = true)
 |-- vote_count: integer (nullable = true)

                                                                                

45466


In [6]:
# Superseded approach, kept disabled for reference only: it copied
# ../data/raw/credits.csv to ../data/interim/credits.csv while replacing the
# commas matched by the regex below with '|' (apparently the commas that are not
# inside a [...] list — verify before reuse), and then read the copy with
# sep='|'. The active code at the bottom of this cell reads the raw file
# directly instead.
# import re
# from pathlib import Path

# p = "../data/raw/credits.csv"
# p2 = "../data/interim/credits.csv"

# with open(p, 'r') as f:
#     with open(p2, 'w') as f2:
#         for cnt, line in enumerate(f):
#             if cnt == 0:
#                 line = line.replace(',', '|')
#             else:
#                 line = re.sub(r',(?=(((?!\]).)*\[)|[^\[\]]*$)', '|', line)
#             f2.write(line)

# credits = spark.read.options(header='True', sep='|')\
#                    .csv("../data/interim/credits.csv")
# credits.printSchema()
# print(credits.count())
# credits = credits.to_koalas()

# Read the raw credits file with the same CSV options as the other datasets.
credits = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/credits.csv")
)

# Show the inferred schema and the number of rows before converting.
credits.printSchema()
print(credits.count())

# From here on `credits` is a Koalas DataFrame rather than a Spark DataFrame.
credits = credits.to_koalas()

                                                                                

root
 |-- cast: string (nullable = true)
 |-- crew: string (nullable = true)
 |-- id: integer (nullable = true)

45476


In [7]:
# Read the keywords file with Spark (header row, inferred column types,
# double-quote as both quote and escape character, multi-line records allowed).
keywords = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/keywords.csv")
)

# Show the inferred schema and the number of rows before converting.
keywords.printSchema()
print(keywords.count())

# From here on `keywords` is a Koalas DataFrame rather than a Spark DataFrame.
keywords = keywords.to_koalas()

root
 |-- id: integer (nullable = true)
 |-- keywords: string (nullable = true)

46419


In [8]:
# Read the id-mapping file (movieId / imdbId / tmdbId per the printed schema)
# with the same CSV options as the other datasets.
links = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/links.csv")
)

# Show the inferred schema and the number of rows before converting.
links.printSchema()
print(links.count())

# From here on `links` is a Koalas DataFrame rather than a Spark DataFrame.
links = links.to_koalas()

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)

45843


In [9]:
# Read the small id-mapping file with the same CSV options as the other
# datasets.
links_small = (
    spark.read
    .options(quote="\"", escape="\"", inferSchema=True, header=True, multiline=True)
    .csv("../data/raw/links_small.csv")
)

# Show the inferred schema and the number of rows before converting.
links_small.printSchema()
print(links_small.count())

# From here on `links_small` is a Koalas DataFrame rather than a Spark DataFrame.
links_small = links_small.to_koalas()

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)

9125


In [10]:
# Preview the first five rows of the movie metadata.
movies.head(5)

                                                                                

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",3.859495,/16XOMpEaLWkrcPqSQqhTmeJuqQl.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,81452156,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,8.387519,/e64sOI48hQXyru7naBFyssKFxVd.jpg,"[{'name': 'Sandollar Productions', 'id': 5842}...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-02-10,76578911,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173


In [11]:
# NOTE(review): disabled cell kept for reference — a left join of `movies` with
# `credits` on `id`. The printed schemas show `movies.id` as string and
# `credits.id` as integer, so the key types would need to be aligned before
# re-enabling this; otherwise delete the cell.
# df = movies.merge(credits, on='id', how='left')
# print(df.shape)
# df.head()

## Missing Data

In [6]:
# Count missing values per column of the movie metadata, ordered from the
# column with the fewest missing values to the one with the most.
nan_list = (
    movies
    .isna()
    .sum()
    .sort_values()
)

# Bar chart of the per-column counts; the returned figure is the cell output.
nan_list.plot.bar(title="Missing in movies metadata")

                                                                                

In [8]:
# Ten most popular movies.
#
# Fix: `popularity` is inferred as a string column (see the schema printed when
# movies_metadata.csv is loaded), so sorting it directly compares text and, for
# example, ranks "9.9" above "10.0". Cast it to float before sorting. Values
# that cannot be parsed (e.g. from malformed rows) are expected to become
# missing under Spark's default cast behaviour and are explicitly placed last.
pop = (
    movies
    .assign(popularity=movies['popularity'].astype(float))
    .sort_values('popularity', ascending=False, na_position='last')
    .head(10)
)

# Horizontal bars: x carries the values and y the labels, as in the original
# call; the returned figure is the cell output.
pop.plot.barh(y='title', x='popularity')

                                                                                

In [7]:
# Ten highest-budget movies (the result is still bound to `pop`, as before).
#
# Fix: `budget` is inferred as a string column (see the schema printed when
# movies_metadata.csv is loaded), so sorting it directly compares text and, for
# example, ranks "9000" above "30000000". Cast it to float before sorting.
# Values that cannot be parsed (e.g. from malformed rows) are expected to become
# missing under Spark's default cast behaviour and are explicitly placed last.
pop = (
    movies
    .assign(budget=movies['budget'].astype(float))
    .sort_values('budget', ascending=False, na_position='last')
    .head(10)
)

# Horizontal bars: x carries the values and y the labels, as in the original
# call; the returned figure is the cell output.
pop.plot.barh(y='title', x='budget')

                                                                                

In [11]:
# Order the movie metadata by release date, newest first, and preview the top
# five rows.
# NOTE(review): `release_date` is a string column per the printed schema, so
# this is a text sort; malformed rows can sort ahead of real dates (the first
# row of the captured output looks like one) — consider parsing the dates first.
pop = movies.sort_values(by='release_date', ascending=False)
pop.head(5)

                                                                                

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,/zaSf5OG7V8X8gqFvly88zDdRm46.jpg,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Beware Of Frost Bites,Avalanche Sharks,False,4.3,22,,,,,,,,,
26559,False,"{'id': 87096, 'name': 'Avatar Collection', 'po...",0,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",http://www.avatarmovie.com/,76600,tt1630029,en,Avatar 2,A sequel to Avatar (2009).,6.020055,/wvpOVl37PR21mENpmZKVwrLUsRD.jpg,[{'name': 'Twentieth Century Fox Film Corporat...,"[{'iso_3166_1': 'US', 'name': 'United States o...",2020-12-16,0.0,0.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",In Production,,Avatar 2,False,0.0,58.0
38885,False,,12000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,299782,tt0069049,en,The Other Side of the Wind,"Orson Welles' unfinished masterpiece, restored...",0.238154,/wtSpgCw6MmSxGYysaa17f0a9U4y.jpg,"[{'name': ""Les Films de l'Astrophore"", 'id': 7...","[{'iso_3166_1': 'IR', 'name': 'Iran'}, {'iso_3...",2018-12-31,0.0,0.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Post Production,,The Other Side of the Wind,False,0.0,1.0
30402,False,"{'id': 14890, 'name': 'Bad Boys Collection', '...",0,"[{'id': 53, 'name': 'Thriller'}, {'id': 28, 'n...",,38700,tt1502397,en,Bad Boys for Life,The continuing adventures of Miami detectives ...,2.178546,/2GwwyfykFAf3jKXFWMkBiLgTi3k.jpg,"[{'name': 'Columbia Pictures', 'id': 5}, {'nam...","[{'iso_3166_1': 'US', 'name': 'United States o...",2018-11-07,0.0,0.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Planned,,Bad Boys for Life,False,0.0,12.0
38130,False,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,332283,tt3906082,en,Mary Shelley,The love affair between poet Percy Shelley and...,3.328261,/AnHutc9eDDrVXvlmYFohEougTHq.jpg,"[{'name': 'Parallel Films', 'id': 5122}, {'nam...","[{'iso_3166_1': 'IE', 'name': 'Ireland'}, {'is...",2018-04-25,0.0,0.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Post Production,,Mary Shelley,False,0.0,1.0
