# How much disk space does one need to store all relevant movies since the year 2000?

## Assumptions:
* Netflix uses 13 MB for one minute of video
* 1024 MB are in a GB
* 1024 GB are in a TB
* Relevant movies are the movies that account for 80% of votes on IMDB 
* AND are released after the year 2000.

In [42]:
# We want inline charts
%matplotlib inline

import pandas as pd # For data management
import numpy as np # For math and array functions
import matplotlib.pyplot as plt # For plotting

In [43]:
# Read movies file
movies = pd.read_csv('basics.tsv', sep='\t', index_col='tconst', low_memory=False)

In [44]:
list(movies)

['titleType',
 'primaryTitle',
 'originalTitle',
 'isAdult',
 'startYear',
 'endYear',
 'runtimeMinutes',
 'genres']

In [45]:
movies.values[1]

array(['short', 'Le clown et ses chiens', 'Le clown et ses chiens', 0,
       '1892', '\\N', '5', 'Animation,Short'], dtype=object)

In [46]:
# runtime should be a number
movies[['runtimeMinutes']] = movies[['runtimeMinutes']].apply(pd.to_numeric, errors='coerce')

# Set start year to be a number
movies[['startYear']] = movies[['startYear']].apply(pd.to_numeric, errors='coerce')

In [47]:
movies.values[1]

array(['short', 'Le clown et ses chiens', 'Le clown et ses chiens', 0,
       1892.0, '\\N', 5.0, 'Animation,Short'], dtype=object)

In [48]:
# load the ratings file
ratings = pd.read_csv('ratings.tsv', sep='\t', index_col='tconst', low_memory=False)

In [49]:
list(ratings)

['averageRating', 'numVotes']

In [50]:
ratings.values[1]

array([   6.5,  156. ])

In [51]:
# Add the ratings to the movies
movies = pd.merge(movies, ratings, left_index=True, right_index=True)

In [52]:
# Dropping NANs
tmp = movies.dropna(axis=0, how='any')
dropped = movies.size - tmp.size
movies = tmp
dropped

2323560

In [53]:
movies

Unnamed: 0_level_0,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
tt0000001,short,Carmencita,Carmencita,0,1894.0,\N,1.0,"Documentary,Short",5.8,1346
tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892.0,\N,5.0,"Animation,Short",6.5,156
tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892.0,\N,4.0,"Animation,Comedy,Romance",6.6,927
tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893.0,\N,1.0,Short,6.2,1612
tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894.0,\N,1.0,Short,5.7,79
tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894.0,\N,1.0,"Short,Sport",5.5,539
tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894.0,\N,1.0,"Documentary,Short",5.6,1437
tt0000009,movie,Miss Jerry,Miss Jerry,0,1894.0,\N,45.0,Romance,5.4,61
tt0000010,short,Employees Leaving the Lumière Factory,La sortie de l'usine Lumière à Lyon,0,1895.0,\N,1.0,"Documentary,Short",6.9,4821
tt0000011,short,Akrobatisches Potpourri,Akrobatisches Potpourri,0,1895.0,\N,1.0,"Documentary,Short",5.4,191


In [54]:
# What is the average number of votes per movie?
avgNumVotes = movies['numVotes'].mean()
avgNumVotes

1379.8699463443925

In [55]:
# Sort movies that the most popular are on top
movies = movies.sort(['numVotes'],  ascending=False)

# Get the number of all ratings
allRatings = movies['numVotes'].sum()
seenRatings = 0

# Set all movies as irrelevant
movies['relevant'] = False

  from ipykernel import kernelapp as app


In [56]:
# Go through all movies until you have seen 80% of all ratings
# The movies that account for 80% of votes are relevant

for index, movie in movies.iterrows():
    movies.loc[index, 'relevant'] = True
    seenRatings += movie['numVotes']
    if seenRatings / allRatings >= 0.8:
        break

In [57]:
# How many percent of the movies do account for 80% of votes
movies[movies['relevant']].size / movies.size * 100

1.2620601851444229

In [91]:
# Get movies that are new and relevant
newAndRelevant = movies[((movies['relevant'] == True) & (movies['startYear'] > 2000) & (movies['titleType'] == "movie"))]
newAndRelevant.size

41536

In [92]:
# Netflix uses 13mb for one minute of video
# 1024 mb in a gb
# 1024 gb in a tb
print (str(newAndRelevant['runtimeMinutes'].sum() * 13 / 1024 / 1024) + " tb")

5.107929229736328 tb


# One would need 5.1 TB to store all relevant movies since the year 2000