# How much disk space does one need to store all relevant movies?

## Assumptions:

* Netflix uses 13 MB per minute of video
* 1024 MB are in a GB
* 1024 GB are in a TB
* Relevant movies are the most-voted movies that together account for 80% of all votes on IMDb

In [97]:
# We want inline charts
%matplotlib inline

import pandas as pd # For data management
import numpy as np # For math and array functions
import matplotlib.pyplot as plt # For plotting

In [98]:
# Load the title basics table (tab-separated), indexed by the title id
movies = pd.read_csv(
    'basics.tsv',
    sep='\t',
    index_col='tconst',
    low_memory=False,
)

In [99]:
# Show the column names of the movies table
movies.columns.tolist()

['titleType',
 'primaryTitle',
 'originalTitle',
 'isAdult',
 'startYear',
 'endYear',
 'runtimeMinutes',
 'genres']

In [100]:
# Peek at the second row of the raw table
movies.iloc[1].values

array(['short', 'Le clown et ses chiens', 'Le clown et ses chiens', 0,
       '1892', '\\N', '5', 'Animation,Short'], dtype=object)

In [101]:
# Runtime and start year must be numbers; values that cannot be parsed
# are coerced to NaN
for column in ('runtimeMinutes', 'startYear'):
    movies[column] = pd.to_numeric(movies[column], errors='coerce')

In [102]:
# Check the second row again after the numeric conversion
movies.iloc[1].values

array(['short', 'Le clown et ses chiens', 'Le clown et ses chiens', 0,
       1892.0, '\\N', 5.0, 'Animation,Short'], dtype=object)

In [103]:
# Load the ratings table (tab-separated), indexed by the title id
ratings = pd.read_csv(
    'ratings.tsv',
    sep='\t',
    index_col='tconst',
    low_memory=False,
)

In [104]:
# Show the column names of the ratings table
ratings.columns.tolist()

['averageRating', 'numVotes']

In [105]:
# Peek at the second row of the ratings table
ratings.iloc[1].values

array([   6.5,  156. ])

In [106]:
# Attach the rating columns to each movie by joining on the shared index
movies = movies.merge(ratings, left_index=True, right_index=True)

In [107]:
# Drop every row that contains at least one NaN
cellsBefore = movies.size
movies = movies.dropna(axis=0, how='any')

# .size counts cells (rows x columns), so this is the number of removed
# cells, not the number of removed rows
dropped = cellsBefore - movies.size
dropped

2323560

In [108]:
# Check the second row once more; it now also carries the rating columns
movies.iloc[1].values

array(['short', 'Le clown et ses chiens', 'Le clown et ses chiens', 0,
       1892.0, '\\N', 5.0, 'Animation,Short', 6.5, 156], dtype=object)

In [109]:
# Average number of votes a movie received
votesPerMovie = movies['numVotes']
avgNumVotes = votesPerMovie.mean()
avgNumVotes

1379.8699463443925

In [110]:
# Sort the movies so that the most-voted ones come first.
# DataFrame.sort() is deprecated and was removed from later pandas releases;
# sort_values() is its replacement and yields the same ordering.
movies = movies.sort_values(by='numVotes', ascending=False)

# Total number of votes over all movies
allRatings = movies['numVotes'].sum()
# Running total of votes, filled in while selecting the relevant movies
seenRatings = 0

# Start with every movie flagged as irrelevant
movies['relevant'] = False

  from ipykernel import kernelapp as app


In [111]:
# Flag the most-voted movies that together account for 80% of all votes.
# `movies` must already be sorted by numVotes in descending order.
#
# This is the vectorized equivalent of walking the rows one by one and
# stopping once 80% of the votes have been seen; it avoids a slow Python
# loop with a separate .loc write for every row.

# Votes collected by all movies ranked above each row (0 for the top movie)
votesBefore = movies['numVotes'].cumsum() - movies['numVotes']

# A movie is relevant as long as the movies ranked above it have not yet
# reached 80% of all votes; the movie that crosses the threshold is included
movies['relevant'] = votesBefore / allRatings < 0.8

# Votes covered by the relevant movies (the running total the loop ended with)
seenRatings = movies.loc[movies['relevant'], 'numVotes'].sum()

In [112]:
# What percentage of the movies accounts for 80% of the votes?
relevantMovies = movies[movies['relevant']]
len(relevantMovies) / len(movies) * 100

1.2620601851444229

In [113]:
# Keep only the relevant titles of type "movie" (this leaves out series)
#
# The "new" part is currently disabled. To consider recent movies only,
# additionally require:
#   movies['startYear'] > 2000

isRelevant = movies['relevant']
isMovie = movies['titleType'] == "movie"
newAndRelevant = movies[isRelevant & isMovie]

# .size is rows x columns, i.e. a cell count rather than a movie count
newAndRelevant.size

68068

In [114]:
# Storage estimate for all selected movies:
#   13 MB per minute of video, 1024 MB per GB, 1024 GB per TB
totalMinutes = newAndRelevant['runtimeMinutes'].sum()
totalTb = totalMinutes * 13 / 1024 / 1024
print (str(totalTb) + " tb")

8.423178672790527 tb


# One would need 8.4 TB to store all relevant movies