In [1]:
%load_ext autoreload
%matplotlib inline
%autoreload 2
from deco.imports import *
from deco.context import *
import json
from pprint import pprint
import sklearn.ensemble as st
from datetime import datetime
import random
from math import sin, cos, sqrt, atan2, radians
import networkx as nx
import numpy as np
from scipy.stats import norm
import matplotlib.pyplot as plt

In [2]:
# Directory holding the input files read by the load_* functions and the
# destination of the staged JSON export. File paths are built by plain string
# concatenation (PATH + name), so the trailing slash is required.
# NOTE(review): this is an absolute, machine-specific location — adjust it
# before running the notebook elsewhere.
PATH = '/home/rotem/Documents/code/unsupervised-learning/data/netflix/'

In [3]:
# Cache of already-loaded DataFrames keyed by dataset name ('netflix', 'imdb',
# 'oscars'); the load_* functions consult it so each file is read from disk
# only once for as long as this dict is alive.
pandas_cache = {}

In [4]:
def lower_columns(df, columns):
    """Lower-case the string values of the given columns of ``df`` in place.

    Args:
        df: DataFrame to modify. It is mutated directly; nothing is returned.
        columns: Iterable of column labels. Each column is processed through
            the ``.str`` accessor, so it must hold string data.
    """
    for name in columns:
        lowered = df[name].str.lower()
        df[name] = lowered

In [5]:
def load_netflix_data():
    """Return the Netflix titles table, reading the CSV only on first use.

    The frame is stored in ``pandas_cache`` under the key ``'netflix'``.
    Later calls return that same cached object (not a copy).
    """
    if 'netflix' in pandas_cache:
        return pandas_cache['netflix']

    titles = pd.read_csv(PATH + 'netflix_titles.csv')
    pandas_cache['netflix'] = titles
    return titles

In [6]:
def load_imdb():
    """Return IMDb title metadata joined with ratings, reading the TSVs once.

    ``title.basics.tsv`` and ``title.ratings.tsv`` are read from ``PATH`` and
    inner-joined on ``tconst``. The merged frame is stored in ``pandas_cache``
    under the key ``'imdb'``; later calls return that same cached object
    (not a copy).

    Returns:
        DataFrame with one row per title that has a ratings entry.
    """
    if 'imdb' in pandas_cache:
        return pandas_cache['imdb']

    import csv  # local import: only needed for the quoting constant below

    # The IMDb dumps are raw tab-separated text in which a missing field is
    # written as the literal "\N". They are not CSV-quoted, so a title that
    # merely contains a double quote must not be treated as the start of a
    # quoted field (the default quoting can swallow following rows).
    # NOTE(review): confirm against the IMDb dataset documentation.
    read_options = dict(sep='\t', na_values='\\N', quoting=csv.QUOTE_NONE)

    titles = pd.read_csv(PATH + 'title.basics.tsv', **read_options)
    ratings = pd.read_csv(PATH + 'title.ratings.tsv', **read_options)

    df = titles.merge(ratings, on=['tconst'])
    pandas_cache['imdb'] = df
    return df

def prepare_imdb(imdb, to_lower=['primaryTitle', 'originalTitle'], to_keep=['averageRating', 'numVotes']):
    """Collapse the IMDb table to one vote-weighted rating per normalized title.

    Args:
        imdb: Frame containing at least the ``to_lower`` and ``to_keep``
            columns. It is not modified.
        to_lower: String columns to lower-case; must include ``primaryTitle``.
            The default list is never mutated.
        to_keep: Additional columns to carry along; must include
            ``averageRating`` and ``numVotes``. The default list is never
            mutated.

    Returns:
        New DataFrame with columns ``title`` (lower-cased ``primaryTitle``
        with apostrophes removed), ``averageRating`` (mean rating weighted by
        ``numVotes`` across all entries sharing a title) and ``numVotes``
        (total votes for that title).
    """
    # Select the needed columns first and copy only those: this avoids
    # duplicating the whole IMDb table and guarantees the assignments below
    # act on an independent frame (no SettingWithCopyWarning).
    imdb = imdb[list(to_lower) + list(to_keep)].copy()
    lower_columns(imdb, to_lower)
    imdb['title'] = imdb['primaryTitle'].str.replace("'", '')

    # Weighted mean: sum(rating * votes) / sum(votes) within each title.
    imdb['averageRating'] = imdb['averageRating'] * imdb['numVotes']
    imdb = imdb.groupby(['title'], as_index=False).agg({'averageRating': 'sum', 'numVotes': 'sum'})
    imdb['averageRating'] = imdb['averageRating'] / imdb['numVotes']
    return imdb

def add_imdb_raiting(df):
    """Left-join vote-weighted IMDb ratings onto ``df`` by title.

    ``&`` is replaced with ``and`` in the titles on both sides before the
    join. Rows without an IMDb match get NaN in the added columns.

    Args:
        df: Frame with a string ``title`` column. It is not modified.

    Returns:
        New DataFrame: ``df`` (with ``&`` -> ``and`` applied to ``title``)
        plus the ``averageRating`` and ``numVotes`` columns.
    """
    imdb = prepare_imdb(load_imdb())
    imdb['title'] = imdb['title'].str.replace('&', 'and')

    # Normalize titles on a copy so the caller's frame is left untouched.
    df = df.copy()
    df['title'] = df['title'].str.replace('&', 'and')

    return df.merge(imdb, on=['title'], how='left')

In [7]:
def load_oscars():
    """Return the Oscars table, reading ``oscars.csv`` only on first use.

    The frame is stored in ``pandas_cache`` under the key ``'oscars'``.
    Later calls return that same cached object (not a copy).
    """
    if 'oscars' in pandas_cache:
        return pandas_cache['oscars']

    awards = pd.read_csv(PATH + 'oscars.csv')
    pandas_cache['oscars'] = awards
    return awards

def prepare_oscars(oscars):
    """Count wins per film and expose them under a normalized ``title``.

    Args:
        oscars: Frame with a ``film`` (string) and a ``win`` (truthy/falsy)
            column. It is not modified.

    Returns:
        New DataFrame with columns ``title`` (lower-cased film name with
        apostrophes removed) and ``win`` (number of truthy ``win`` entries
        for that film). Rows whose film is missing or a single space are
        dropped before counting.
    """
    awards = oscars.copy()

    # Turn the win flag into 0/1 so that summing it counts the wins.
    awards.win = awards.win.apply(lambda won: int(bool(won)))
    awards.film = awards.film.str.lower()

    has_film = (awards.film != ' ') & (awards.film.notnull())
    wins_per_film = awards[has_film].groupby(['film'], as_index=False).agg({'win': 'sum'})

    wins_per_film['title'] = wins_per_film.film.str.replace("'", '')
    return wins_per_film[['title', 'win']]

def add_oscars(df):
    """Left-join per-film Oscar win counts onto ``df``.

    The Oscars rows are tagged with ``type == 'Movie'`` and the join is on
    both ``title`` and ``type``, so only rows of ``df`` whose type is
    ``'Movie'`` can match.

    Args:
        df: Frame with ``title`` and ``type`` columns. It is not modified.

    Returns:
        New DataFrame: ``df`` plus a ``win`` column holding the number of
        wins, with 0 for rows that have no Oscars entry.
    """
    oscars = prepare_oscars(load_oscars())
    oscars['type'] = 'Movie'

    df = df.merge(oscars, on=['title', 'type'], how='left')

    # A left merge marks unmatched rows with NaN (never None), so an
    # `is not None` check cannot detect them; fillna does.
    df['win'] = df['win'].fillna(0)
    return df

In [8]:
def prepare(df, to_list=['cast', 'country', 'listed_in', 'director'], to_lower=['title', 'director', 'cast']):
    """Return a cleaned copy of the Netflix titles frame.

    Steps, in order: lower-case the ``to_lower`` columns, split the
    ``to_list`` columns on ``', '`` into Python lists, remove apostrophes
    from ``title``, and parse ``date_added`` into datetimes.

    Args:
        df: Raw Netflix frame. It is not modified.
        to_list: Comma-separated string columns to turn into lists. The
            default list is never mutated.
        to_lower: String columns to lower-case. The default list is never
            mutated.

    Returns:
        New DataFrame with the transformations above applied.

    Raises:
        ValueError: If a non-null ``date_added`` value does not match
            ``'%B %d, %Y'`` after the leading space is removed.
    """
    df = df.copy()
    lower_columns(df, to_lower)

    for column in to_list:
        df[column] = df[column].str.split(', ')

    df['title'] = df['title'].str.replace("'", "")

    # '^ ' is a regular expression (one leading space). regex=True must be
    # explicit: pandas >= 2.0 treats the pattern literally by default, which
    # would keep the space and make the strict-format parse below fail.
    df['date_added'] = df['date_added'].str.replace('^ ', '', regex=True)
    df['date_added'] = pd.to_datetime(df['date_added'], format='%B %d, %Y')
    return df

In [9]:
# Enrichment steps, applied in this order. Each takes a DataFrame and returns
# the enriched DataFrame.
enrichers = [add_oscars, add_imdb_raiting]


def enrich(df):
    """Run every function in ``enrichers`` over a copy of ``df``.

    Args:
        df: Prepared Netflix frame. A copy is taken before the first step.

    Returns:
        The frame produced by the last enricher.
    """
    enriched = df.copy()
    for step in enrichers:
        enriched = step(enriched)
    return enriched

In [10]:
# Full pipeline: load the raw Netflix titles, clean them, then attach the
# Oscars and IMDb columns.
df = enrich(prepare(load_netflix_data()))

  This is separate from the ipykernel package so we can avoid doing imports until


In [11]:
# Persist the enriched frame next to the input data for later stages.
df.to_json(path_or_buf=PATH + 'stage.json')