In [1]:
from bs4 import BeautifulSoup as bs
from functools import reduce
from pandas.io.json import json_normalize
import json
import matplotlib as mpl
import pandas as pd
import re
import requests
import seaborn as sns
import unicodedata as uni
import os

from IPython.display import display
pd.options.display.max_columns = None

In [2]:
def read_json_to_dict(filename, path_to_file = None, encoding = 'utf-8'):
    """Read a JSON file and return its parsed content.

    Parameters
    ----------
    filename : str
        Name of the JSON file to open.
    path_to_file : str, optional
        Directory that contains ``filename``. When ``None`` the current
        working directory is used. An empty string also resolves relative
        to the current working directory.
    encoding : str, optional
        Text encoding used to open the file (default ``'utf-8'``).

    Returns
    -------
    object
        Whatever ``json.load`` produces for the document (a ``dict`` when
        the top-level JSON value is an object).

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    json.JSONDecodeError
        If the file content is not valid JSON.
    """
    if path_to_file is None:
        path_to_file = os.getcwd()

    # os.path.join inserts exactly one separator regardless of whether the
    # directory already ends with one, so no manual '/' handling is needed.
    full_file_path = os.path.join(path_to_file, filename)

    with open(full_file_path, 'r', encoding = encoding) as file:
        return json.load(file)
        

In [3]:
# Load the exported ratings file from the 'movie_jsons' folder (relative to the working directory).
linneburg_ratings = read_json_to_dict(filename='linneburg.json', path_to_file='movie_jsons')

In [4]:
def flatten_crew_dict(movie_dict):
    """Flatten the crew section of a movie record into a single-level dict.

    For each wanted role a fixed number of name slots is produced
    (``director_0``, ``director_1``, ``producer_0`` ... ``composer_0``).
    Slots that cannot be filled -- because the role has fewer members than
    slots, or because the role is absent from the crew list -- are set to
    ``None`` so every movie yields the same set of keys in the same order.

    Parameters
    ----------
    movie_dict : dict
        Movie record; this function reads ``movie_dict['crew']['crew_list']``,
        ``movie_dict['crew']['total_crew']`` and
        ``movie_dict['crew']['total_unique_crew']``.

    Returns
    -------
    dict
        Role slots as described above plus ``total_crew_size`` and
        ``unique_crew_size``.

    Raises
    ------
    KeyError
        If the ``crew`` section or one of its required keys is missing, or
        if a matched role entry has no ``crew_attributes`` / ``name``.
    """
    crew_list = movie_dict['crew']['crew_list']
    # role -> number of name slots to emit for that role
    crew_wanted_dict = {'director':2, 'producer':3, 'writer':1, 'composer':1}
    flat_crew_dict = {}

    for role, nbr in crew_wanted_dict.items():
        try:
            # First crew entry for this role, or None when the role is absent.
            role_entry = next(
                (entry for entry in crew_list if entry['crew_role'] == role),
                None,
            )
        except (KeyError, TypeError):
            # Malformed crew list/entry: treat the role as absent rather than
            # aborting the whole flattening (best-effort, as before).
            role_entry = None

        members = role_entry['crew_attributes'] if role_entry is not None else []

        for i in range(nbr):
            try:
                flat_crew_dict[role + '_' + str(i)] = members[i]['name']
            except IndexError:
                # Fewer people in this role than slots requested.
                flat_crew_dict[role + '_' + str(i)] = None

    flat_crew_dict['total_crew_size'] = movie_dict['crew']['total_crew']
    flat_crew_dict['unique_crew_size'] = movie_dict['crew']['total_unique_crew']

    return flat_crew_dict

In [5]:
def flatten_cast_dict(movie_dict):
    """Flatten the cast section of a movie record into a single-level dict.

    Emits ``actor_0`` ... ``actor_{k-1}`` where ``k`` is the smaller of 5 and
    ``movie_dict['cast']['total_cast']``, followed by ``total_cast_size``.
    A slot whose entry is missing from the cast list, or whose entry has no
    usable ``actor_name``, is set to ``None``.

    Parameters
    ----------
    movie_dict : dict
        Movie record; this function reads ``movie_dict['cast']['cast_list']``
        and ``movie_dict['cast']['total_cast']``.

    Returns
    -------
    dict
        Actor name slots plus ``total_cast_size``.

    Raises
    ------
    KeyError
        If the ``cast`` section or one of its required keys is missing.
    """
    cast_list = movie_dict['cast']['cast_list']
    flat_cast_dict = {}
    cast_to_retrieve = 5

    for i in range(min(cast_to_retrieve, movie_dict['cast']['total_cast'])):
        try:
            flat_cast_dict['actor_'+str(i)] = cast_list[i]['actor_name']
        except (IndexError, KeyError, TypeError):
            # The reported total can exceed the entries actually present, and
            # entries may be malformed; only these lookup failures are
            # tolerated so unrelated errors (e.g. KeyboardInterrupt) surface.
            flat_cast_dict['actor_'+str(i)] = None

    flat_cast_dict['total_cast_size'] = movie_dict['cast']['total_cast']
    return flat_cast_dict

In [6]:
def flatten_genre_list(movie_dict):
    """Spread the first three genres of a movie over fixed keys.

    Returns a dict with exactly the keys ``genre_0``, ``genre_1`` and
    ``genre_2``; positions beyond the end of ``movie_dict['genres']`` hold
    ``None``.
    """
    available = movie_dict['genres']

    def genre_at(position):
        # Positions past the end of the list are reported as missing.
        try:
            return available[position]
        except IndexError:
            return None

    return {'genre_' + str(slot): genre_at(slot) for slot in range(3)}

In [7]:
def flatten_ratings_list(movie_dict):
    """Turn the per-rating counts of a movie into a flat dict.

    Each entry of ``movie_dict['rating_counts']`` becomes a key
    ``'<rating>_stars'`` mapped to its ``number_of_ratings``. A final
    ``'total'`` key holds the sum of all counts stored in the dict.
    """
    counts_by_rating = {
        str(bucket['rating']) + '_stars': bucket['number_of_ratings']
        for bucket in movie_dict['rating_counts']
    }
    # Compute the sum before inserting 'total' so it only covers the counts.
    counts_by_rating['total'] = sum(counts_by_rating.values())
    return counts_by_rating

In [8]:
def main(movie_list):
    """Flatten every movie record in ``movie_list`` into a single-level dict.

    For each movie the resulting dict is filled in this order: title, genre
    slots, average rating, rating counts, user rating, crew fields and cast
    fields. Returns the list of flattened dicts in input order.
    """
    flattened_movies = []

    for movie in movie_list:
        # Insertion order is kept deliberately: it defines the key order of each record.
        record = {'title': movie['title']}
        record.update(flatten_genre_list(movie))
        record['avg_rating'] = movie['avg_rating']
        record.update(flatten_ratings_list(movie))
        record['user_rating'] = movie['user_rating']
        record.update(flatten_crew_dict(movie))
        record.update(flatten_cast_dict(movie))

        flattened_movies.append(record)

    return flattened_movies

In [19]:
# Flatten every rated movie into one single-level record.
flat_dict = main(linneburg_ratings['movies_rated'])

# Column order is the ordered union of the keys of all records: the first
# record's keys come first, and keys that only appear in later records are
# appended instead of being dropped. This also works for an empty list.
column_order = list(dict.fromkeys(key for movie in flat_dict for key in movie))
linneburg_movie_data = json_normalize(flat_dict).reindex(columns=column_order)


0                drama
1               action
2                drama
3                crime
4               action
5               action
6            adventure
7               family
8              fantasy
9               comedy
10           adventure
11           animation
12              action
13              action
14            thriller
15                 war
16              family
17              action
18     science fiction
19            thriller
20               drama
21                 war
22               drama
23              family
24              comedy
25            thriller
26               drama
27           animation
28     science fiction
29              family
            ...       
393             comedy
394             family
395           thriller
396           thriller
397              drama
398          adventure
399             action
400             action
401             action
402             comedy
403          animation
404             comedy
405        

In [53]:
# Show the first flattened record to sanity-check the column layout.
linneburg_movie_data.head(1)

Unnamed: 0,title,genre_0,genre_1,genre_2,avg_rating,0.5_stars,1.0_stars,1.5_stars,2.0_stars,2.5_stars,3.0_stars,3.5_stars,4.0_stars,4.5_stars,5.0_stars,total,user_rating,director_0,director_1,producer_0,producer_1,producer_2,writer_0,composer_0,total_crew_size,unique_crew_size,actor_0,actor_1,actor_2,actor_3,actor_4,total_cast_size
0,Chernobyl,drama,,,4.5,64,66,42,215,260,1231,2336,11318,14205,26662,56399,4.5,Johan Renck,,Jane Featherstone,Carolyn Strauss,Craig Mazin,Craig Mazin,Hildur Guðnadóttir,30,29,Jared Harris,Stellan Skarsgård,Emily Watson,Paul Ritter,Jessie Buckley,18


In [28]:
# Gather the distinct values of each genre column, in column order.
all_genres = [
    genre
    for column in ('genre_0', 'genre_1', 'genre_2')
    for genre in linneburg_movie_data[column].unique()
]

# A set removes the values that occur in more than one genre column.
unique_genres = set(all_genres)

unique_genres

{None,
 'action',
 'adventure',
 'animation',
 'comedy',
 'crime',
 'drama',
 'family',
 'fantasy',
 'history',
 'horror',
 'music',
 'mystery',
 'romance',
 'science fiction',
 'thriller',
 'war',
 'western'}