In [1]:
# Import tmdb api wrapper library
import tmdbsimple as tmdb

# Import JSON
import json

# import time for sleeping and scraping
import time

# Import the usual suspects
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', 500)

# pickle for saving and loading scraped info
import pickle


from timeit import default_timer as timer


# Pandas profiling - great library for analyzing dataframes
import pandas_profiling as pp



In [2]:
# Import api key
# Read through a context manager so the file handle is closed, and strip
# surrounding whitespace (e.g. a trailing newline) so it is not sent as part of the key.
with open('../api_key', 'rt') as key_file:
    tmdb.API_KEY = key_file.read().strip()

In [3]:
# Load list of json info for all currently valid TV shows
# The file is read line by line, one JSON document per line. The context manager
# closes the handle; blank lines are skipped so they cannot crash json.loads.
with open('../data/raw/tv_series_ids_07_09_2019.json', 'r') as ids_file:
    tv_series_json = [json.loads(line) for line in ids_file if line.strip()]

In [4]:
# Pull the 'id' field out of every parsed export entry
tv_series_id = [
    entry['id']
    for entry in tv_series_json
]

In [5]:
# Gather every previously saved batch of show info into a single list

tv_shows_info = []

# Batch files are numbered from 1; stop at the first number that has no file
f_num = 1
while os.path.isfile('../data/processed/tv_shows/tv_shows_info_{}.pkl'.format(f_num)):
    with open('../data/processed/tv_shows/tv_shows_info_{}.pkl'.format(f_num), 'rb') as f:
        tv_shows_info.extend(pickle.load(f))
    f_num += 1


len(tv_shows_info)



82817

In [6]:
# Ids of the shows whose info is already on disk, plus a quick sanity preview
already_scraped_id = [show['id'] for show in tv_shows_info]
for preview in (len(already_scraped_id), already_scraped_id[:10]):
    print(preview)

82817
[601, 602, 603, 604, 605, 606, 607, 608, 609, 610]


In [8]:
# scrape info for all tv shows


def _next_tv_shows_pickle_path():
    """Return the path of the first tv_shows_info_<n>.pkl (n = 1, 2, ...) not yet on disk."""
    template = '../data/processed/tv_shows/tv_shows_info_{}.pkl'
    n = 1
    while os.path.isfile(template.format(n)):
        n += 1
    return template.format(n)


def _save_tv_shows_batch(batch):
    """Pickle ``batch`` into the next free numbered file and print where it went."""
    path = _next_tv_shows_pickle_path()
    with open(path, 'wb') as f:
        pickle.dump(batch, f)
    print('saved to: {}'.format(path))


count = 0

remaining_tv_id = list(set(tv_series_id)-set(already_scraped_id))
print('number of shows to scrape: {}'.format(len(remaining_tv_id)))

# start new empty list to temporarily store results
temp_list = []
skipped_ids = []

start = timer()
for i in remaining_tv_id:
    attempt = 0
    while True:
        try:
            temp_list.append(tmdb.TV(i).info())
            break
        # Catch Exception instead of using a bare except so that a keyboard /
        # kernel interrupt still stops the scrape instead of being retried.
        except Exception:
            time.sleep(1)
            attempt += 1
            # if trying same id multiple times, skip it
            if attempt > 10:
                skipped_ids.append(i)
                print('skipped: {}'.format(i))
                break

    count += 1

    # progress report every 100 ids: batch size so far and time for the last 100
    if count % 100 == 0:
        print(len(temp_list))
        end = timer()
        print('{:.2f} seconds'.format(end - start))
        start = timer()
    # flush to disk every 1000 ids; skip the write when nothing was collected
    if count % 1000 == 0 and temp_list:
        _save_tv_shows_batch(temp_list)
        temp_list = []


# once more at the end to catch any remaining
if temp_list:
    _save_tv_shows_batch(temp_list)

number of shows to scrape: 7
skipped: 88513
skipped: 90818
skipped: 84771
skipped: 89161
skipped: 30928
skipped: 90512
skipped: 90323


# View Data in Pandas DataFrame

In [28]:
# Build a DataFrame from the list of show-info records, then show its (rows, columns)
tv_info_df = pd.DataFrame(tv_shows_info)
tv_info_df.shape

(82817, 28)

In [29]:
# Preview the first rows of the show-level DataFrame
tv_info_df.head()

Unnamed: 0,backdrop_path,created_by,episode_run_time,first_air_date,genres,homepage,id,in_production,languages,last_air_date,last_episode_to_air,name,networks,next_episode_to_air,number_of_episodes,number_of_seasons,origin_country,original_language,original_name,overview,popularity,poster_path,production_companies,seasons,status,type,vote_average,vote_count
0,,"[{'id': 383, 'credit_id': '525389d019c29579402...",[30],1969-01-12,[],,601,True,[en],1969-02-16,"{'air_date': '1969-02-16', 'episode_number': 6...",The Complete and Utter History of Britain,"[{'name': 'London Weekend Television', 'id': 1...",,6.0,1,[GB],en,The Complete and Utter History of Britain,The Complete And Utter History Of Britain was ...,0.6,,[],"[{'air_date': '1969-01-12', 'episode_count': 6...",Ended,Scripted,0.0,0
1,/5ekOlsSAEHTEx4XS0cSzAQLzKcV.jpg,"[{'id': 157395, 'credit_id': '525389dc19c29579...",[30],1966-09-06,"[{'id': 35, 'name': 'Comedy'}]",,602,False,[en],1967-04-06,"{'air_date': '1967-04-06', 'episode_number': 3...",Love on a Rooftop,"[{'name': 'ABC', 'id': 2, 'logo_path': '/ndAvF...",,32.0,1,[US],en,Love on a Rooftop,Love on a Rooftop was an American sitcom about...,1.313,/6sxAv9Mrw3e6WhQqd0lbRUmRZRg.jpg,[],"[{'air_date': '1966-09-06', 'episode_count': 3...",Ended,Scripted,0.0,0
2,/khveUylm0fqlLGuUHLg74tKozdy.jpg,"[{'id': 163461, 'credit_id': '525389de19c29579...",[24],1997-09-25,"[{'id': 35, 'name': 'Comedy'}]",,603,False,[en],2000-06-27,"{'air_date': '2000-06-27', 'episode_number': 1...",Veronica's Closet,"[{'name': 'NBC', 'id': 6, 'logo_path': '/o3Oed...",,62.0,3,[US],en,Veronica's Closet,Veronica 'Ronnie' Chase is the 'Queen of Roman...,2.223,/zDSdDiZgOMS7zAda01yzZJLUiAA.jpg,"[{'id': 1957, 'logo_path': '/nmcNfPq03WLtOyufJ...","[{'air_date': '1997-09-25', 'episode_count': 2...",Ended,Scripted,4.0,6
3,/hHwEptckXUwZM7XO2lxZ8w8upuU.jpg,"[{'id': 141043, 'credit_id': '525389ed19c29579...",[23],2003-07-20,"[{'id': 16, 'name': 'Animation'}, {'id': 10759...",http://cartoonnetwork.com/tv_shows/titans/inde...,604,False,[en],2006-01-16,"{'air_date': '2006-01-16', 'episode_number': 1...",Teen Titans,"[{'name': 'Cartoon Network', 'id': 56, 'logo_p...",,65.0,5,[US],en,Teen Titans,The Teen Titans are five heroes under one roof...,29.619,/tfdiVvJkYMbUOXDWibPjzu5dY6S.jpg,"[{'id': 429, 'logo_path': '/2Tc1P3Ac8M479naPp1...","[{'air_date': '2005-01-11', 'episode_count': 2...",Ended,Scripted,7.4,136
4,/4PhaaA593zHfAyYUIXVWTih3qcD.jpg,"[{'id': 69396, 'credit_id': '525389f019c295794...",[22],1996-09-27,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",,605,False,[en],2003-04-24,"{'air_date': '2003-04-24', 'episode_number': 2...","Sabrina, the Teenage Witch","[{'name': 'ABC', 'id': 2, 'logo_path': '/ndAvF...",,163.0,7,[US],en,"Sabrina, the Teenage Witch","Sabrina, the Teenage Witch is an American sitc...",22.814,/8bwNu5JlRQyi6j6JO7UF42MAgX.jpg,"[{'id': 5452, 'logo_path': None, 'name': 'Hart...","[{'air_date': '1998-10-04', 'episode_count': 3...",Ended,Scripted,6.4,119


# Scraping Season Info

In [7]:
# Gather every previously saved batch of season info into a single list

tv_seasons_info = []

# Batch files are numbered from 1; stop at the first number that has no file
f_num = 1
while os.path.isfile('../data/processed/tv_seasons/tv_seasons_info_{}.pkl'.format(f_num)):
    with open('../data/processed/tv_seasons/tv_seasons_info_{}.pkl'.format(f_num), 'rb') as f:
        tv_seasons_info.extend(pickle.load(f))
    f_num += 1


len(tv_seasons_info)



82772

In [8]:
# Total number of season records across all scraped shows
num_seasons_scraped = sum(len(show['season_info']) for show in tv_seasons_info)


num_seasons_scraped

111568

In [9]:
# Show ids that already have season info on disk, with a count and head/tail preview
already_scraped_season_id = [entry['show_id'] for entry in tv_seasons_info]
for preview in (
    len(already_scraped_season_id),
    already_scraped_season_id[:10],
    already_scraped_season_id[-10:],
):
    print(preview)

82772
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
[63023, 78622, 60242, 64418, 89543, 65769, 81073, 89717, 90298, 75121]


In [11]:
# Every show id listed in the id export, with a count and head/tail preview
initial_tv_show_id = [entry['id'] for entry in tv_series_json]
for preview in (
    len(initial_tv_show_id),
    initial_tv_show_id[:10],
    initial_tv_show_id[-10:],
):
    print(preview)

82824
[601, 602, 603, 604, 605, 606, 607, 608, 609, 610]
[90941, 90942, 90943, 90944, 90945, 90946, 90947, 90948, 90949, 90950]


In [10]:
# Map each show id to the list of season numbers found in its show info
tv_season_dict = {
    show['id']: [season['season_number'] for season in show['seasons']]
    for show in tv_shows_info
}

In [11]:
# total number of seasons to scrape
# note some shows have none
sumseasons = sum(len(season_nums) for season_nums in tv_season_dict.values())
sumseasons

113338

In [32]:
# scrape season info for all tv shows


def _scrape_show_seasons(show_id, max_retries=10):
    """Fetch info for every known season of ``show_id``.

    Returns ``{'show_id': ..., 'season_info': [...]}`` on success, or ``None``
    when the show has to be skipped: either it has no entry in
    ``tv_season_dict`` (retrying cannot fix that, so no time is spent
    sleeping), or all ``max_retries + 1`` attempts raised.
    """
    season_nums = tv_season_dict.get(show_id)
    if season_nums is None:
        return None

    for _ in range(max_retries + 1):
        try:
            season_info_list = [
                tmdb.TV_Seasons(show_id, sn).info() for sn in season_nums
            ]
        # Catch Exception instead of using a bare except so that a keyboard /
        # kernel interrupt still stops the scrape instead of being retried.
        except Exception:
            time.sleep(1)
        else:
            return {'show_id': show_id, 'season_info': season_info_list}
    return None


def _save_tv_seasons_batch(batch):
    """Pickle ``batch`` into the first tv_seasons_info_<n>.pkl not yet on disk and print where it went."""
    template = '../data/processed/tv_seasons/tv_seasons_info_{}.pkl'
    n = 1
    while os.path.isfile(template.format(n)):
        n += 1
    with open(template.format(n), 'wb') as f:
        pickle.dump(batch, f)
    print('saved to: {}'.format(template.format(n)))


count = 0
season_count = 0

remaining_tv_id = list(set(initial_tv_show_id)-set(already_scraped_season_id))

#shuffle so we don't always start with those that have been skipped
np.random.shuffle(remaining_tv_id)
print('number of shows to scrape: {}'.format(len(remaining_tv_id)))

# start new empty list to temporarily store results
temp_list = []
skipped_ids = []

start = timer()
for i in remaining_tv_id:
    season_dict = _scrape_show_seasons(i)
    if season_dict is None:
        skipped_ids.append(i)
        print('skipped: {}'.format(i))
    else:
        temp_list.append(season_dict)
        # count seasons only once the whole show succeeded, so a retry after
        # a partial failure does not inflate the progress figure
        season_count += len(season_dict['season_info'])

    count += 1

    # progress report every 50 shows
    if count % 50 == 0:
        print(len(temp_list))
        print('seasons: {}'.format(season_count))
        end = timer()
        print('{:.2f} seconds'.format(end - start))
        start = timer()
    # flush to disk every 500 shows and restart the per-batch season counter
    if count % 500 == 0:
        season_count = 0
        # skip the write when nothing was collected in this batch
        if temp_list:
            _save_tv_seasons_batch(temp_list)
            temp_list = []


# once more at the end to catch any remaining
if temp_list:
    _save_tv_seasons_batch(temp_list)

number of shows to scrape: 25859
50
seasons: 72
14.44 seconds
skipped: 80818
99
seasons: 131
30.59 seconds
149
seasons: 197
18.21 seconds
199
seasons: 277
20.43 seconds
249
seasons: 365
25.75 seconds
299
seasons: 474
26.62 seconds
349
seasons: 552
20.76 seconds
399
seasons: 622
17.19 seconds
449
seasons: 694
17.49 seconds
499
seasons: 766
19.48 seconds
saved to: ../data/processed/tv_seasons/tv_seasons_info_115.pkl
skipped: 23318
49
seasons: 181
43.60 seconds
99
seasons: 278
27.91 seconds
149
seasons: 331
12.21 seconds
199
seasons: 438
30.59 seconds
249
seasons: 542
23.70 seconds
299
seasons: 624
29.22 seconds
349
seasons: 689
11.31 seconds
399
seasons: 792
29.76 seconds
449
seasons: 866
19.63 seconds
skipped: 90818
498
seasons: 942
27.57 seconds
saved to: ../data/processed/tv_seasons/tv_seasons_info_116.pkl
skipped: 26715
49
seasons: 156
39.69 seconds
99
seasons: 217
12.92 seconds
skipped: 74976
148
seasons: 336
44.53 seconds
198
seasons: 407
19.93 seconds
248
seasons: 475
17.02 second

150
seasons: 243
22.51 seconds
200
seasons: 356
29.97 seconds
250
seasons: 439
22.12 seconds
300
seasons: 558
32.26 seconds
350
seasons: 632
20.05 seconds
skipped: 5859
399
seasons: 775
34.97 seconds
449
seasons: 876
30.03 seconds
skipped: 89161
498
seasons: 944
25.88 seconds
saved to: ../data/processed/tv_seasons/tv_seasons_info_136.pkl
50
seasons: 71
18.32 seconds
100
seasons: 132
18.64 seconds
150
seasons: 197
13.91 seconds
200
seasons: 314
29.74 seconds
250
seasons: 396
21.52 seconds
300
seasons: 478
21.81 seconds
350
seasons: 558
22.12 seconds
skipped: 90839
399
seasons: 624
31.65 seconds
449
seasons: 697
17.93 seconds
499
seasons: 766
14.08 seconds
saved to: ../data/processed/tv_seasons/tv_seasons_info_137.pkl
50
seasons: 59
17.72 seconds
100
seasons: 134
20.24 seconds
150
seasons: 206
19.37 seconds
skipped: 4909
skipped: 71494
198
seasons: 395
54.62 seconds
skipped: 85299
skipped: 87128
246
seasons: 543
44.23 seconds
skipped: 16656
295
seasons: 676
39.89 seconds
345
seasons: 753

300
seasons: 471
12.21 seconds
350
seasons: 548
21.34 seconds
400
seasons: 629
22.08 seconds
skipped: 64289
449
seasons: 696
26.06 seconds
499
seasons: 754
15.00 seconds
saved to: ../data/processed/tv_seasons/tv_seasons_info_157.pkl
50
seasons: 73
21.02 seconds
100
seasons: 146
14.64 seconds
150
seasons: 212
20.55 seconds
200
seasons: 290
21.48 seconds
250
seasons: 363
20.97 seconds
300
seasons: 483
31.88 seconds
350
seasons: 570
22.36 seconds
400
seasons: 652
20.63 seconds
450
seasons: 710
12.25 seconds
500
seasons: 783
20.51 seconds
saved to: ../data/processed/tv_seasons/tv_seasons_info_158.pkl
50
seasons: 73
19.42 seconds
100
seasons: 136
19.09 seconds
150
seasons: 200
13.80 seconds
200
seasons: 262
19.01 seconds
250
seasons: 334
21.06 seconds
300
seasons: 432
23.74 seconds
350
seasons: 499
20.32 seconds
400
seasons: 574
20.95 seconds
450
seasons: 643
14.24 seconds
500
seasons: 702
19.72 seconds
saved to: ../data/processed/tv_seasons/tv_seasons_info_159.pkl
50
seasons: 67
19.46 seco

In [42]:
# Print the id of any show whose number of scraped seasons differs from the
# number of seasons listed for it in tv_season_dict
for show in tv_seasons_info:
    id_num = show['show_id']
    if len(tv_season_dict[id_num]) != len(show['season_info']):
        print(id_num)

In [39]:
# Difference between the number of entries in the id export and the number of shows with scraped season info
len(tv_series_json) - len(tv_seasons_info)

52

In [26]:

# Count every episode across all seasons of all scraped shows
total_eps = sum(
    len(ssn['episodes'])
    for show in tv_seasons_info
    for ssn in show['season_info']
)

In [27]:
# Display the total episode count
total_eps

1669306

In [37]:
# Flatten show -> season -> episode into one list of episode dicts.
# Each episode dict is tagged in place with the id of the show it belongs to.
all_episodes_list = []

for show in tv_seasons_info:
    for season in show['season_info']:
        for episode in season['episodes']:
            episode['show_id'] = show['show_id']
            all_episodes_list.append(episode)


In [39]:
# Build a DataFrame from the episode dicts collected in all_episodes_list
all_episodes_df = pd.DataFrame(all_episodes_list)

In [41]:
# Spot check: episodes whose show_id is 604, ordered by air_date
all_episodes_df.loc[all_episodes_df.show_id==604].sort_values('air_date')

Unnamed: 0,air_date,crew,episode_number,guest_stars,id,name,overview,production_code,season_number,show_id,still_path,vote_average,vote_count
45162,2003-07-20,"[{'id': 127380, 'credit_id': '525389e119c29579...",1,[],34373,Final Exam,"Three H.I.V.E. graduates, Mammoth, Jinx, and G...",,1.0,604,/dMPWTfiBIkTqfHDDhFEDHQc5Kql.jpg,7.50,2
45163,2003-07-27,"[{'id': 1215728, 'credit_id': '525389e219c2957...",2,[],34374,Sisters,Starfires better-than-her-at-everything sister...,,1.0,604,/cXUdnAfi1keW5prUtNkIZfXnxWe.jpg,9.00,1
45164,2003-08-03,"[{'id': 127383, 'credit_id': '525389e019c29579...",3,[],34375,Divide and Conquer,The Titans attempt to thwart the villain Cinde...,,1.0,604,/iqAUy5f2hNCye3DSkZ2NR3Daxek.jpg,5.00,2
45165,2003-08-17,"[{'id': 127383, 'credit_id': '525389e019c29579...",4,[],34376,Forces of Nature,"Dark clouds blow in, bringing Thunder and Ligh...",,1.0,604,/nMSf05Rqi2MuMG1GVyUFrCghIoT.jpg,9.00,1
45166,2003-08-24,"[{'id': 127383, 'credit_id': '525389e019c29579...",5,"[{'id': 78798, 'name': 'Tom Kenny', 'credit_id...",34364,The Sum of His Parts,"Cyborg is reminded he's not completely human, ...",,1.0,604,/dE2SheFcNfY42lwYuR0yDqkKsWs.jpg,8.00,1
45167,2003-08-31,"[{'id': 127380, 'credit_id': '525389e119c29579...",6,"[{'id': 87957, 'name': 'Keith Szarabajka', 'cr...",34365,Nevermore,Raven's creepy tantrum of crackling black ener...,,1.0,604,/7eu11B7GBSB5mFVogVEZLomOlSb.jpg,10.00,1
45168,2003-09-07,[],7,"[{'id': 3801, 'name': 'Tracey Walter', 'credit...",34366,Switched,"Starfire and Raven don't really """"get each oth...",,1.0,604,/obx60faEoFYDGyYejAmgvLy9DjP.jpg,9.00,1
45169,2003-09-14,[],8,"[{'id': 1218736, 'name': 'Dave Coulier', 'cred...",34367,Deep Six,When a mysterious amphibious villain called Tr...,,1.0,604,/vcgpuEjnVfPgl1LIlFyypmie2p.jpg,8.00,1
45170,2003-09-21,"[{'id': 127380, 'credit_id': '525389e119c29579...",9,"[{'id': 2372, 'name': 'Ron Perlman', 'credit_i...",34368,Masks,Slade sends a team of Robotic Commandos to ste...,,1.0,604,/abFMJM2VpUcI47tYNM53nyO7h6S.jpg,9.00,1
45171,2003-09-28,"[{'id': 160092, 'credit_id': '525389e119c29579...",10,"[{'id': 56890, 'name': 'Malcolm McDowell', 'cr...",34369,Mad Mod,The Titans awake to find themselves captured a...,,1.0,604,/9BsqqTKZT4N7gXVj9tsICuIzY4Z.jpg,8.00,1


In [45]:
# Outer-join show rows to episode rows on the show id; columns present in
# both frames are disambiguated with the _show / _ep suffixes
merge_shows_episodes_df = pd.merge(
    tv_info_df,
    all_episodes_df,
    how='outer',
    left_on='id',
    right_on='show_id',
    suffixes=('_show', '_ep'),
)

In [46]:
# Preview the first rows of the merged show/episode frame
merge_shows_episodes_df.head()

Unnamed: 0,backdrop_path,created_by,episode_run_time,first_air_date,genres,homepage,id_show,in_production,languages,last_air_date,last_episode_to_air,name_show,networks,next_episode_to_air,number_of_episodes,number_of_seasons,origin_country,original_language,original_name,overview_show,popularity,poster_path,production_companies,seasons,status,type,vote_average_show,vote_count_show,air_date,crew,episode_number,guest_stars,id_ep,name_ep,overview_ep,production_code,season_number,show_id,still_path,vote_average_ep,vote_count_ep
0,,"[{'id': 383, 'credit_id': '525389d019c29579402...",[30],1969-01-12,[],,601,True,[en],1969-02-16,"{'air_date': '1969-02-16', 'episode_number': 6...",The Complete and Utter History of Britain,"[{'name': 'London Weekend Television', 'id': 1...",,6.0,1,[GB],en,The Complete and Utter History of Britain,The Complete And Utter History Of Britain was ...,0.6,,[],"[{'air_date': '1969-01-12', 'episode_count': 6...",Ended,Scripted,0.0,0,1969-01-12,"[{'id': 10707, 'credit_id': '525389cd19c295794...",1.0,[],34326.0,From the Dawn of History to the Normal Conquest,Satirical newscasts involve everything chronol...,,1.0,601.0,,0.0,0.0
1,,"[{'id': 383, 'credit_id': '525389d019c29579402...",[30],1969-01-12,[],,601,True,[en],1969-02-16,"{'air_date': '1969-02-16', 'episode_number': 6...",The Complete and Utter History of Britain,"[{'name': 'London Weekend Television', 'id': 1...",,6.0,1,[GB],en,The Complete and Utter History of Britain,The Complete And Utter History Of Britain was ...,0.6,,[],"[{'air_date': '1969-01-12', 'episode_count': 6...",Ended,Scripted,0.0,0,1969-01-19,"[{'id': 10707, 'credit_id': '525389cd19c295794...",2.0,[],34328.0,Richard the Lionheart to Robin the Hood,The late twelfth century is sent up.,,1.0,601.0,,0.0,0.0
2,,"[{'id': 383, 'credit_id': '525389d019c29579402...",[30],1969-01-12,[],,601,True,[en],1969-02-16,"{'air_date': '1969-02-16', 'episode_number': 6...",The Complete and Utter History of Britain,"[{'name': 'London Weekend Television', 'id': 1...",,6.0,1,[GB],en,The Complete and Utter History of Britain,The Complete And Utter History Of Britain was ...,0.6,,[],"[{'air_date': '1969-01-12', 'episode_count': 6...",Ended,Scripted,0.0,0,1969-01-26,"[{'id': 10707, 'credit_id': '525389cd19c295794...",3.0,[],34327.0,Edward the First to Richard the Last,Satirical newscast covers major events from 12...,,1.0,601.0,,0.0,0.0
3,,"[{'id': 383, 'credit_id': '525389d019c29579402...",[30],1969-01-12,[],,601,True,[en],1969-02-16,"{'air_date': '1969-02-16', 'episode_number': 6...",The Complete and Utter History of Britain,"[{'name': 'London Weekend Television', 'id': 1...",,6.0,1,[GB],en,The Complete and Utter History of Britain,The Complete And Utter History Of Britain was ...,0.6,,[],"[{'air_date': '1969-01-12', 'episode_count': 6...",Ended,Scripted,0.0,0,1969-02-02,"[{'id': 10707, 'credit_id': '525389cd19c295794...",4.0,[],34331.0,Perkin Warbeck to Bloody Mary,The early Tudor period is satirized.,,1.0,601.0,,0.0,0.0
4,,"[{'id': 383, 'credit_id': '525389d019c29579402...",[30],1969-01-12,[],,601,True,[en],1969-02-16,"{'air_date': '1969-02-16', 'episode_number': 6...",The Complete and Utter History of Britain,"[{'name': 'London Weekend Television', 'id': 1...",,6.0,1,[GB],en,The Complete and Utter History of Britain,The Complete And Utter History Of Britain was ...,0.6,,[],"[{'air_date': '1969-01-12', 'episode_count': 6...",Ended,Scripted,0.0,0,1969-02-09,"[{'id': 10707, 'credit_id': '525389cd19c295794...",5.0,[],34330.0,The Great and Glorious Age of Elizabeth,The late 16th Century is sent up in this fake ...,,1.0,601.0,,0.0,0.0


In [47]:
# (rows, columns) of the merged frame
merge_shows_episodes_df.shape

(1699153, 41)

In [51]:
# Fraction of NaN values per column of the merged frame
merge_shows_episodes_df.isna().sum() / len(merge_shows_episodes_df)

backdrop_path           0.348405
created_by              0.000000
episode_run_time        0.000000
first_air_date          0.044247
genres                  0.000000
homepage                0.000000
id_show                 0.000000
in_production           0.000000
languages               0.000000
last_air_date           0.039476
last_episode_to_air     0.039476
name_show               0.000000
networks                0.000000
next_episode_to_air     0.974313
number_of_episodes      0.000894
number_of_seasons       0.000000
origin_country          0.000000
original_language       0.000000
original_name           0.000000
overview_show           0.000000
popularity              0.000000
poster_path             0.276914
production_companies    0.000000
seasons                 0.000000
status                  0.000000
type                    0.000000
vote_average_show       0.000000
vote_count_show         0.000000
air_date                0.098899
crew                    0.017566
episode_nu

In [63]:
# replace empty lists with nan to get more accurate measure of missing values
# applymap(type).eq(list) flags cells holding a list; ~astype(bool) flags cells that are falsy.
# mask() replaces the cells where both hold (i.e. empty lists) with its default fill value, NaN.
merge_shows_episodes_df = merge_shows_episodes_df.mask(merge_shows_episodes_df.applymap(type).eq(list) & ~merge_shows_episodes_df.astype(bool))

In [64]:
# Fraction of NaN values per column of the merged frame
merge_shows_episodes_df.isna().sum() / len(merge_shows_episodes_df)

backdrop_path           0.348405
created_by              0.583551
episode_run_time        0.152288
first_air_date          0.044247
genres                  0.246189
homepage                0.000000
id_show                 0.000000
in_production           0.000000
languages               0.169211
last_air_date           0.039476
last_episode_to_air     0.039476
name_show               0.000000
networks                0.185211
next_episode_to_air     0.974313
number_of_episodes      0.000894
number_of_seasons       0.000000
origin_country          0.100657
original_language       0.000000
original_name           0.000000
overview_show           0.000000
popularity              0.000000
poster_path             0.276914
production_companies    0.623106
seasons                 0.016485
status                  0.000000
type                    0.000000
vote_average_show       0.000000
vote_count_show         0.000000
air_date                0.098899
crew                    0.910268
episode_nu

In [71]:
# Count rows per original_language value; any list value is converted to a tuple first
merge_shows_episodes_df.original_language.apply(lambda x: tuple(x) if type(x)==list else x).value_counts()

en    1089243
ja     114423
de     101310
fr      69720
es      53551
ko      47700
zh      31440
pt      31088
pl      22249
nl      21612
ru      15097
cs      11369
tr       9448
sv       9158
it       8690
no       6818
hi       6298
fi       5213
ar       4924
da       4868
th       3657
he       3510
hu       3022
ka       2999
ur       2778
sk       2233
cn       2033
ms       1740
bg       1362
ro       1140
       ...   
af         53
mo         45
ho         39
mk         37
lv         33
sl         33
sh         32
nn         30
za         29
ne         27
hz         27
jv         25
so         18
my         18
sq         18
fy         16
as         15
se         15
st         13
eu         12
am          6
lt          6
kk          6
si          3
az          2
km          2
av          1
pa          1
mi          1
ht          1
Name: original_language, Length: 86, dtype: int64

In [73]:
# Columns to remove from the merged show/episode frame before export
columns_to_drop = ['backdrop_path', 'homepage', 'last_episode_to_air', 
                   'next_episode_to_air', 'languages', 'overview_show',
                   'seasons', 'overview_ep', 'show_id', 'production_code', 'still_path']

In [78]:
# Drop the unwanted columns, then keep only rows whose original language is English
without_dropped_columns = merge_shows_episodes_df.drop(columns=columns_to_drop)
is_english = without_dropped_columns['original_language'] == 'en'

export_df = without_dropped_columns[is_english]

In [79]:
# Pickle the export frame. The with block closes the file on exit, so no
# explicit f.close() is needed afterwards.
with open('../data/processed/all_episodes.pkl', 'wb') as f:
    pickle.dump(export_df, f)