In [20]:
# Import tmdb api wrapper library
import tmdbsimple as tmdb

# Import JSON
import json

# import time for sleeping and scraping
import time

# Import the usual suspects
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
%matplotlib inline

pd.set_option('display.max_columns', 500)

# pickle for saving and loading scraped info
import pickle


from timeit import default_timer as timer


# Pandas profiling - great library for analyzing dataframes
import pandas_profiling as pp

In [2]:
# Import api key
tmdb.API_KEY = open('../api_key', 'rt').read()

In [3]:
# Load list of json info for all currently valid TV shows
tv_series_json = [json.loads(line) for line in open('../data/raw/tv_series_ids_07_09_2019.json', 'r')]

In [4]:
# Extract ids into list
tv_series_id = [x['id'] for x in tv_series_json]

In [5]:
print(tv_series_id[:100])

[601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702]


In [9]:
# Load existing pickles into list

tv_shows_info = []

f_num = 0
exists = True
while(exists):
    f_num += 1
    exists = os.path.isfile(
        '../data/processed/tv_shows_info_{}.pkl'.format(f_num))

    if exists:
        with open('../data/processed/tv_shows_info_{}.pkl'.format(f_num), 'rb') as f:
            mynewlist = pickle.load(f)
        tv_shows_info.extend(mynewlist)
    

len(tv_shows_info)



82817

In [10]:
already_scraped_id = [x['id'] for x in tv_shows_info]
print(len(already_scraped_id))
print(already_scraped_id[:100])

82817
[601, 602, 603, 604, 605, 606, 607, 608, 609, 610, 611, 612, 613, 614, 615, 616, 617, 618, 619, 620, 621, 622, 623, 625, 626, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 654, 655, 656, 657, 658, 660, 661, 662, 663, 664, 665, 666, 667, 668, 669, 670, 671, 672, 673, 674, 675, 676, 677, 678, 679, 680, 681, 682, 683, 684, 685, 686, 687, 688, 689, 690, 691, 692, 693, 694, 695, 696, 697, 698, 699, 700, 701, 702]


In [11]:
# scrape info for all tv shows
count = 0

remaining_tv_id = list(set(tv_series_id)-set(already_scraped_id))
print('number of shows to scrape: {}'.format(len(remaining_tv_id)))

# start new empty list to temporarily store results
temp_list = []
skipped_ids = []

start = timer()
for i in remaining_tv_id:
    no_show = True
    attempt = 0
    while no_show:
        try:
            tv_show = tmdb.TV(i)
            tv_info = tv_show.info()
            temp_list.append(tv_info)
            no_show = False
        except:
            time.sleep(1)
            attempt += 1
                # if trying same file multiple times, skip it
            if attempt>10:
                skipped_ids.append(i)
                print('skipped: {}'.format(i))
                break
       
    count+=1
    
    if count%100==0: 
        print(len(temp_list))
        end = timer()
        print('{:.2f} seconds'.format(end - start))
        start = timer()
    if count%1000==0:
        f_num = 0
        exists = True
        while(exists):
            f_num += 1
            exists = os.path.isfile(
                '../data/processed/tv_shows_info_{}.pkl'.format(f_num))
 
        if not exists:
            with open('../data/processed/tv_shows_info_{}.pkl'.format(f_num), 'wb') as f:
                pickle.dump(temp_list, f)
            print('saved to: ../data/processed/tv_shows_info_{}.pkl'.format(f_num))
            temp_list = []

            
# once more at the end to catch any remaining
if len(temp_list)>0:
    f_num = 0
    exists = True
    while(exists):
        f_num += 1
        exists = os.path.isfile(
            '../data/processed/tv_shows_info_{}.pkl'.format(f_num))

    if not exists:
        with open('../data/processed/tv_shows_info_{}.pkl'.format(f_num), 'wb') as f:
            pickle.dump(temp_list, f)
        print('saved to: ../data/processed/tv_shows_info_{}.pkl'.format(f_num))        

number of shows to scrape: 7
skipped: 88513
skipped: 90818
skipped: 84771
skipped: 89161
skipped: 30928
skipped: 90512
skipped: 90323


# filter for US shows

In [12]:
tv_info_df = pd.DataFrame(tv_shows_info)
tv_info_df.shape

(82817, 28)

In [13]:
tv_info_df.head()

Unnamed: 0,backdrop_path,created_by,episode_run_time,first_air_date,genres,homepage,id,in_production,languages,last_air_date,last_episode_to_air,name,networks,next_episode_to_air,number_of_episodes,number_of_seasons,origin_country,original_language,original_name,overview,popularity,poster_path,production_companies,seasons,status,type,vote_average,vote_count
0,,"[{'id': 383, 'credit_id': '525389d019c29579402...",[30],1969-01-12,[],,601,True,[en],1969-02-16,"{'air_date': '1969-02-16', 'episode_number': 6...",The Complete and Utter History of Britain,"[{'name': 'London Weekend Television', 'id': 1...",,6.0,1,[GB],en,The Complete and Utter History of Britain,The Complete And Utter History Of Britain was ...,0.6,,[],"[{'air_date': '1969-01-12', 'episode_count': 6...",Ended,Scripted,0.0,0
1,/5ekOlsSAEHTEx4XS0cSzAQLzKcV.jpg,"[{'id': 157395, 'credit_id': '525389dc19c29579...",[30],1966-09-06,"[{'id': 35, 'name': 'Comedy'}]",,602,False,[en],1967-04-06,"{'air_date': '1967-04-06', 'episode_number': 3...",Love on a Rooftop,"[{'name': 'ABC', 'id': 2, 'logo_path': '/ndAvF...",,32.0,1,[US],en,Love on a Rooftop,Love on a Rooftop was an American sitcom about...,1.313,/6sxAv9Mrw3e6WhQqd0lbRUmRZRg.jpg,[],"[{'air_date': '1966-09-06', 'episode_count': 3...",Ended,Scripted,0.0,0
2,/khveUylm0fqlLGuUHLg74tKozdy.jpg,"[{'id': 163461, 'credit_id': '525389de19c29579...",[24],1997-09-25,"[{'id': 35, 'name': 'Comedy'}]",,603,False,[en],2000-06-27,"{'air_date': '2000-06-27', 'episode_number': 1...",Veronica's Closet,"[{'name': 'NBC', 'id': 6, 'logo_path': '/o3Oed...",,62.0,3,[US],en,Veronica's Closet,Veronica 'Ronnie' Chase is the 'Queen of Roman...,2.223,/zDSdDiZgOMS7zAda01yzZJLUiAA.jpg,"[{'id': 1957, 'logo_path': '/nmcNfPq03WLtOyufJ...","[{'air_date': '1997-09-25', 'episode_count': 2...",Ended,Scripted,4.0,6
3,/hHwEptckXUwZM7XO2lxZ8w8upuU.jpg,"[{'id': 141043, 'credit_id': '525389ed19c29579...",[23],2003-07-20,"[{'id': 16, 'name': 'Animation'}, {'id': 10759...",http://cartoonnetwork.com/tv_shows/titans/inde...,604,False,[en],2006-01-16,"{'air_date': '2006-01-16', 'episode_number': 1...",Teen Titans,"[{'name': 'Cartoon Network', 'id': 56, 'logo_p...",,65.0,5,[US],en,Teen Titans,The Teen Titans are five heroes under one roof...,29.619,/tfdiVvJkYMbUOXDWibPjzu5dY6S.jpg,"[{'id': 429, 'logo_path': '/2Tc1P3Ac8M479naPp1...","[{'air_date': '2005-01-11', 'episode_count': 2...",Ended,Scripted,7.4,136
4,/4PhaaA593zHfAyYUIXVWTih3qcD.jpg,"[{'id': 69396, 'credit_id': '525389f019c295794...",[22],1996-09-27,"[{'id': 35, 'name': 'Comedy'}, {'id': 10751, '...",,605,False,[en],2003-04-24,"{'air_date': '2003-04-24', 'episode_number': 2...","Sabrina, the Teenage Witch","[{'name': 'ABC', 'id': 2, 'logo_path': '/ndAvF...",,163.0,7,[US],en,"Sabrina, the Teenage Witch","Sabrina, the Teenage Witch is an American sitc...",22.814,/8bwNu5JlRQyi6j6JO7UF42MAgX.jpg,"[{'id': 5452, 'logo_path': None, 'name': 'Hart...","[{'air_date': '1998-10-04', 'episode_count': 3...",Ended,Scripted,6.4,119


In [28]:
# english shows only

tv_info_df.original_language.value_counts()

english_tv_info_df = tv_info_df[tv_info_df.original_language=='en']

tv_info_df = None

english_tv_info_df.shape

(60613, 28)

In [31]:
english_tv_info_df.iloc[3, :]

backdrop_path                            /hHwEptckXUwZM7XO2lxZ8w8upuU.jpg
created_by              [{'id': 141043, 'credit_id': '525389ed19c29579...
episode_run_time                                                     [23]
first_air_date                                                 2003-07-20
genres                  [{'id': 16, 'name': 'Animation'}, {'id': 10759...
homepage                http://cartoonnetwork.com/tv_shows/titans/inde...
id                                                                    604
in_production                                                       False
languages                                                            [en]
last_air_date                                                  2006-01-16
last_episode_to_air     {'air_date': '2006-01-16', 'episode_number': 1...
name                                                          Teen Titans
networks                [{'name': 'Cartoon Network', 'id': 56, 'logo_p...
next_episode_to_air                   

In [39]:
test_season = tmdb.TV_Seasons(604, 0)

In [40]:
test_season_info = test_season.info()


In [41]:
test_season_info['episodes']

[{'air_date': '2005-01-11',
  'episode_number': 1,
  'id': 34430,
  'name': 'The Lost Episode',
  'overview': 'The Titans face a villain called Punk Rocket, who wields a mean guitar that emits crushing sonic waves.',
  'production_code': None,
  'season_number': 0,
  'show_id': 604,
  'still_path': None,
  'vote_average': 8.0,
  'vote_count': 1,
  'crew': [{'id': 127382,
    'credit_id': '525389e319c295794023a91c',
    'name': 'Matt Youngberg',
    'department': 'Directing',
    'job': 'Director',
    'gender': 2,
    'profile_path': None}],
  'guest_stars': []},
 {'air_date': '2006-09-15',
  'episode_number': 2,
  'id': 34429,
  'name': 'Trouble in Tokyo',
  'overview': 'When a Japanese villain attacks Titans Tower, Robin, Starfire, Cyborg, Raven, and Beast Boy spring into action. When Robin finds out that Saico-Tek was sent by a mysterious and menacing Japanese criminal known as Brushogun, and the Teen Titans travel to Tokyo to track the villain down.',
  'production_code': None,
  '