In [1]:
import time
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import random
import itertools

# Stretch the notebook container to the full browser width so wide
# DataFrame previews are not cropped.
# `display` and `HTML` are imported from the public `IPython.display`
# module; importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))


# Dependencies for scraping IMDB
# from https://github.com/msaqib4203/IMDB-API
from bs4 import BeautifulSoup
import requests
import json

# data from:
# https://www.imdb.com/interfaces/


In [2]:
def getHTML(url, timeout=30):
    """Download ``url`` and return its body parsed with BeautifulSoup.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Without a timeout a single stalled connection would block the
        caller indefinitely.

    Returns
    -------
    BeautifulSoup
        Parse tree of the response body, built with ``'html.parser'``.

    Raises
    ------
    requests.exceptions.RequestException
        If the connection fails or the timeout expires.
    """
    response = requests.get(url, timeout=timeout)
    # The HTTP status is not inspected here: error pages are parsed like any
    # other page and it is up to the caller to cope with missing elements.
    return BeautifulSoup(response.content, 'html.parser')

def get_description(input):
    """Return the 'description' entry published on a title's IMDB page.

    ``input`` is the title identifier spliced into
    ``https://www.imdb.com/title/<input>/``. The text of the page element
    whose ``type`` attribute is ``application/ld+json`` is decoded as JSON
    and its 'description' value is returned.

    Any failure (element missing, invalid JSON, no 'description' key)
    propagates to the caller as an exception.
    """
    title_url = 'https://www.imdb.com/title/' + input + '/'
    ld_json_tag = getHTML(title_url).find(attrs={'type': 'application/ld+json'})
    metadata = json.loads(ld_json_tag.text.strip())
    return metadata['description']

def get_ratings(input):
    """Fetch a title's ratings page and extract both vote summaries.

    ``input`` is the title identifier spliced into
    ``https://www.imdb.com/title/<input>/ratings/``.

    Returns a 2-tuple ``(histogram_votes, table_votes)`` holding the results
    of ``get_histogram_votes`` and ``get_table_votes`` for the same page.
    """
    ratings_page = getHTML('https://www.imdb.com/title/' + input + '/ratings/')
    # The page is downloaded once and shared by both extractors.
    return get_histogram_votes(ratings_page), get_table_votes(ratings_page)

def get_histogram_votes(page):
    """Collect the stripped cell texts of the histogram rows of ``page``.

    Looks at the ``<tr>`` elements at positions 1 through 10 of the page
    (fewer if the page has fewer rows) and, within each, every ``<td>`` from
    the third one onwards. The texts are returned as a flat list of strings
    in document order.
    """
    return [
        cell.text.strip()
        for row in page.find_all('tr')[1:11]
        for cell in row.find_all('td')[2:]
    ]

def get_table_votes(page):
    """Flatten table rows 12, 13 and 14 of ``page`` into a list of strings.

    For each of those ``<tr>`` elements, every ``<td>`` after the first is
    split on whitespace and the resulting tokens are appended in order,
    with commas removed and every '-' replaced by 'NaN'.

    A token that is exactly '-' is emitted twice.
    NOTE(review): presumably so that a cell without data still fills two
    output slots like a populated cell would — confirm against the page.

    Raises ``IndexError`` when the page has fewer than 15 ``<tr>`` elements.
    """
    flattened = []
    for row_index in (12, 13, 14):
        data_cells = page.find_all('tr')[row_index].find_all('td')[1:]
        for cell in data_cells:
            for token in cell.text.split():
                cleaned = token.replace(',', '').replace('-', 'NaN')
                repeat = 2 if token == '-' else 1
                flattened.extend([cleaned] * repeat)
    return flattened


# Import titles

In [3]:
# Read the tab-separated title table into `df_title` and preview its first
# five rows (the bare last expression is what the notebook displays).
fpath = "./title.basics.tsv/data.tsv"
df_title = pd.read_csv(fpath, delimiter='\t')
df_title.head()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,\N,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Download by time periods

In [4]:
# Dry run for one release year: report how many titles would be downloaded.
year = 2000

# Missing start years are stored as the literal string '\N'.
print('total number of titles', len(df_title.loc[df_title['startYear'] != '\\N']))

# Keep non-adult movies with a known start year, then narrow to `year`.
# The integer cast happens only after the '\N' rows have been dropped.
dfdwn = (
    df_title
    .loc[(df_title['titleType'] == 'movie')
         & (df_title['isAdult'] == 0)
         & (df_title['startYear'] != '\\N')]
    .loc[lambda frame: frame['startYear'].astype(int) == year]
)

print()
print('Year to donwload            :', year)
print('number of titles to download:', dfdwn.shape[0])
print()
dfdwn.head()

total number of titles 5844555

Year to donwload            : 2000
number of titles to download: 4877



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
100128,tt0102362,movie,Istota,Istota,0,2000,\N,80,"Drama,Romance"
105380,tt0107706,movie,Nothing,Nothing,0,2000,\N,\N,\N
110583,tt0113026,movie,The Fantasticks,The Fantasticks,0,2000,\N,86,"Musical,Romance"
110641,tt0113086,movie,Florentino y el diablo,Florentino y el diablo,0,2000,\N,\N,Drama
110647,tt0113092,movie,For the Cause,For the Cause,0,2000,\N,100,"Action,Adventure,Drama"


In [7]:
# Column layouts of the two per-year ratings tables written below.
# Both start with the title id, followed by as many value columns as
# get_histogram_votes / get_table_votes are expected to return.
# NOTE(review): the names suggest '1'..'10' are star ratings and the
# All_/M_/F_ columns are demographic groups with a V and an N value each;
# the pairing of V/N with the scraped tokens is not visible here — confirm.
cols_hv = ['tconst', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']
cols_tv = ['tconst',
           'All_V', 'All_N', 'All_18V', 'All_18N', 'All_1829V', 'All_1829N', 'All_3044V', 'All_3044N', 'All_45V', 'All_45N',
           'M_V'  , 'M_N'  , 'M_18V'  , 'M_18N'  , 'M_1829V'  , 'M_1829N'  , 'M_3044V'  , 'M_3044N'  , 'M_45V'  , 'M_45N'  ,
           'F_V'  , 'F_N'  , 'F_18V'  , 'F_18N'  , 'F_1829V'  , 'F_1829N'  , 'F_3044V'  , 'F_3044N'  , 'F_45V'  , 'F_45N'  ]

# Loop-invariant work, done once instead of once per year: the count of
# titles with a known start year, the non-adult movies among them, and
# their start year as integers (safe to cast once the '\N' rows are gone).
n_titles_with_year = len(df_title[df_title['startYear'] != '\\N'])
movies_with_year   = df_title[(df_title['startYear'] != '\\N') & (df_title['isAdult'] == 0) & (df_title['titleType'] == 'movie')]
movie_start_year   = movies_with_year['startYear'].astype(int)

for year in [2001,2002,2003,2004,2005,2006,2007,2008,2009,2010]:
    print('total number of titles', n_titles_with_year)
    dfdwn = movies_with_year[movie_start_year == year]
    print()
    print('Year to donwload            :', year)
    print('number of titles to download:', len(dfdwn))
    print()

    # Rows are collected in plain lists and turned into DataFrames once per
    # year; growing a DataFrame one `.loc` row at a time copies data on
    # every insertion.
    descr_rows     = []
    histogram_rows = []
    table_rows     = []

    ide   = 0   # titles whose description was scraped
    ira   = 0   # titles whose ratings were scraped
    j     = 0   # titles attempted
    start = time.time()
    for title in dfdwn['tconst'].unique():
        j = j+1
        if j % 50==0:
            print('Attempted # movies, description, ratings:',j,ide,ira)

        # Scraping is best effort: a title whose page is missing or has an
        # unexpected layout is skipped. `Exception` (not a bare `except`)
        # keeps KeyboardInterrupt working, so the run can be stopped.
        try:
            description = get_description(title)
        except Exception:
            pass
        else:
            descr_rows.append([title, description])
            ide = ide+1

        try:
            histogram, table_votes = get_ratings(title)
        except Exception:
            pass
        else:
            # Keep the two ratings tables row-aligned: a title is stored only
            # when both results have exactly the expected number of values.
            if len(histogram) == len(cols_hv) - 1 and len(table_votes) == len(cols_tv) - 1:
                histogram_rows.append([title] + histogram)
                table_rows.append([title] + table_votes)
                ira = ira+1

    df_descr           = pd.DataFrame(descr_rows,     columns = ['tconst', 'description'])
    df_histogram_votes = pd.DataFrame(histogram_rows, columns = cols_hv)
    df_table_votes     = pd.DataFrame(table_rows,     columns = cols_tv)

    end = time.time()
    print()
    print('time elapsed:', end-start)
    df_descr.to_csv(          'IMDB_movie_description_'     + str(year) + '.csv')
    df_histogram_votes.to_csv('IMDB_movie_histogram_votes_' + str(year) + '.csv')
    df_table_votes.to_csv(    'IMDB_movie_table_votes_'     + str(year) + '.csv')

total number of titles 5844555

Year to donwload            : 2001
number of titles to download: 5220

Attempted # movies, description, ratings: 50 42 46
Attempted # movies, description, ratings: 100 85 92
Attempted # movies, description, ratings: 150 124 137
Attempted # movies, description, ratings: 200 162 181
Attempted # movies, description, ratings: 250 202 229
Attempted # movies, description, ratings: 300 245 277
Attempted # movies, description, ratings: 350 287 324
Attempted # movies, description, ratings: 400 329 370
Attempted # movies, description, ratings: 450 371 417
Attempted # movies, description, ratings: 500 414 464
Attempted # movies, description, ratings: 550 459 511
Attempted # movies, description, ratings: 600 507 559
Attempted # movies, description, ratings: 650 547 607
Attempted # movies, description, ratings: 700 594 653
Attempted # movies, description, ratings: 750 637 700
Attempted # movies, description, ratings: 800 677 743
Attempted # movies, description, ratin

Attempted # movies, description, ratings: 1950 1463 1755
Attempted # movies, description, ratings: 2000 1490 1798
Attempted # movies, description, ratings: 2050 1527 1843
Attempted # movies, description, ratings: 2100 1553 1885
Attempted # movies, description, ratings: 2150 1589 1934
Attempted # movies, description, ratings: 2200 1622 1978
Attempted # movies, description, ratings: 2250 1644 2014
Attempted # movies, description, ratings: 2300 1675 2057
Attempted # movies, description, ratings: 2350 1712 2104
Attempted # movies, description, ratings: 2400 1746 2147
Attempted # movies, description, ratings: 2450 1774 2190
Attempted # movies, description, ratings: 2500 1805 2234
Attempted # movies, description, ratings: 2550 1838 2278
Attempted # movies, description, ratings: 2600 1862 2320
Attempted # movies, description, ratings: 2650 1892 2361
Attempted # movies, description, ratings: 2700 1910 2396
Attempted # movies, description, ratings: 2750 1933 2436
Attempted # movies, description

Attempted # movies, description, ratings: 3650 2556 3157
Attempted # movies, description, ratings: 3700 2573 3183
Attempted # movies, description, ratings: 3750 2591 3210
Attempted # movies, description, ratings: 3800 2607 3238
Attempted # movies, description, ratings: 3850 2632 3267
Attempted # movies, description, ratings: 3900 2651 3286
Attempted # movies, description, ratings: 3950 2676 3315
Attempted # movies, description, ratings: 4000 2694 3340
Attempted # movies, description, ratings: 4050 2711 3367
Attempted # movies, description, ratings: 4100 2729 3381
Attempted # movies, description, ratings: 4150 2746 3395
Attempted # movies, description, ratings: 4200 2765 3411
Attempted # movies, description, ratings: 4250 2783 3437
Attempted # movies, description, ratings: 4300 2805 3456
Attempted # movies, description, ratings: 4350 2817 3464
Attempted # movies, description, ratings: 4400 2835 3477
Attempted # movies, description, ratings: 4450 2853 3489
Attempted # movies, description

Attempted # movies, description, ratings: 5300 3359 3924
Attempted # movies, description, ratings: 5350 3368 3933
Attempted # movies, description, ratings: 5400 3381 3947
Attempted # movies, description, ratings: 5450 3395 3958
Attempted # movies, description, ratings: 5500 3406 3964
Attempted # movies, description, ratings: 5550 3409 3968
Attempted # movies, description, ratings: 5600 3415 3975
Attempted # movies, description, ratings: 5650 3424 3979
Attempted # movies, description, ratings: 5700 3437 3988
Attempted # movies, description, ratings: 5750 3447 3992
Attempted # movies, description, ratings: 5800 3465 3994
Attempted # movies, description, ratings: 5850 3487 4005
Attempted # movies, description, ratings: 5900 3496 4008
Attempted # movies, description, ratings: 5950 3516 4011
Attempted # movies, description, ratings: 6000 3528 4015
Attempted # movies, description, ratings: 6050 3542 4017
Attempted # movies, description, ratings: 6100 3555 4022
Attempted # movies, description

Attempted # movies, description, ratings: 6300 3910 4419
Attempted # movies, description, ratings: 6350 3920 4424
Attempted # movies, description, ratings: 6400 3925 4429
Attempted # movies, description, ratings: 6450 3925 4429
Attempted # movies, description, ratings: 6500 3931 4441
Attempted # movies, description, ratings: 6550 3944 4451
Attempted # movies, description, ratings: 6600 3961 4457
Attempted # movies, description, ratings: 6650 3984 4465
Attempted # movies, description, ratings: 6700 3992 4468
Attempted # movies, description, ratings: 6750 4004 4473
Attempted # movies, description, ratings: 6800 4019 4479
Attempted # movies, description, ratings: 6850 4030 4484
Attempted # movies, description, ratings: 6900 4046 4487
Attempted # movies, description, ratings: 6950 4058 4494

time elapsed: 8540.508304357529
total number of titles 5844555

Year to donwload            : 2006
number of titles to download: 7482

Attempted # movies, description, ratings: 50 41 45
Attempted # mov

Attempted # movies, description, ratings: 6500 4336 4810
Attempted # movies, description, ratings: 6550 4354 4827
Attempted # movies, description, ratings: 6600 4370 4841
Attempted # movies, description, ratings: 6650 4388 4849
Attempted # movies, description, ratings: 6700 4398 4858
Attempted # movies, description, ratings: 6750 4410 4870
Attempted # movies, description, ratings: 6800 4423 4878
Attempted # movies, description, ratings: 6850 4436 4889
Attempted # movies, description, ratings: 6900 4451 4902
Attempted # movies, description, ratings: 6950 4461 4909
Attempted # movies, description, ratings: 7000 4481 4922
Attempted # movies, description, ratings: 7050 4500 4931
Attempted # movies, description, ratings: 7100 4517 4944
Attempted # movies, description, ratings: 7150 4524 4949
Attempted # movies, description, ratings: 7200 4542 4960
Attempted # movies, description, ratings: 7250 4562 4970
Attempted # movies, description, ratings: 7300 4572 4973
Attempted # movies, description

Attempted # movies, description, ratings: 6200 4380 4812
Attempted # movies, description, ratings: 6250 4405 4827
Attempted # movies, description, ratings: 6300 4428 4846
Attempted # movies, description, ratings: 6350 4449 4859
Attempted # movies, description, ratings: 6400 4465 4873
Attempted # movies, description, ratings: 6450 4481 4886
Attempted # movies, description, ratings: 6500 4498 4903
Attempted # movies, description, ratings: 6550 4513 4921
Attempted # movies, description, ratings: 6600 4533 4931
Attempted # movies, description, ratings: 6650 4544 4940
Attempted # movies, description, ratings: 6700 4558 4953
Attempted # movies, description, ratings: 6750 4579 4966
Attempted # movies, description, ratings: 6800 4596 4978
Attempted # movies, description, ratings: 6850 4618 4993
Attempted # movies, description, ratings: 6900 4636 5003
Attempted # movies, description, ratings: 6950 4648 5013
Attempted # movies, description, ratings: 7000 4668 5032
Attempted # movies, description

Attempted # movies, description, ratings: 5250 3967 4370
Attempted # movies, description, ratings: 5300 4000 4399
Attempted # movies, description, ratings: 5350 4032 4418
Attempted # movies, description, ratings: 5400 4062 4437
Attempted # movies, description, ratings: 5450 4091 4458
Attempted # movies, description, ratings: 5500 4116 4487
Attempted # movies, description, ratings: 5550 4147 4514
Attempted # movies, description, ratings: 5600 4177 4537
Attempted # movies, description, ratings: 5650 4207 4566
Attempted # movies, description, ratings: 5700 4236 4593
Attempted # movies, description, ratings: 5750 4258 4618
Attempted # movies, description, ratings: 5800 4291 4651
Attempted # movies, description, ratings: 5850 4322 4679
Attempted # movies, description, ratings: 5900 4350 4699
Attempted # movies, description, ratings: 5950 4378 4731
Attempted # movies, description, ratings: 6000 4401 4753
Attempted # movies, description, ratings: 6050 4433 4784
Attempted # movies, description

Attempted # movies, description, ratings: 2950 2492 2529
Attempted # movies, description, ratings: 3000 2525 2567
Attempted # movies, description, ratings: 3050 2557 2604
Attempted # movies, description, ratings: 3100 2594 2644
Attempted # movies, description, ratings: 3150 2625 2675
Attempted # movies, description, ratings: 3200 2662 2711
Attempted # movies, description, ratings: 3250 2696 2755
Attempted # movies, description, ratings: 3300 2729 2794
Attempted # movies, description, ratings: 3350 2763 2821
Attempted # movies, description, ratings: 3400 2799 2864
Attempted # movies, description, ratings: 3450 2837 2892
Attempted # movies, description, ratings: 3500 2870 2928
Attempted # movies, description, ratings: 3550 2903 2960
Attempted # movies, description, ratings: 3600 2943 2999
Attempted # movies, description, ratings: 3650 2983 3030
Attempted # movies, description, ratings: 3700 3021 3062
Attempted # movies, description, ratings: 3750 3053 3099
Attempted # movies, description

Attempted # movies, description, ratings: 10150 6614 6500
Attempted # movies, description, ratings: 10200 6624 6506
Attempted # movies, description, ratings: 10250 6641 6515
Attempted # movies, description, ratings: 10300 6658 6523
Attempted # movies, description, ratings: 10350 6671 6527
Attempted # movies, description, ratings: 10400 6685 6532
Attempted # movies, description, ratings: 10450 6701 6544
Attempted # movies, description, ratings: 10500 6721 6547
Attempted # movies, description, ratings: 10550 6749 6557
Attempted # movies, description, ratings: 10600 6762 6568
Attempted # movies, description, ratings: 10650 6767 6576
Attempted # movies, description, ratings: 10700 6767 6578
Attempted # movies, description, ratings: 10750 6788 6587
Attempted # movies, description, ratings: 10800 6808 6593
Attempted # movies, description, ratings: 10850 6831 6596
Attempted # movies, description, ratings: 10900 6849 6602
Attempted # movies, description, ratings: 10950 6861 6605
Attempted # mo

Attempted # movies, description, ratings: 6250 4724 4654
Attempted # movies, description, ratings: 6300 4762 4683
Attempted # movies, description, ratings: 6350 4794 4711
Attempted # movies, description, ratings: 6400 4824 4738
Attempted # movies, description, ratings: 6450 4863 4767
Attempted # movies, description, ratings: 6500 4894 4793
Attempted # movies, description, ratings: 6550 4925 4825
Attempted # movies, description, ratings: 6600 4959 4854
Attempted # movies, description, ratings: 6650 4996 4886
Attempted # movies, description, ratings: 6700 5027 4915
Attempted # movies, description, ratings: 6750 5060 4950
Attempted # movies, description, ratings: 6800 5096 4978
Attempted # movies, description, ratings: 6850 5128 5012
Attempted # movies, description, ratings: 6900 5162 5041
Attempted # movies, description, ratings: 6950 5201 5068
Attempted # movies, description, ratings: 7000 5229 5096
Attempted # movies, description, ratings: 7050 5261 5121
Attempted # movies, description