In [1]:
import time
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import random
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

#import gender_guesser.detector as gender
#from genderize import Genderize

#Dependancies for scraping IMDB
# from https://github.com/msaqib4203/IMDB-API
from bs4 import BeautifulSoup
import requests
import json

# data from:
# https://www.imdb.com/interfaces/


In [2]:
def parsePersons(persons):
    names = []
    if isinstance(persons,dict):
        names.append(persons['name'])
        return names

    for person in persons:
        if person['@type'] == "Person":
            names.append(person['name'])
    return names

def getJSON(html):
    data = {}
    data['id']          = html.find(attrs={'property':'pageId'})['content']
    data['url']         = 'https://www.imdb.com/title/'+ data['id']
    html_json           = html.find(attrs={'type':'application/ld+json'}).text.strip()
    fetchedJson         = json.loads(html_json)
    data['poster']      = html.find(attrs={'class':'poster'}).find('img')['src']
    title_wrapper       = html.find(attrs={'class':'title_wrapper'}).text.strip()
    data['title']       = title_wrapper[:title_wrapper.find(')')+1]
    data['rating']      = html.find(itemprop='ratingValue').text
    data['bestRating']  = html.find(itemprop='bestRating').text
    data['votes']       = html.find(itemprop='ratingCount').text
    data['rated']       = fetchedJson['contentRating']
    data['genres']      = fetchedJson['genre']
    data['description'] = fetchedJson['description']
    data['cast']        = parsePersons(fetchedJson['actor'])
    data['writers']     = parsePersons(fetchedJson['creator'])
    data['directors']   = parsePersons(fetchedJson['director'])
    return data 

def getHTML(url):
    response = requests.get(url)
    return BeautifulSoup(response.content,'html.parser')

def getURL(input):
    try:
        if input[0] == 't' and input[1] == 't':
            html = getHTML('https://www.imdb.com/title/'+input+'/')

        else:
            html = getHTML('https://www.google.co.in/search?q='+input)
            for cite in html.findAll('cite'):
                if 'imdb.com/title/tt' in cite.text:
                    html = getHTML(cite.text)
                    break
        return getJSON(html)
    except Exception as e:
        #print(e)
        return 'Invalid input or Network Error!'


# Import titles

In [3]:
fpath = "./title.basics.tsv/data.tsv"
df_title = pd.read_csv(fpath, sep='\t')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
df_title.head(10)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,\N,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"
5,tt0000006,short,Chinese Opium Den,Chinese Opium Den,0,1894,\N,1,Short
6,tt0000007,short,Corbett and Courtney Before the Kinetograph,Corbett and Courtney Before the Kinetograph,0,1894,\N,1,"Short,Sport"
7,tt0000008,short,Edison Kinetoscopic Record of a Sneeze,Edison Kinetoscopic Record of a Sneeze,0,1894,\N,1,"Documentary,Short"
8,tt0000009,movie,Miss Jerry,Miss Jerry,0,1894,\N,45,Romance
9,tt0000010,short,Exiting the Factory,La sortie de l'usine Lumière à Lyon,0,1895,\N,1,"Documentary,Short"


# Download by time periods

In [5]:
year = 2009

In [6]:
print('total number of titles', len(df_title[df_title['startYear'] != '\\N']))
dfdwn = df_title[(df_title['startYear'] != '\\N') & (df_title['isAdult'] == 0) & (df_title['titleType'] == 'movie')]
dfdwn = dfdwn[dfdwn['startYear'].astype(int) == year]
print()
print('Year to donwload            :', year)
print('number of titles to download:', len(dfdwn))
print()
dfdwn.head()

total number of titles 5844555

Year to donwload            : 2009
number of titles to download: 11098



Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
118654,tt0121569,movie,Moonlight Blade,Moonlight Blade,0,2009,\N,\N,"Mystery,Thriller"
133379,tt0137140,movie,One Bad Mice,One Bad Mice,0,2009,\N,88,\N
139441,tt0143558,movie,Apaföld,Apaföld,0,2009,\N,85,Drama
148655,tt0153140,movie,The Shark in Me,Rózsaszín sajt,0,2009,\N,93,"Comedy,Drama"
197492,tt0205380,movie,Sanam Teri Kasam,Sanam Teri Kasam,0,2009,\N,170,"Action,Drama,Romance"


In [None]:
start      = time.time()
df_descr   = pd.DataFrame(columns = ['tconst', 'description'])
dfshuffled = dfdwn[:].sample(frac=1)


i = 0
j = 0
for title in dfshuffled['tconst'].unique()[9000:]:
    j = j +1
    if j % 50==0:
        print('Attempted # movies, downloaded:',j,i)
    newinfo = getURL(title)
    try:
        new_row = [title, newinfo['description']]
        df_descr.loc[i] = new_row
        i = i+1
    except:
        pass
end = time.time()
print()
print('time elapsed:', end-start)
df_descr.to_csv('IMDB_movie_description_' + str(year) + '.csv')

Attempted # movies, downloaded: 50 10
Attempted # movies, downloaded: 100 22


In [11]:
df_descr.to_csv('IMDB_movie_description_' + str(year) + '.csv')