In [57]:
import requests
from bs4 import BeautifulSoup as bs
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set()

In [33]:
# Search-result URLs to crawl: ten result pages for every year from 1990 to
# 2017, ordered by year and then by page number.
page_template = 'https://www.imdb.com/search/title?year={}&title_type=feature&page={}&ref_=adv_nxt'
pages = [
    page_template.format(year, page_no)
    for year in range(1990, 2018)
    for page_no in range(1, 11)
]

In [34]:
def get_title(soup):
    """Return the text of the first ``<h1>`` in ``soup``, or None if absent.

    Surrounding whitespace is stripped and non-breaking spaces are replaced
    with ordinary spaces.
    """
    heading = soup.find('h1')
    if heading:
        cleaned = heading.text.strip()
        return cleaned.replace('\xa0', ' ')
    return None

def get_rating(soup):
    """Return the ``itemprop="ratingValue"`` text as a float, or None if absent."""
    rating_tag = soup.find(itemprop='ratingValue')
    return float(rating_tag.text) if rating_tag else None

def get_usercount(soup):
    """Return the leading token of the first text containing 'user'.

    Thousands separators (commas) are removed; the value is returned as a
    string. Returns None when no such text is found.
    """
    user_text = soup.find(text=re.compile('user'))
    if user_text:
        first_token = user_text.split()[0]
        return first_token.replace(',', '')
    return None

def get_critcount(soup):
    """Return the leading token of the second text containing 'critic'.

    Commas are removed and the value is returned as a string. Returns None
    when fewer than two matching texts are found.
    """
    critic_texts = soup.find_all(text=re.compile('critic'))
    # The count is read from the second hit, so zero or one hit means "missing".
    if not critic_texts or len(critic_texts) < 2:
        return None
    second_hit = critic_texts[1]
    return second_hit.split()[0].replace(',', '')
    
def get_metacritic(soup):
    """Return the ``metacriticScore`` element's text as an int, or None if absent."""
    score_tag = soup.find(class_='metacriticScore')
    if score_tag:
        score_text = score_tag.text.strip()
        return int(score_text)
    return None
    
def get_ratcount(soup):
    """Return the ``itemprop="ratingCount"`` text as an int, or None if absent.

    Commas used as thousands separators are removed before conversion.
    """
    count_tag = soup.find(itemprop='ratingCount')
    if count_tag:
        digits = count_tag.text.strip().replace(',', '')
        return int(digits)
    return None
    
def get_pop(soup):
    """Return the first token of the element preceding the popularity span.

    Looks up the first ``<span>`` whose class matches 'popularity', steps to
    the element before it, and returns the first whitespace-separated token
    of that element's text. Returns None if either element is missing.
    """
    marker = soup.find(name='span', class_=re.compile('popularity'))
    preceding = marker.findPrevious() if marker else None
    if not preceding:
        return None
    tokens = preceding.text.split()
    return tokens[0]
    
def get_pop_trend(soup):
    """Return the popularity change as a signed string, or None if not found.

    Looks up the first ``<span>`` whose class matches 'popularity' and reads
    the element that follows it. The text of that element is prefixed with
    '-' when its first class is 'popularityDown' and with '+' otherwise.

    Returns None when either the span or the following element is missing.
    """
    marker = soup.find(name='span', class_=re.compile('popularity'))
    if not marker:
        return None
    delta = marker.findNext()
    if not delta:
        return None
    # The following element is not guaranteed to carry a class attribute;
    # indexing attrs['class'][0] directly would raise and abort the scrape.
    # A missing or empty class is treated like any other non-"down" class.
    classes = delta.attrs.get('class') or []
    first_class = classes[0] if classes else None
    sign = '-' if first_class == 'popularityDown' else '+'
    return sign + delta.text
    
def get_plotkeys(soup):
    """Return the stripped texts of the plot keywords as a list, or None.

    Locates the text 'Plot Keywords:', climbs two parents up from it and
    collects every ``itemprop="keywords"`` element found there. Returns None
    when the label or the keyword elements are missing.
    """
    label = soup.find(text='Plot Keywords:')
    if not label:
        return None
    section = label.parent.parent
    keyword_tags = section.find_all(itemprop='keywords')
    if not keyword_tags:
        return None
    keywords = []
    for tag in keyword_tags:
        keywords.append(tag.text.strip())
    return keywords
    
def get_awards(soup):
    """Return the text of every ``awards-blurb`` element, or None if there are none.

    Runs of whitespace inside each blurb are collapsed to single spaces.
    """
    blurbs = soup.find_all(class_='awards-blurb')
    if not blurbs:
        return None
    normalised = []
    for blurb in blurbs:
        words = blurb.text.split()
        normalised.append(' '.join(words))
    return normalised

def get_genres(soup):
    """Return the link texts next to the 'Genres:' heading, or None.

    Finds the inline ``<h4>`` whose text is 'Genres:' and collects the
    stripped text of every ``<a>`` under that heading's parent. Returns None
    when the heading or the links are missing.
    """
    heading = soup.find('h4', class_='inline', text='Genres:')
    if not heading:
        return None
    links = heading.parent.find_all('a')
    if links:
        return [link.text.strip() for link in links]
    return None

def get_mpaa(soup):
    """Return the ``content`` attribute of the ``contentRating`` element.

    Returns None when no ``itemprop="contentRating"`` element exists, and
    also when the matched element carries no ``content`` attribute.
    """
    rating_tag = soup.find(itemprop='contentRating')
    if not rating_tag:
        return None
    # Use .get(): a matched element without a ``content`` attribute would
    # otherwise raise KeyError and abort the scrape.
    return rating_tag.attrs.get('content')

def get_release(soup):
    """Return the release date text following the 'Release Date:' heading.

    Reads the element two steps after the heading in document order, strips
    it, and drops everything from the last space onwards. Returns None when
    the heading or that element is missing.
    """
    heading = soup.find('h4', class_='inline', text='Release Date:')
    if not heading:
        return None
    raw_value = heading.next_element.next_element
    if not raw_value:
        return None
    # Split once from the right and keep the left part, i.e. discard the
    # final space-separated token.
    parts = raw_value.strip().rsplit(' ', 1)
    return parts[0]

def get_duration(soup):
    """Return the runtime text next to the 'Runtime:' heading, or None.

    Finds the inline ``<h4>`` whose text is 'Runtime:', then the
    ``itemprop="duration"`` element under its parent. Returns None when
    either is missing.
    """
    heading = soup.find('h4', class_='inline', text='Runtime:')
    if not heading:
        return None
    duration_tag = heading.parent.find(itemprop='duration')
    if duration_tag:
        # str.strip(' min') removes any leading/trailing run of the
        # characters ' ', 'm', 'i', 'n' -- it is a character set, not a suffix.
        return duration_tag.text.strip(' min')
    return None

def get_money(soup, money):
    """Return the amount shown after the heading ``money``, or None.

    ``money`` is the exact heading text to look for (for example 'Budget:').
    The element two steps after the heading in document order is stripped
    and has '$' and ',' removed; the result is returned as a string.
    Returns None when the heading or that element is missing.
    """
    heading = soup.find('h4', class_='inline', text=money)
    if not heading:
        return None
    raw_amount = heading.next_element.next_element
    if not raw_amount:
        return None
    amount = raw_amount.strip()
    for symbol in ('$', ','):
        amount = amount.replace(symbol, '')
    return amount

def get_movie_people(soup, prod):
    """Return the names listed next to the heading ``prod``, or None.

    ``prod`` is the exact heading text to look for (for example 'Stars:').
    Collects the stripped text of every ``itemprop="name"`` element under
    the heading's parent. Returns None when the heading or the names are
    missing.
    """
    heading = soup.find('h4', class_='inline', text=prod)
    if not heading:
        return None
    name_tags = heading.parent.find_all(itemprop='name')
    if name_tags:
        return [tag.text.strip() for tag in name_tags]
    return None

def get_country(soup):
    """Return the text of the element following the 'Country:' label, or None.

    Returns None when the label text or the element after it is missing.
    """
    label = soup.find(text='Country:')
    following = label.findNext() if label else None
    if following:
        return following.text
    return None

In [28]:
def _append_movie_rows(rows, path, columns):
    """Append ``rows`` (a list of dicts) to ``path`` as header-less CSV.

    ``columns`` fixes the column order explicitly so the file layout does not
    depend on how the installed pandas orders dict keys. Returns the frame
    that was written.
    """
    frame = pd.DataFrame(rows, columns=columns)
    frame.to_csv(path, mode='a', index=False, header=False)
    return frame


# Scrape the full attribute set for every rated movie on the search pages and
# append the rows to 'movies.csv' in batches of five search pages.
# NOTE(review): a later cell reads 'movies_raw.csv', not 'movies.csv' --
# presumably this output was renamed by hand; confirm before a fresh run.
movie_list = []
url = 'https://www.imdb.com{}'
headers = ['Title', 'IMDBRating', 'UserCount', 'CriticCount', 'Metascore', 'Popularity', 'PopularityTrend',
           'PlotKeywords', 'IMDBRatingCount', 'Awards', 'Genres', 'MPAA', 'ReleaseDate', 'RunTime', 'Budget',
           'Opening', 'GrossUSA', 'GrossIntl', 'ProdCo', 'Stars']
# The CSV has no header row, so column order is its only schema. Writing the
# columns in sorted order matches the reader (names=sorted(...)) regardless of
# whether pandas sorts dict keys or keeps their insertion order.
csv_columns = sorted(headers)
request_timeout = 30  # seconds; without a timeout requests.get can block forever

try:
    # NOTE(review): the [5:] offset skips the first five search pages; it looks
    # like a leftover from resuming an interrupted run -- confirm.
    for j, page in enumerate(pages[5:]):
        print(page)
        soup_page = bs(requests.get(page, timeout=request_timeout).text, 'lxml')
        lst = soup_page.find_all(class_='lister-item-header')
        movie_urls = [url.format(item.find('a').attrs['href']) for item in lst]
        for i, movie_url in enumerate(movie_urls):
            resp = requests.get(movie_url, timeout=request_timeout)
            soup = bs(resp.text, 'lxml')
            rating = get_rating(soup)
            if rating is None:
                # Movies without a rating are skipped entirely.
                continue
            title = get_title(soup)
            user = get_usercount(soup)
            critic = get_critcount(soup)
            metascore = get_metacritic(soup)
            pop = get_pop(soup)
            pop_trend = get_pop_trend(soup)
            plot_keys = get_plotkeys(soup)
            rating_count = get_ratcount(soup)
            awards = get_awards(soup)
            genres = get_genres(soup)
            mpaa = get_mpaa(soup)
            release = get_release(soup)
            runtime = get_duration(soup)
            budget = get_money(soup, 'Budget:')
            opening = get_money(soup, 'Opening Weekend USA:')
            us_g = get_money(soup, 'Gross USA:')
            world_g = get_money(soup, 'Cumulative Worldwide Gross:')
            prod_co = get_movie_people(soup, 'Production Co:')
            stars = get_movie_people(soup, 'Stars:')
            # Values are listed in the same order as ``headers``.
            movie_dict = dict(zip(headers, [title, rating, user, critic, metascore, pop, pop_trend, plot_keys,
                                            rating_count, awards, genres, mpaa, release, runtime, budget,
                                            opening, us_g, world_g, prod_co, stars]))
            movie_list.append(movie_dict)
            if (i + 1) % 10 == 0:
                print(title)
        # Flush every fifth search page to bound memory use and data loss.
        if (j + 1) % 5 == 0 and movie_list:
            movies = _append_movie_rows(movie_list, 'movies.csv', csv_columns)
            movie_list = []
finally:
    # Also runs on errors and interrupts: write whatever is still buffered so
    # a failure or a trailing partial batch does not discard scraped rows.
    if movie_list:
        movies = _append_movie_rows(movie_list, 'movies.csv', csv_columns)
        movie_list = []

https://www.imdb.com/search/title?year=2004&title_type=feature&page=6&ref_=adv_nxt
Decoys (2004)
Appleseed (2004)
Hellbent (2004)
Musafir (2004)
Haven (2004)
https://www.imdb.com/search/title?year=2004&title_type=feature&page=7&ref_=adv_nxt
The Lizard (2004)
Elvis Has Left the Building (2004)
Saint Ralph (2004)
Sud pralad (2004)
Visions of Europe (2004)
https://www.imdb.com/search/title?year=2004&title_type=feature&page=8&ref_=adv_nxt
Walk on Water (2004)
Imaginary Heroes (2004)
Survive Style 5+ (2004)
Konkurîto (2004)
Síndrome (2004)
https://www.imdb.com/search/title?year=2004&title_type=feature&page=9&ref_=adv_nxt
Tumsa Nahin Dekha (2004)
Pluk van de petteflet (2004)
The Calcium Kid (2004)
Freshman Orientation (2004)
https://www.imdb.com/search/title?year=2004&title_type=feature&page=10&ref_=adv_nxt
Strings (2004)
Bunshinsaba (2004)
Clipping Adam (2004)
Kansen (2004)
Hawas (2004)
https://www.imdb.com/search/title?year=2005&title_type=feature&page=1&ref_=adv_nxt
The Longest Yard (2005

In [37]:
def _append_country_rows(rows, path, columns):
    """Append ``rows`` (a list of dicts) to ``path`` as header-less CSV.

    ``columns`` fixes the column order explicitly so the file layout does not
    depend on how the installed pandas orders dict keys. Returns the frame
    that was written.
    """
    frame = pd.DataFrame(rows, columns=columns)
    frame.to_csv(path, mode='a', index=False, header=False)
    return frame


# Second pass over the same search pages: collect only title and country for
# every rated movie and append the rows to 'movies_country.csv' in batches of
# twenty search pages.
movie_list = []
url = 'https://www.imdb.com{}'
headers = ['Title', 'Country']
# The CSV has no header row, so column order is its only schema. Writing the
# columns in sorted order matches the reader (names=sorted(headers)) whether
# pandas sorts dict keys or keeps their insertion order.
csv_columns = sorted(headers)
request_timeout = 30  # seconds; without a timeout requests.get can block forever

try:
    for j, page in enumerate(pages):
        print(page)
        soup_page = bs(requests.get(page, timeout=request_timeout).text, 'lxml')
        lst = soup_page.find_all(class_='lister-item-header')
        movie_urls = [url.format(item.find('a').attrs['href']) for item in lst]
        for i, movie_url in enumerate(movie_urls):
            resp = requests.get(movie_url, timeout=request_timeout)
            soup = bs(resp.text, 'lxml')
            rating = get_rating(soup)
            if rating is None:
                # Skip unrated movies, mirroring the filter of the full scrape.
                continue
            title = get_title(soup)
            country = get_country(soup)
            movie_dict = dict(zip(headers, [title, country]))
            movie_list.append(movie_dict)
            # Progress output only for every 1000th movie of a search page.
            if (i + 1) % 1000 == 0:
                print(title)
        # Flush every twentieth search page to bound memory use and data loss.
        if (j + 1) % 20 == 0 and movie_list:
            movies = _append_country_rows(movie_list, 'movies_country.csv', csv_columns)
            movie_list = []
finally:
    # Also runs on errors and interrupts: write whatever is still buffered so
    # a failure or a trailing partial batch does not discard scraped rows.
    if movie_list:
        movies = _append_country_rows(movie_list, 'movies_country.csv', csv_columns)
        movie_list = []

https://www.imdb.com/search/title?year=1990&title_type=feature&page=1&ref_=adv_nxt
https://www.imdb.com/search/title?year=1990&title_type=feature&page=2&ref_=adv_nxt
https://www.imdb.com/search/title?year=1990&title_type=feature&page=3&ref_=adv_nxt
https://www.imdb.com/search/title?year=1990&title_type=feature&page=4&ref_=adv_nxt
https://www.imdb.com/search/title?year=1990&title_type=feature&page=5&ref_=adv_nxt
https://www.imdb.com/search/title?year=1990&title_type=feature&page=6&ref_=adv_nxt
https://www.imdb.com/search/title?year=1990&title_type=feature&page=7&ref_=adv_nxt
https://www.imdb.com/search/title?year=1990&title_type=feature&page=8&ref_=adv_nxt
https://www.imdb.com/search/title?year=1990&title_type=feature&page=9&ref_=adv_nxt
https://www.imdb.com/search/title?year=1990&title_type=feature&page=10&ref_=adv_nxt
https://www.imdb.com/search/title?year=1991&title_type=feature&page=1&ref_=adv_nxt
https://www.imdb.com/search/title?year=1991&title_type=feature&page=2&ref_=adv_nxt
htt

In [58]:
# Load the two header-less scrape outputs. Column names are supplied here in
# sorted order. They are spelled out explicitly instead of reusing the
# `headers` variable left behind by a scraping cell, so this cell also works
# on a fresh kernel and does not depend on which scrape cell ran last.
country_columns = sorted(['Title', 'Country'])
movie_columns = sorted(['Title', 'IMDBRating', 'UserCount', 'CriticCount', 'Metascore', 'Popularity',
                        'PopularityTrend', 'PlotKeywords', 'IMDBRatingCount', 'Awards', 'Genres', 'MPAA',
                        'ReleaseDate', 'RunTime', 'Budget', 'Opening', 'GrossUSA', 'GrossIntl', 'ProdCo',
                        'Stars'])
countries = pd.read_csv('movies_country.csv', names=country_columns)
movs = pd.read_csv('movies_raw.csv', names=movie_columns)

In [60]:
# Attach the country to every scraped movie, matching on the title string.
# The country file is written in append mode, so a title can occur more than
# once; a left merge would then emit one output row per duplicate. Keep only
# the first country row per title so each movie row is preserved exactly once.
countries_unique = countries.drop_duplicates(subset='Title')
movies = pd.merge(movs, countries_unique, how='left', on='Title')
# Reorder the columns into the layout used for the final export.
movies = movies[['Title', 'IMDBRating', 'IMDBRatingCount', 'ReleaseDate', 'RunTime', 'UserCount',
                 'CriticCount', 'Metascore', 'Popularity', 'PopularityTrend', 'Budget', 'Opening',
                 'GrossUSA', 'GrossIntl', 'Country', 'PlotKeywords', 'Awards', 'Genres', 'MPAA',
                 'ProdCo', 'Stars']]

In [62]:
# Write the merged table to 'movies.csv' with an explicit header row and no
# index column. This replaces any existing file of that name.
final_columns = [
    'Title', 'IMDBRating', 'IMDBRatingCount', 'ReleaseDate', 'RunTime',
    'UserCount', 'CriticCount', 'Metascore', 'Popularity', 'PopularityTrend',
    'Budget', 'Opening', 'GrossUSA', 'GrossIntl', 'Country',
    'PlotKeywords', 'Awards', 'Genres', 'MPAA', 'ProdCo', 'Stars',
]
movies.to_csv('movies.csv', header=final_columns, index=False)