In [31]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scraping_class
from matplotlib.dates import DateFormatter
import seaborn as sns
from bs4 import BeautifulSoup

In [32]:
plt.style.use('ggplot')
%matplotlib inline

SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [33]:
logfile = 'oscar_winners_log.csv'

In [34]:

def get_movies(winners_or_nominees):
    
    if winners_or_nominees == 'nominees':
        set_range = range(1,600,50)
    elif winners_or_nominees == 'winners':
        set_range = range(1,100, 100)
    
    connector = scraping_class.Connector(logfile)

    movies_data = []
    
    for i in set_range:
        
        if winners_or_nominees == 'nominees':        
            url = 'https://www.imdb.com/search/title/?groups=oscar_best_picture_nominees&start=%s&ref_=adv_nxt' % str(i)
        elif winners_or_nominees == 'winners':
            url = 'https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year,desc&start=%s&ref_=nv_ch_osc' % str(i)

        #response, call_id = connector.get(url, 'get_%s' % winners_or_nominees)
        response = requests.get(url, headers={"Accept-Language":"en-US, en;q=0.5"})
        
        if response.ok:
            data = response.text
            soup = BeautifulSoup(data, "lxml")
            movies = soup.find_all('div', attrs={'lister-item-content'})


            for movie in movies:

                try:
                    runtime = movie.find('p', attrs={'text-muted'}).find('span', attrs={'runtime'}).text
                    runtime = int(runtime.split(' ')[0])

                    genre   = movie.find('p', attrs={'text-muted'}).find('span', attrs={'genre'}).text.strip().split(', ')


                    idx, title, year = movie.find('h3', attrs={'lister-item-header'}).text.strip().split('\n')
                    idx = idx.split('.')[0]
                    
                    link = movie.find('a')
                    link = str(link).split('href="')[1].split('"')[0]
                    link  = 'https://www.imdb.com' + link

                    try:
                        if movie.find_all('span')[-2].text == 'Gross:':
                            gross = movie.find_all('span', attrs={'name':'nv'})[-1].text
                            gross = float(gross.split('$')[1].split('M')[0])
                        else:
                            gross = np.nan
                    except:
                        gross = np.nan

                    try:
                        metascore = movie.find('span', attrs={'metascore'}).text
                    except:
                        metascore = np.nan

                    job = movie.find_all('p')[2].text

                    actors = [i.split('\n')[1] for i in job.split('Stars:')[1].split(',')]
                    directors = [i.split(',')[0] for i in job.split('Stars:')[0].split('Director')[1].split(':\n')[1].split('\n')[:-2]]
                    
                    links_people = [str(i).split('href="')[1].split('"')[0] for i in movie.find_all('p')[2].find_all('a')]
                    links_people = ["https://www.imdb.com/" + i + "awards" for i in links_people]

                    try:
                        year = year.split('(I')[1]
                        try:
                            year = year.split('I) (')[1].split(')')[0]
                        except:
                            year = year.split(') (')[1].split(')')[0]
                    except:
                        year = year.split('(')[1].split(')')[0]

                    movies_data.append([idx, title, year, runtime, genre, metascore, gross, link, directors, actors, links_people])

                except:
                    NameError

        else:
            print('Response failed!')

    df = pd.DataFrame(movies_data)
    df.columns = ['index', 'title', 'year', 'runtime_min', 'genre', 'metascore', 'gross_mil', 'link_movie', 'director', 'actors', 'link_people']
    
    return df

In [35]:
def get_awards(actorlist):
    
    connector = scraping_class.Connector(logfile)
    
    nom = 0
    win = 0
    
    for i in actorlist:
        url = str(i)
        
        #response, call_id = connector.get(url, 'get_awards')
        response = requests.get(url, headers={"Accept-Language":"en-US, en;q=0.5"})
        
        html = response.text
        soup = BeautifulSoup(html,'html.parser')

        table_node = soup.find_all('table', attrs ={'class': 'awards'})

        awards_data = []

        for i in table_node:
            try:
                award_year = i.find_all('td', attrs ={'class': 'award_year'})
                award_year =[i.find('a').text.strip('\n') for i in award_year]
                outcome = i.find_all('td', attrs = {'class': 'award_outcome'})

                outcome = [i.text for i in outcome]

                award = [i.split('\n')[2] for i in outcome]
                result = [i.split('\n')[1] for i in outcome]

                if award[0] == 'Oscar':
                    awards_data.append([award_year, award, result])
            except:
                NameError
        
        if awards_data:
            df = pd.DataFrame(awards_data[0]).T

            df = df.assign(nom = lambda df: pd.Series.str(df[2])[0:] == 'Nominee')
            df = df.assign(win = lambda df: pd.Series.str(df[2])[0:] == 'Winner')


            for i in df.nom:
                if i == True:
                    nom += 1

            for i in df.win:
                if i == True:
                    win += 1
    return nom, win

In [36]:
def get_metadata(movie_url):
    
    #connector = scraping_class.Connector(logfile)

    #response, call_id = connector.get(movie_url, 'get_metadata')
    response = requests.get(movie_url, headers={"Accept-Language":"en-US, en;q=0.5"})

    soup = BeautifulSoup(response.text,'html.parser')
    movie = soup.find_all('div',attrs={'class':'txt-block'})
    metadata = {}

    for i in movie:
        try:
            lookup = ['country', 'language', 'color']
            lookup2 = ['release date', 'aspect ratio']
            valuelist = ['budget', 'cumulative worldwide gross', 'aspect ratio']

            headline = i.find('h4').text.lower().split(':')[0]
            txtblock = i.text

            if headline in lookup:
                link_values = i.find('a').text
                metadata[headline] = link_values

            if headline in lookup2:
                values = txtblock.split('\n')[1].split(':')[1]
                try:
                    values = values.split('(')[0].strip()
                    metadata[headline] = values
                except:
                    values = float(values.strip())
                    metadata[headline] = values

            data = txtblock.split(':')[0].strip().lower()
            values = txtblock.split(':')[1].split('$')[1].strip().lower()

            if data in valuelist:
                values = values.replace(',','')

                try:
                    values = int(values.split('(')[0])
                    metadata[data] = values

                except:
                    values = int(values)
                    metadata[data] = values

        except:
            NameError

    return metadata

In [39]:
def get_data(input_list):
    win_nom = []
    for movie_list in input_list:
        print('... Initializing "%s" scraper ...' %movie_list)
        
        movies   = get_movies(movie_list)
        print('... Movies has been scraped ...')
    
        awards   = [get_awards(i) for i in movies.link_people]
        print('... Awards has been scraped ...')
        
        metadata = [get_metadata(i) for i in movies.link_movie]
        print('... Metadata has been scraped ...')
    
        awards   = pd.DataFrame(awards, columns=['nom_people_sum', 'won_people_sum'])
        metadata = pd.DataFrame(metadata)

        df = movies.merge(awards, left_index = True, right_index = True)
        df = df.merge(metadata, left_index = True, right_index = True)

        df.to_csv('oscar_%s.csv' % movie_list)
        print('... CSV file: oscar_%s.csv has been created ...' % movie_list)
        
        win_nom.append(df)
        
    return win_nom

In [None]:
df = get_data(['winners'])

In [42]:
df[0]

Unnamed: 0,index,title,year,runtime_min,genre,metascore,gross_mil,link_movie,director,actors,link_people,nom_people_sum,won_people_sum,aspect ratio,budget,color,country,cumulative worldwide gross,language,release date
0,1,Green Book,2018,130,"[Biography, Comedy, Drama]",69,85.08,https://www.imdb.com/title/tt6966692/,[Peter Farrelly],"[Viggo Mortensen, Mahershala Ali, Linda Cardel...","[https://www.imdb.com//name/nm0268380/awards, ...",3,3,2.00,23000000.0,Color,USA,3.209402e+08,English,16 November 2018
1,2,The Shape of Water,2017,123,"[Adventure, Drama, Fantasy]",87,63.86,https://www.imdb.com/title/tt5580390/,[Guillermo del Toro],"[Sally Hawkins, Octavia Spencer, Michael Shann...","[https://www.imdb.com//name/nm0868219/awards, ...",8,2,1.85,19400000.0,Color,USA,1.952435e+08,English,22 December 2017
2,3,Moonlight,2016,111,[Drama],99,27.85,https://www.imdb.com/title/tt4975722/,[Barry Jenkins],"[Mahershala Ali, Naomie Harris, Trevante Rhode...","[https://www.imdb.com//name/nm1503575/awards, ...",3,3,2.39,4000000.0,Color,USA,6.504669e+07,English,18 November 2016
3,4,Spotlight,2015,129,"[Biography, Crime, Drama]",93,45.06,https://www.imdb.com/title/tt1895587/,[Tom McCarthy],"[Mark Ruffalo, Michael Keaton, Rachel McAdams,...","[https://www.imdb.com//name/nm0565336/awards, ...",7,1,1.85,20000000.0,Color,USA,9.827524e+07,English,20 November 2015
4,5,Birdman or (The Unexpected Virtue of Ignorance),2014,119,"[Comedy, Drama]",87,42.34,https://www.imdb.com/title/tt2562232/,[Alejandro G. Iñárritu],"[Michael Keaton, Zach Galifianakis, Edward Nor...","[https://www.imdb.com//name/nm0327944/awards, ...",4,0,1.85,18000000.0,Color,USA,1.032151e+08,English,14 November 2014
5,6,12 Years a Slave,2013,134,"[Biography, Drama, History]",96,56.67,https://www.imdb.com/title/tt2024544/,[Steve McQueen],"[Chiwetel Ejiofor, Michael Kenneth Williams, M...","[https://www.imdb.com//name/nm2588606/awards, ...",8,2,2.35,20000000.0,Color,USA,1.877332e+08,English,8 November 2013
6,7,Argo,2012,120,"[Biography, Drama, Thriller]",86,136.03,https://www.imdb.com/title/tt1024648/,[Ben Affleck],"[Ben Affleck, Bryan Cranston, John Goodman, Al...","[https://www.imdb.com//name/nm0000255/awards, ...",4,5,2.39,44500000.0,Color,USA,2.323255e+08,English,12 October 2012
7,8,The Artist,2011,100,"[Comedy, Drama, Romance]",89,44.67,https://www.imdb.com/title/tt1655442/,[Michel Hazanavicius],"[Jean Dujardin, Bérénice Bejo, John Goodman, J...","[https://www.imdb.com//name/nm0371890/awards, ...",3,2,1.33,15000000.0,Black and White,France,1.334329e+08,English,20 January 2012
8,9,The King's Speech,2010,118,"[Biography, Drama, History]",88,138.80,https://www.imdb.com/title/tt1504320/,[Tom Hooper],"[Colin Firth, Geoffrey Rush, Helena Bonham Car...","[https://www.imdb.com//name/nm0393799/awards, ...",6,3,1.85,15000000.0,Color,UK,4.142425e+08,English,25 December 2010
9,10,Slumdog Millionaire,2008,120,"[Drama, Romance]",86,141.32,https://www.imdb.com/title/tt1010048/,"[Danny Boyle, Loveleen Tandan]","[Dev Patel, Freida Pinto, Saurabh Shukla, Anil...","[https://www.imdb.com//name/nm0000965/awards, ...",2,1,2.35,15000000.0,Color,UK,3.779105e+08,English,25 December 2008
