In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scraping_class
from matplotlib.dates import DateFormatter
import seaborn as sns
from bs4 import BeautifulSoup

In [2]:
plt.style.use('ggplot')
%matplotlib inline

SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [316]:

def _parse_year(year_text):
    """Pull the year out of a header fragment such as '(2019)' or '(II) (2017)'."""
    try:
        # Headers with a roman-numeral disambiguator look like '(I) (2017)'.
        year_text = year_text.split('(I')[1]
        try:
            return year_text.split('I) (')[1].split(')')[0]
        except IndexError:
            return year_text.split(') (')[1].split(')')[0]
    except IndexError:
        # No disambiguator (or an unexpected layout): take the first parenthesised token.
        return year_text.split('(')[1].split(')')[0]


def _parse_gross(movie):
    """Return the gross as a float (the number between '$' and 'M'), or NaN when absent."""
    try:
        if movie.find_all('span')[-2].text == 'Gross:':
            amount = movie.find_all('span', attrs={'name': 'nv'})[-1].text
            return float(amount.split('$')[1].split('M')[0])
    except (IndexError, ValueError):
        # Fewer than two spans, no '$' in the text, or a non-numeric amount.
        pass
    return np.nan


def _parse_movie(movie):
    """Convert one 'lister-item-content' node into a row of the movies table.

    Raises AttributeError, IndexError or ValueError when a mandatory field is
    missing or does not have the expected layout.
    """
    details = movie.find('p', class_='text-muted')
    runtime = int(details.find('span', class_='runtime').text.split(' ')[0])
    genre = details.find('span', class_='genre').text.strip().split(', ')

    # The header text is expected to be exactly three lines: rank, title, year.
    idx, title, year = movie.find('h3', class_='lister-item-header').text.strip().split('\n')
    idx = idx.split('.')[0]

    # href of the first anchor inside the item, made absolute.
    link = 'https://www.imdb.com' + str(movie.find('a')).split('href="')[1].split('"')[0]

    gross = _parse_gross(movie)

    try:
        metascore = movie.find('span', class_='metascore').text
    except AttributeError:
        # No metascore span for this movie.
        metascore = np.nan

    # The third <p> of the item holds the director(s) and the stars.
    credits = movie.find_all('p')[2]
    credit_parts = credits.text.split('Stars:')
    actors = [chunk.split('\n')[1] for chunk in credit_parts[1].split(',')]
    director_lines = credit_parts[0].split('Director')[1].split(':\n')[1].split('\n')[:-2]
    directors = [line.split(',')[0] for line in director_lines]

    people_hrefs = [str(anchor).split('href="')[1].split('"')[0] for anchor in credits.find_all('a')]
    # NOTE(review): if the hrefs already start with '/', these URLs contain a double
    # slash after the host; kept unchanged so the requested URLs stay the same.
    links_people = ["https://www.imdb.com/" + href + "awards" for href in people_hrefs]

    year = _parse_year(year)

    return [idx, title, year, runtime, genre, metascore, gross, link, directors, actors, links_people]


def get_movies(winners_or_nominees):
    """Scrape the IMDb best-picture search listing into a DataFrame.

    Parameters
    ----------
    winners_or_nominees : str
        Either 'nominees' or 'winners'; selects which IMDb listing is requested.

    Returns
    -------
    pandas.DataFrame
        One row per successfully parsed movie with the columns 'index', 'title',
        'year', 'runtime_min', 'genre', 'metascore', 'gross_mil', 'link_movie',
        'director', 'actors' and 'link_people'. The frame is empty (but still has
        these columns) when nothing could be parsed.

    Raises
    ------
    ValueError
        If `winners_or_nominees` is neither 'nominees' nor 'winners'.
    """
    # Validate first so a typo fails before any request is made.
    if winners_or_nominees == 'nominees':
        # Paged listing: start offsets 1, 51, ..., 551.
        set_range = range(1, 600, 50)
        url_template = 'https://www.imdb.com/search/title/?groups=oscar_best_picture_nominees&start=%s&ref_=adv_nxt'
    elif winners_or_nominees == 'winners':
        # A single request; the URL itself asks for count=100.
        set_range = range(1, 100, 100)
        url_template = 'https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year,desc&start=%s&ref_=nv_ch_osc'
    else:
        raise ValueError(
            "winners_or_nominees must be 'nominees' or 'winners', got %r" % (winners_or_nominees,)
        )

    columns = ['index', 'title', 'year', 'runtime_min', 'genre', 'metascore',
               'gross_mil', 'link_movie', 'director', 'actors', 'link_people']

    logfile = 'oscar_winners_log.csv'
    connector = scraping_class.Connector(logfile)

    movies_data = []

    for start in set_range:
        url = url_template % str(start)
        response, _call_id = connector.get(url, 'get_%s' % winners_or_nominees)
        if not response.ok:
            print('Response failed!')
            continue

        soup = BeautifulSoup(response.text, "lxml")
        for movie in soup.find_all('div', class_='lister-item-content'):
            try:
                movies_data.append(_parse_movie(movie))
            except (AttributeError, IndexError, ValueError):
                # Best effort: an item without the expected layout is skipped.
                continue

    # Passing the columns to the constructor keeps this working for an empty result.
    return pd.DataFrame(movies_data, columns=columns)

In [317]:
def get_awards(actorlist):
    """Count Oscar nominations and wins over a list of awards pages.

    Parameters
    ----------
    actorlist : iterable
        URLs of the awards pages to request (each item is passed through `str`).

    Returns
    -------
    tuple of (int, int)
        `(nom, win)`: the number of outcome cells reading 'Nominee' and 'Winner',
        summed over all pages. For each page only the first awards table whose
        first award is named 'Oscar' is counted.
    """
    awards = 'awards.txt'
    connector = scraping_class.Connector(awards)

    nom = 0
    win = 0

    for person_url in actorlist:
        response, _call_id = connector.get(str(person_url), 'awards')
        if not response.ok:
            # Same reporting as get_movies; a failed page contributes nothing.
            print('Response failed!')
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        for table in soup.find_all('table', attrs={'class': 'awards'}):
            outcome_cells = table.find_all('td', attrs={'class': 'award_outcome'})
            outcome_lines = [cell.text.split('\n') for cell in outcome_cells]
            try:
                # Each outcome cell is expected to read '\n<result>\n<award>...'.
                award_names = [lines[2] for lines in outcome_lines]
                results = [lines[1] for lines in outcome_lines]
                is_oscar = award_names[0] == 'Oscar'
            except IndexError:
                # Empty table or a cell without the expected lines: skip the table.
                continue

            if is_oscar:
                nom += results.count('Nominee')
                win += results.count('Winner')
                # Only the first Oscar table of a page is counted.
                break

    return nom, win

In [318]:
# Scrape the best-picture winners listing into a DataFrame.
winners = get_movies(winners_or_nominees='winners')

In [322]:
# One (nominations, wins) tuple per movie, computed from that movie's people links.
awards = [get_awards(people_links) for people_links in winners.link_people]

In [331]:
# winners_df is a second name for the same object as `winners`, not a copy.
winners_df = winners
# Two-column frame built from the (nominations, wins) tuples, in movie order.
awards_df = pd.DataFrame(data=awards, columns=['Nominated', 'Won'])

In [341]:
# Attach the award counts to the movie rows by matching on the row index.
df = pd.merge(winners_df, awards_df, left_index=True, right_index=True)