In [4]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scraping_class
from matplotlib.dates import DateFormatter
import seaborn as sns
from bs4 import BeautifulSoup

In [5]:
plt.style.use('ggplot')
%matplotlib inline

SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [6]:

def get_movies(winners_or_nominees):
    
    if winners_or_nominees == 'nominees':
        set_range = range(1,600,50)
    elif winners_or_nominees == 'winners':
        set_range = range(1,100, 100)
        
    
    logfile = 'oscar_winners_log.csv'
    connector = scraping_class.Connector(logfile)

    movies_data = []
    
    for i in set_range:
        
        if winners_or_nominees == 'nominees':        
            url = 'https://www.imdb.com/search/title/?groups=oscar_best_picture_nominees&start=%s&ref_=adv_nxt' % str(i)
        elif winners_or_nominees == 'winners':
            url = 'https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year,desc&start=%s&ref_=nv_ch_osc' % str(i)

        response, call_id = connector.get(url, 'get_%s' % winners_or_nominees)
        if response.ok:
            data = response.text
            soup = BeautifulSoup(data, "lxml")
            movies = soup.find_all('div', attrs={'lister-item-content'})


            for movie in movies:

                try:
                    runtime = movie.find('p', attrs={'text-muted'}).find('span', attrs={'runtime'}).text
                    runtime = int(runtime.split(' ')[0])

                    genre   = movie.find('p', attrs={'text-muted'}).find('span', attrs={'genre'}).text.strip().split(', ')


                    idx, title, year = movie.find('h3', attrs={'lister-item-header'}).text.strip().split('\n')
                    idx = idx.split('.')[0]
                    
                    link = movie.find('a')
                    link = str(link).split('href="')[1].split('"')[0]
                    link  = 'https://www.imdb.com' + link

                    try:
                        if movie.find_all('span')[-2].text == 'Gross:':
                            gross = movie.find_all('span', attrs={'name':'nv'})[-1].text
                            gross = float(gross.split('$')[1].split('M')[0])
                        else:
                            gross = np.nan
                    except:
                        gross = np.nan

                    try:
                        metascore = movie.find('span', attrs={'metascore'}).text
                    except:
                        metascore = np.nan

                    job = movie.find_all('p')[2].text

                    actors = [i.split('\n')[1] for i in job.split('Stars:')[1].split(',')]
                    directors = [i.split(',')[0] for i in job.split('Stars:')[0].split('Director')[1].split(':\n')[1].split('\n')[:-2]]
                    
                    links_people = [str(i).split('href="')[1].split('"')[0] for i in movie.find_all('p')[2].find_all('a')]
                    links_people = ["https://www.imdb.com/" + i + "awards" for i in links_people]

                    try:
                        year = year.split('(I')[1]
                        try:
                            year = year.split('I) (')[1].split(')')[0]
                        except:
                            year = year.split(') (')[1].split(')')[0]
                    except:
                        year = year.split('(')[1].split(')')[0]

                    movies_data.append([idx, title, year, runtime, genre, metascore, gross, link, directors, actors, links_people])

                except:
                    NameError

        else:
            print('Response failed!')

    df = pd.DataFrame(movies_data)
    df.columns = ['index', 'title', 'year', 'runtime_min', 'genre', 'metascore', 'gross_mil', 'link_movie', 'director', 'actors', 'link_people']
    
    return df

In [7]:
def get_awards(actorlist):
    
    awards = 'awards.csv'
    connector = scraping_class.Connector(awards)
    
    nom = 0
    win = 0
    
    for i in actorlist:
        url = str(i)
        response, call_id = connector.get(url, 'awards')

        html = response.text
        soup = BeautifulSoup(html,'html.parser')

        table_node = soup.find_all('table', attrs ={'class': 'awards'})

        awards_data = []

        for i in table_node:
            try:
                award_year = i.find_all('td', attrs ={'class': 'award_year'})
                award_year =[i.find('a').text.strip('\n') for i in award_year]
                outcome = i.find_all('td', attrs = {'class': 'award_outcome'})

                outcome = [i.text for i in outcome]

                award = [i.split('\n')[2] for i in outcome]
                result = [i.split('\n')[1] for i in outcome]

                if award[0] == 'Oscar':
                    awards_data.append([award_year, award, result])
            except:
                NameError
        
        if awards_data:
            df = pd.DataFrame(awards_data[0]).T

            df = df.assign(nom = lambda df: pd.Series.str(df[2])[0:] == 'Nominee')
            df = df.assign(win = lambda df: pd.Series.str(df[2])[0:] == 'Winner')


            for i in df.nom:
                if i == True:
                    nom += 1

            for i in df.win:
                if i == True:
                    win += 1
    return nom, win

In [39]:
def get_metadata(movie_url):
    
    logfile = 'imdb_movieinfo.csv'
    connector = scraping_class.Connector(logfile)

    response, call_id = connector.get(movie_url, 'get_metadata')
    
    movie_soup = BeautifulSoup(response.text,'html.parser')
    
    soup = movie_soup.find_all('div',attrs={'class':'txt-block'})

    metadata = []
    
    for i in soup:
        try: 
            inf=i.text.strip()
            
            if inf[0:8] == 'Country:':
                country = inf.split('\n')[1]
                metadata.append(country)
                
            elif inf[0:8] == 'Language':
                lang = inf.split('\n')[1]
                metadata.append(lang)
                
            elif inf[0:7] == 'Release':
                reldate = inf.split(': ')[1].split(' (')[0]
                metadata.append(reldate)
                
            elif inf[0:6] == 'Budget':
                budget = inf.split(':')[1].split('\n')[0].split('$')[1]
                budget = int(budget)
                metadata.append(budget)
                
            elif inf[0:5] == 'Color':
                color = inf.split('\n')[1]
                metadata.append(color)
                
            elif inf[0:6] ==  'Aspect':
                aspratio = inf.split(': ')[1:3]
                aspratio = aspratio[0] + ':' + aspratio[1]
                metadata.append(aspratio)
                
            else:
                country  = np.nan
                lang     = np.nan
                reldate  = np.nan
                budget   = np.nan
                color    = np.nan
                aspratio = np.nan
                
        except:
            NameError

    return metadata

In [64]:
#movies   = get_movies('nominees') #input 'winners' or 'nominees'
awards   = [get_awards(i) for i in movies.link_people]
metadata = [get_metadata(i) for i in movies.link_movie]

In [65]:
awards_df   = pd.DataFrame(awards, columns=['Nominated', 'Won'])
metadata_df = pd.DataFrame(metadata, columns = ['country', 'language', 'release_date', 'color', 'aspect_ratio'])

In [66]:
df = movies.merge(awards_df, left_index = True, right_index = True)
df = df.merge(metadata_df, left_index = True, right_index = True)

In [67]:
df.to_csv('oscar_nominees.csv')

In [68]:
df

Unnamed: 0,index,title,year,runtime_min,genre,metascore,gross_mil,link_movie,director,actors,link_people,Nominated,Won,country,language,release_date,color,aspect_ratio
0,1,Inglourious Basterds,2009,153,"[Adventure, Drama, War]",69,120.54,https://www.imdb.com/title/tt0361748/,[Quentin Tarantino],"[Brad Pitt, Diane Kruger, Eli Roth, Mélanie La...","[https://www.imdb.com//name/nm0000233/awards, ...",6,3,Germany,English,28 August 2009,Color,2.39 :1
1,2,Pulp Fiction,1994,154,"[Crime, Drama]",94,107.93,https://www.imdb.com/title/tt0110912/,[Quentin Tarantino],"[John Travolta, Uma Thurman, Samuel L. Jackson...","[https://www.imdb.com//name/nm0000233/awards, ...",6,2,USA,English,28 October 1994,Color,2.39 :1
2,3,Bohemian Rhapsody,2018,134,"[Biography, Drama, Music]",49,216.43,https://www.imdb.com/title/tt1727824/,[Bryan Singer],"[Rami Malek, Lucy Boynton, Gwilym Lee, Ben Hardy]","[https://www.imdb.com//name/nm0001741/awards, ...",0,1,UK,English,1 November 2018,Color,2.39 :1
3,4,The Wolf of Wall Street,2013,180,"[Biography, Crime, Drama]",75,116.90,https://www.imdb.com/title/tt0993846/,[Martin Scorsese],"[Leonardo DiCaprio, Jonah Hill, Margot Robbie,...","[https://www.imdb.com//name/nm0000217/awards, ...",15,3,USA,English,9 January 2014,Color,2.39 :1
4,5,The Favourite,2018,119,"[Biography, Drama, History]",90,34.24,https://www.imdb.com/title/tt5083738/,[Yorgos Lanthimos],"[Olivia Colman, Emma Stone, Rachel Weisz, Nich...","[https://www.imdb.com//name/nm0487166/awards, ...",5,3,Ireland,English,24 January 2019,Color,1.85 :1
5,6,A Star Is Born,2018,136,"[Drama, Music, Romance]",88,215.29,https://www.imdb.com/title/tt1517451/,[Bradley Cooper],"[Lady Gaga, Bradley Cooper, Sam Elliott, Greg ...","[https://www.imdb.com//name/nm0177896/awards, ...",11,1,USA,English,6 December 2018,Color,2.39 :1
6,7,Dommedag nu,1979,147,"[Drama, Mystery, War]",94,83.47,https://www.imdb.com/title/tt0078788/,[Francis Ford Coppola],"[Martin Sheen, Marlon Brando, Robert Duvall, F...","[https://www.imdb.com//name/nm0000338/awards, ...",13,3,USA,English,26 October 1979,Color,2.39 :1
7,8,En verden udenfor,1994,142,[Drama],80,28.34,https://www.imdb.com/title/tt0111161/,[Frank Darabont],"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...","[https://www.imdb.com//name/nm0001104/awards, ...",7,2,USA,English,28 April 1995,Color,1.85 :1
8,9,Django Unchained,2012,165,"[Drama, Western]",81,162.81,https://www.imdb.com/title/tt1853728/,[Quentin Tarantino],"[Jamie Foxx, Christoph Waltz, Leonardo DiCapri...","[https://www.imdb.com//name/nm0000233/awards, ...",7,6,USA,English,24 January 2013,Color,2.39 :1
9,10,Godfather,1972,175,"[Crime, Drama]",100,134.97,https://www.imdb.com/title/tt0068646/,[Francis Ford Coppola],"[Marlon Brando, Al Pacino, James Caan, Diane K...","[https://www.imdb.com//name/nm0000338/awards, ...",17,4,USA,English,26 December 1972,Color,1.37 :1
