In [7]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scraping_class
from matplotlib.dates import DateFormatter
import seaborn as sns
from bs4 import BeautifulSoup

In [56]:
# Peek at the raw HTML of a single IMDb title page to see what the scrapers below must parse.
# An explicit timeout is passed because requests otherwise waits indefinitely on a stalled server.
requests.get('https://www.imdb.com/title/tt2267998/?ref_=nm_knf_i2', headers={"Accept-Language":"en-US, en;q=0.5"}, timeout=30).text

'\n\n\n\n\n\n\n<!DOCTYPE html>\n<html\n    xmlns:og="http://ogp.me/ns#"\n    xmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n         \n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///title/tt2267998?src=mdot">\n\n\n\n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n\n<script>\n    if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n</script>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>Gone Girl (2014) - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n<script>\n    if (typeof uet == \'function\') {\n      uet("be", "LoadTitle", {wb: 1});\n    }\n</script>\n<script>\n    if (t

In [8]:
plt.style.use('ggplot')
%matplotlib inline

SMALL_SIZE = 16
MEDIUM_SIZE = 18
BIGGER_SIZE = 20

plt.rc('font', size=SMALL_SIZE)          # controls default text sizes
plt.rc('axes', titlesize=SMALL_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)    # fontsize of the x and y labels
plt.rc('xtick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=SMALL_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=SMALL_SIZE)    # legend fontsize
plt.rc('figure', titlesize=BIGGER_SIZE)  # fontsize of the figure title

In [9]:
# Path handed to every scraping_class.Connector created by the scraper functions below.
logfile = 'oscar_winners_log.csv'

In [6]:

def _parse_lister_item(movie):
    """Convert one ``lister-item-content`` <div> into a row for ``get_movies``.

    Returns a list in the column order used by ``get_movies``. Any mandatory
    element that is missing or malformed (runtime, genre, header, links,
    credits) raises, and the caller then skips the movie. Gross and metascore
    are optional and fall back to NaN.
    """
    muted = movie.find('p', class_='text-muted')
    runtime = int(muted.find('span', class_='runtime').text.split(' ')[0])
    genre = muted.find('span', class_='genre').text.strip().split(', ')

    # Header text is three lines: "<rank>.", "<title>", "(<year>)" or "(I) (<year>)".
    idx, title, year = movie.find('h3', class_='lister-item-header').text.strip().split('\n')
    idx = idx.split('.')[0]
    # The year sits in the last parenthesised group; an optional "(I)"/"(II)" marker precedes it.
    year = year.rsplit('(', 1)[1].split(')')[0]

    # First anchor inside the item; its href is site-relative.
    link = 'https://www.imdb.com' + movie.find('a')['href']

    try:
        if movie.find_all('span')[-2].text == 'Gross:':
            gross_text = movie.find_all('span', attrs={'name': 'nv'})[-1].text
            # e.g. "$167.77M" -> 167.77
            gross = float(gross_text.split('$')[1].split('M')[0])
        else:
            gross = np.nan
    except (IndexError, ValueError):
        gross = np.nan

    metascore_tag = movie.find('span', class_='metascore')
    metascore = metascore_tag.text if metascore_tag is not None else np.nan

    # Third <p> of the item holds the "Director(s): ... | Stars: ..." credits.
    people_p = movie.find_all('p')[2]
    job = people_p.text
    actors = [part.split('\n')[1] for part in job.split('Stars:')[1].split(',')]
    directors = [part.split(',')[0] for part in job.split('Stars:')[0].split('Director')[1].split(':\n')[1].split('\n')[:-2]]

    # NOTE(review): the href already starts with "/", so these URLs contain "//" - kept as before.
    links_people = ["https://www.imdb.com/" + anchor['href'] + "awards" for anchor in people_p.find_all('a')]

    return [idx, title, year, runtime, genre, metascore, gross, link, directors, actors, links_people]


def get_movies(winners_or_nominees):
    """Scrape IMDb's Best Picture search pages into a DataFrame, one row per movie.

    Parameters
    ----------
    winners_or_nominees : str
        ``'nominees'`` or ``'winners'``. Selects the IMDb search group that is
        requested and the ``start`` offsets that are paged through.

    Returns
    -------
    pandas.DataFrame
        Columns ``index, title, year, runtime_min, genre, metascore, gross_mil,
        link_movie, director, actors, link_people``. Movies whose listing entry
        cannot be parsed are left out. The frame has these columns even when
        no movie was scraped.

    Raises
    ------
    ValueError
        If ``winners_or_nominees`` is neither ``'nominees'`` nor ``'winners'``.
    """
    columns = ['index', 'title', 'year', 'runtime_min', 'genre', 'metascore', 'gross_mil', 'link_movie', 'director', 'actors', 'link_people']

    # mode -> (URL template with a %s slot for the start offset, start offsets to request)
    sources = {
        'nominees': (
            'https://www.imdb.com/search/title/?groups=oscar_best_picture_nominees&start=%s&ref_=adv_nxt',
            range(1, 600, 50),
        ),
        'winners': (
            'https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year,desc&start=%s&ref_=nv_ch_osc',
            range(1, 100, 100),
        ),
    }
    if winners_or_nominees not in sources:
        raise ValueError("winners_or_nominees must be 'nominees' or 'winners', got %r" % (winners_or_nominees,))
    url_template, start_offsets = sources[winners_or_nominees]

    connector = scraping_class.Connector(logfile)

    movies_data = []

    for start in start_offsets:
        url = url_template % str(start)
        response, _call_id = connector.get(url, 'get_%s' % winners_or_nominees)
        if not response.ok:
            print('Response failed!')
            continue

        soup = BeautifulSoup(response.text, "lxml")
        for movie in soup.find_all('div', class_='lister-item-content'):
            try:
                movies_data.append(_parse_lister_item(movie))
            except Exception:
                # Best-effort scrape: an entry with missing or unexpected markup is skipped.
                # Unlike a bare except, KeyboardInterrupt still stops the run.
                continue

    # Passing columns here (rather than assigning df.columns) also works for an empty result.
    return pd.DataFrame(movies_data, columns=columns)

In [7]:
def get_awards(actorlist):
    """Count Oscar nominations and wins summed over a group of people.

    Parameters
    ----------
    actorlist : iterable
        URLs (converted with ``str``) of pages to fetch. Presumably the IMDb
        awards pages built by ``get_movies`` in ``link_people`` - confirm
        against the caller.

    Returns
    -------
    tuple of (int, int)
        ``(nom, win)``: how many ``award_outcome`` cells read ``'Nominee'`` and
        ``'Winner'`` respectively. For each page only the first
        ``<table class="awards">`` whose first award is ``'Oscar'`` is counted.
        Pages with a failed response contribute nothing.
    """
    connector = scraping_class.Connector(logfile)

    nom = 0
    win = 0

    for person_url in actorlist:
        response, _call_id = connector.get(str(person_url), 'get_awards')
        if not response.ok:
            print('Response failed!')
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        for table in soup.find_all('table', attrs={'class': 'awards'}):
            outcomes = [cell.text for cell in table.find_all('td', attrs={'class': 'award_outcome'})]
            try:
                # Each cell's text is split on newlines: element 1 is the result, element 2 the award name.
                awards = [text.split('\n')[2] for text in outcomes]
                results = [text.split('\n')[1] for text in outcomes]
                is_oscar = awards[0] == 'Oscar'
            except IndexError:
                # Table without outcome cells, or with a cell in an unexpected layout: skip it.
                continue

            if is_oscar:
                nom += results.count('Nominee')
                win += results.count('Winner')
                # Only the first Oscar table of a page is taken into account.
                break

    return nom, win

In [43]:
def get_metadata(movie_url):
    """Fetch a title page and pull six detail fields out of its ``txt-block`` <div>s.

    Parameters
    ----------
    movie_url : str
        URL of the title page to request.

    Returns
    -------
    tuple of (list, list)
        ``metadata`` always has six entries in the fixed order
        ``[country, language, release_date, budget, color, aspect_ratio]``.
        A field that is absent or cannot be parsed is NaN, so values never
        shift into a neighbouring position. ``budget`` is an int parsed from a
        ``$`` amount and ``aspect_ratio`` a float; the rest are strings.
        The second element is the list of ``txt-block`` tags that was scanned
        (empty when the response failed).

    Notes
    -----
    Opening-weekend figures are not collected. The previous check compared a
    6-character slice with the 7-character word 'Opening' and so never matched.
    The six-field layout is kept because it is what the ``metadata_df`` column
    list in this notebook expects.
    """
    connector = scraping_class.Connector(logfile)

    response, _call_id = connector.get(movie_url, 'get_metadata')

    if response.ok:
        movie_soup = BeautifulSoup(response.text, 'html.parser')
        blocks = movie_soup.find_all('div', attrs={'class': 'txt-block'})
    else:
        print('Response failed!')
        blocks = []

    # (prefix of the stripped block text, parser for that block), in output order.
    field_parsers = (
        ('Country:', lambda text: text.split('\n')[1]),
        ('Language', lambda text: text.split('\n')[1]),
        ('Release', lambda text: text.split(': ')[1].split(' (')[0]),
        ('Budget', lambda text: int(text.split(':')[1].split('\n')[0].split('$')[1].replace(',', ''))),
        ('Color', lambda text: text.split('\n')[1]),
        ('Aspect', lambda text: float(text.split(': ')[1].strip())),
    )

    metadata = [np.nan] * len(field_parsers)
    filled = set()

    for block in blocks:
        text = block.text.strip()
        for position, (prefix, parse) in enumerate(field_parsers):
            if not text.startswith(prefix):
                continue
            if position not in filled:
                try:
                    metadata[position] = parse(text)
                    filled.add(position)
                except (IndexError, ValueError):
                    # Unexpected layout (e.g. a budget not quoted in "$"): leave the field as NaN.
                    pass
            break

    return metadata, blocks

In [46]:
# Debug view: the second return value is the list of raw txt-block <div>s found on one title page.
get_metadata('https://www.imdb.com/title/tt2267998/')[1]

[<div class="txt-block">
 <h4 class="inline">Taglines:</h4>
 Who are you married to?                <span class="see-more inline">
 <a href="/title/tt2267998/taglines"> See more</a> »
                 </span>
 </div>, <div class="txt-block">
 <h4 class="inline">Certificate:</h4>
 <span>15</span>
 <span class="ghost">|</span> <span class="see-more inline">
 <a href="/title/tt2267998/parentalguide#certification"> See all certifications</a> »
             </span>
 </div>, <div class="txt-block">
 <h4 class="inline">Parents Guide:</h4>
 <span class="see-more inline">
 <a href="/title/tt2267998/parentalguide"> View content advisory</a> »
         </span>
 </div>, <div class="txt-block">
 <h4 class="inline">Official Sites:</h4>
 <a href="/offsite/?page-action=offsite-facebook&amp;token=BCYu5_dqB-oHNY8Q5fTNTdpfgFizCTiy72HVnDPUsn7n0lxe4wxLoJGYVjA4lSjOohl-hJaaeVxd%0D%0AFE77UVLyH76Y4Rke5OLHXt4k_5ysa5fS_VQdEn59AJI9XPcuIlxnlsQyDZreplm2IvFq36uu5xNU%0D%0AZ04O5GA3-lxmpfv9ezYCxbJICmH69ES0xrow9v-NCYuG7

In [64]:
# Scrape the movie list first, then the per-movie award counts and title-page details.
movies   = get_movies('nominees') #input 'winners' or 'nominees'
awards   = [get_awards(people_urls) for people_urls in movies.link_people]
# get_metadata returns (fields, raw_blocks); only the field list belongs in the table.
metadata = [get_metadata(movie_url)[0] for movie_url in movies.link_movie]

In [65]:
# Tabulate the scraped lists; rows are in the same order as `movies`.
awards_df = pd.DataFrame(
    data=awards,
    columns=['nom_people_sum', 'won_people_sum'],
)
metadata_df = pd.DataFrame(
    data=metadata,
    columns=['country', 'language', 'release_date', 'budget', 'color', 'aspect_ratio'],
)

In [66]:
# Combine the three tables by row index into one frame.
df = (
    movies
    .merge(awards_df, left_index=True, right_index=True)
    .merge(metadata_df, left_index=True, right_index=True)
)

In [3]:
# Write the merged table to disk. `df` must already exist in the running kernel
# (the merge cell has to be executed first), otherwise this raises NameError.
df.to_csv('oscar_nominees.csv')

NameError: name 'df' is not defined

In [4]:
# Show only the first rows; the full frame is too large to be a useful cell output.
df.head()

NameError: name 'df' is not defined