In [1]:
import requests
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scraping_class
from matplotlib.dates import DateFormatter
import seaborn as sns
from bs4 import BeautifulSoup

In [2]:
# Log file path handed to scraping_class.Connector by the scraper functions in this notebook.
logfile = 'oscar_winners_log.csv'

In [3]:

def _parse_movie_entry(movie):
    """Turn one 'lister-item-content' element into a row for get_movies.

    Any parsing failure on a mandatory field raises; the caller skips the
    entry in that case. Gross and metascore are optional and fall back to NaN.
    """
    muted = movie.find('p', attrs={'text-muted'})

    # Runtime text is split on a space and its first token read as an integer.
    runtime = int(muted.find('span', attrs={'runtime'}).text.split(' ')[0])
    genre = muted.find('span', attrs={'genre'}).text.strip().split(', ')

    # The header text must split into exactly three lines: rank, title, year.
    idx, title, year = movie.find('h3', attrs={'lister-item-header'}).text.strip().split('\n')
    idx = idx.split('.')[0]

    # First anchor of the entry; its href is relative to the site root.
    link = str(movie.find('a')).split('href="')[1].split('"')[0]
    link = 'https://www.imdb.com' + link

    # Optional: gross is only read when the second-to-last span is the 'Gross:' label.
    try:
        if movie.find_all('span')[-2].text == 'Gross:':
            gross = movie.find_all('span', attrs={'name': 'nv'})[-1].text
            gross = float(gross.split('$')[1].split('M')[0])
        else:
            gross = np.nan
    except Exception:
        gross = np.nan

    # Optional: metascore stays as the raw text when present.
    try:
        metascore = movie.find('span', attrs={'metascore'}).text
    except Exception:
        metascore = np.nan

    # The third <p> of the entry holds the "Director(s): ... Stars: ..." credits.
    credits = movie.find_all('p')[2]
    job = credits.text

    actors = [part.split('\n')[1] for part in job.split('Stars:')[1].split(',')]
    directors = [
        part.split(',')[0]
        for part in job.split('Stars:')[0].split('Director')[1].split(':\n')[1].split('\n')[:-2]
    ]

    links_people = [str(anchor).split('href="')[1].split('"')[0] for anchor in credits.find_all('a')]
    # NOTE(review): the href already starts with '/', and may carry a query
    # string — confirm the resulting URL really is the person's awards page.
    links_people = ["https://www.imdb.com/" + href + "awards" for href in links_people]

    # The year sits in parentheses; presumably a roman-numeral disambiguator
    # such as '(I) (2004)' can precede it, hence the staged splits.
    try:
        year = year.split('(I')[1]
        try:
            year = year.split('I) (')[1].split(')')[0]
        except IndexError:
            year = year.split(') (')[1].split(')')[0]
    except IndexError:
        year = year.split('(')[1].split(')')[0]

    return [idx, title, year, runtime, genre, metascore, gross, link, directors, actors, links_people]


def get_movies(winners_or_nominees):
    """Scrape an IMDb Best Picture search listing into a DataFrame.

    Parameters
    ----------
    winners_or_nominees : str
        Either 'winners' or 'nominees'; selects the search URL and paging.

    Returns
    -------
    pandas.DataFrame
        One row per listing entry that could be parsed, with columns
        'index', 'title', 'year', 'runtime_min', 'genre', 'metascore',
        'gross_mil', 'link_movie', 'director', 'actors', 'link_people'.
        The frame is empty (but still has these columns) if nothing parsed.

    Raises
    ------
    ValueError
        If ``winners_or_nominees`` is neither 'winners' nor 'nominees'.

    Notes
    -----
    Entries that fail to parse are skipped silently; a non-OK HTTP response
    prints 'Response failed!' and moves on to the next page.
    """
    columns = ['index', 'title', 'year', 'runtime_min', 'genre', 'metascore',
               'gross_mil', 'link_movie', 'director', 'actors', 'link_people']

    if winners_or_nominees == 'nominees':
        # Start offsets 1, 51, ..., 551.
        set_range = range(1, 600, 50)
        url_template = 'https://www.imdb.com/search/title/?groups=oscar_best_picture_nominees&start=%s&ref_=adv_nxt'
    elif winners_or_nominees == 'winners':
        # A single request at start=1 (the URL asks for count=100).
        set_range = range(1, 100, 100)
        url_template = 'https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year,desc&start=%s&ref_=nv_ch_osc'
    else:
        raise ValueError(
            "winners_or_nominees must be 'winners' or 'nominees', got %r" % (winners_or_nominees,)
        )

    # Requests below go through `requests.get` directly; the Connector is still
    # instantiated as before in case construction has side effects — TODO confirm.
    connector = scraping_class.Connector(logfile)

    movies_data = []

    for start in set_range:
        url = url_template % str(start)

        #response, call_id = connector.get(url, 'get_%s' % winners_or_nominees)
        response = requests.get(url, headers={"Accept-Language":"en-US, en;q=0.5"})

        if not response.ok:
            print('Response failed!')
            continue

        soup = BeautifulSoup(response.text, "lxml")

        for movie in soup.find_all('div', attrs={'lister-item-content'}):
            try:
                movies_data.append(_parse_movie_entry(movie))
            except Exception:
                # Best effort: an entry with missing/unexpected markup is skipped.
                # (Exception, not a bare except, so Ctrl-C still interrupts the scrape.)
                continue

    # Passing `columns=` keeps this working when no entry was parsed at all.
    return pd.DataFrame(movies_data, columns=columns)

In [4]:
def get_awards(actorlist):
    """Sum Oscar nominations and wins over a collection of IMDb awards pages.

    Parameters
    ----------
    actorlist : iterable
        URLs to fetch (each is passed through ``str``).

    Returns
    -------
    tuple of (int, int)
        ``(nom, win)``: the number of outcome cells reading 'Nominee' and
        'Winner' respectively, summed over all pages. For each page only the
        first ``table.awards`` whose first award name is 'Oscar' is counted.
    """
    # Requests below go through `requests.get` directly; the Connector is still
    # instantiated as before in case construction has side effects — TODO confirm.
    connector = scraping_class.Connector(logfile)

    nom = 0
    win = 0

    for person_url in actorlist:
        url = str(person_url)

        #response, call_id = connector.get(url, 'get_awards')
        response = requests.get(url, headers={"Accept-Language":"en-US, en;q=0.5"})

        soup = BeautifulSoup(response.text, 'html.parser')

        oscar_results = None

        for table in soup.find_all('table', attrs={'class': 'awards'}):
            try:
                # The year links are not used in the count, but a table whose
                # year cell has no link is skipped, exactly as before.
                for year_cell in table.find_all('td', attrs={'class': 'award_year'}):
                    year_cell.find('a').text.strip('\n')

                outcomes = [cell.text for cell in table.find_all('td', attrs={'class': 'award_outcome'})]

                # Outcome text is split on newlines: piece 1 is the result
                # ('Winner' / 'Nominee'), piece 2 the award name.
                awards = [text.split('\n')[2] for text in outcomes]
                results = [text.split('\n')[1] for text in outcomes]

                if awards[0] == 'Oscar':
                    oscar_results = results
                    break
            except Exception:
                # Best effort: tables with unexpected markup are ignored.
                continue

        if oscar_results is not None:
            nom += sum(1 for result in oscar_results if result == 'Nominee')
            win += sum(1 for result in oscar_results if result == 'Winner')

    return nom, win

In [5]:
def get_metadata(movie_url):
    """Fetch a movie page and pull selected fields out of its 'txt-block' divs.

    Parameters
    ----------
    movie_url : str
        URL of the movie page to request.

    Returns
    -------
    dict
        Keys are lower-cased block headings. Depending on what the page
        contains, this can include 'country', 'language', 'color' (text of the
        block's first link), 'release date', 'aspect ratio' (text after the
        heading, cut at the first '('), and 'budget',
        'cumulative worldwide gross' (integer parsed from the text following
        the first '$'). Blocks that do not match the expected layout
        contribute nothing.
    """
    response = requests.get(movie_url, headers={"Accept-Language":"en-US, en;q=0.5"})
    soup = BeautifulSoup(response.text, 'html.parser')

    link_fields = ['country', 'language', 'color']
    text_fields = ['release date', 'aspect ratio']
    dollar_fields = ['budget', 'cumulative worldwide gross', 'aspect ratio']

    metadata = {}

    for block in soup.find_all('div', attrs={'class': 'txt-block'}):
        try:
            headline = block.find('h4').text.lower().split(':')[0]
            txtblock = block.text

            if headline in link_fields:
                metadata[headline] = block.find('a').text

            if headline in text_fields:
                values = txtblock.split('\n')[1].split(':')[1]
                metadata[headline] = values.split('(')[0].strip()

            # Dollar amounts: anything without a ':' or a '$' raises here and
            # the block is left with whatever was stored above.
            data = txtblock.split(':')[0].strip().lower()
            values = txtblock.split(':')[1].split('$')[1].strip().lower()

            if data in dollar_fields:
                # Drop thousands separators and any trailing '(...)' note.
                metadata[data] = int(values.replace(',', '').split('(')[0])

        except Exception:
            # Best effort: move on to the next block.
            # (Exception, not a bare except, so Ctrl-C still interrupts the scrape.)
            continue

    return metadata

In [6]:
def get_genres(df):
    """Add one 0/1 indicator column per genre found in the ``genre`` column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have a ``genre`` column whose entries are iterables of genre
        names (e.g. lists of strings).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in, modified in place: for every
        distinct genre (in order of first appearance) a column of that name
        is added, holding 1 where the row's genres include it and 0 otherwise.
    """
    # Snapshot the per-row genres once so adding columns cannot affect iteration.
    genre_lists = list(df['genre'])

    genres = []
    for movie_genres in genre_lists:
        for genre in movie_genres:
            if genre not in genres:
                genres.append(genre)

    # Whole-column assignment is positional, so it works for any index and
    # avoids chained `df[col][row] = ...` assignment.
    for genre in genres:
        df[genre] = [1 if genre in movie_genres else 0 for movie_genres in genre_lists]

    return df

In [10]:
def get_data():
    """Run the full scrape for winners and then nominees.

    For each listing this scrapes the movies, then the award counts for each
    movie's people links, then the per-movie metadata, joins the three on the
    row index, writes the result to 'oscar_<listing>.csv' and prints progress
    messages along the way.

    Returns
    -------
    list of pandas.DataFrame
        ``[winners_frame, nominees_frame]``.
    """
    scraped_frames = []

    for movie_list in ('winners', 'nominees'):
        print('... Initializing "%s" scraper ...' % movie_list)

        movies = get_movies(movie_list)
        print('... Movies has been scraped ...')

        award_counts = list(map(get_awards, movies.link_people))
        print('... Awards has been scraped ...')

        movie_details = list(map(get_metadata, movies.link_movie))
        print('... Metadata has been scraped ...')

        awards_frame = pd.DataFrame(award_counts, columns=['nom_people_sum', 'won_people_sum'])
        metadata_frame = pd.DataFrame(movie_details)

        # Index-on-index joins: row k of each frame describes movie k.
        combined = movies.merge(awards_frame, left_index=True, right_index=True)
        combined = combined.merge(metadata_frame, left_index=True, right_index=True)

        # Genre dummy columns (get_genres) are not applied here.

        csv_name = 'oscar_%s.csv' % movie_list
        combined.to_csv(csv_name)
        print('... CSV file: oscar_%s.csv has been created ...' % movie_list)

        scraped_frames.append(combined)

    return scraped_frames

In [11]:
# Run the full scrape; get_data() returns [winners_frame, nominees_frame] in that order.
win, nom = get_data()

... Initializing "winners" scraper ...
... Movies has been scraped ...
... Awards has been scraped ...
... Metadata has been scraped ...
... CSV file: oscar_winners.csv has been created ...
... Initializing "nominees" scraper ...
... Movies has been scraped ...
... Awards has been scraped ...
... Metadata has been scraped ...
... CSV file: oscar_nominees.csv has been created ...


In [12]:
# Flag each nominee whose title also appears among the winners.
title_is_winner = nom['title'].isin(win['title'])
nom = nom.assign(won_oscar=title_is_winner)

In [15]:
# DataFrame.drop takes `columns=` (plural) and returns a new frame, so the
# result is assigned back; errors='ignore' keeps the cell safe to re-run.
nom = nom.drop(columns='index', errors='ignore')

In [None]:
# Write the nominees frame to 'oscar_movies.csv' (row index included, the to_csv default).
nom.to_csv('oscar_movies.csv')

In [8]:
# Reload the winners CSV that get_data() writes.
# NOTE(review): that file is saved with its row index, so reading it without
# index_col=0 yields an extra 'Unnamed: 0' column — confirm whether that is wanted.
t = pd.read_csv('oscar_winners.csv')