In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

In [2]:
def tableDataText(table):
    """Extract the text of an HTML table as a list of rows.

    Parameters
    ----------
    table : object exposing ``find_all`` (e.g. a BeautifulSoup ``<table>`` tag)
        The table element to read.

    Returns
    -------
    list of list of str
        One inner list per ``<tr>`` found under ``table``. If the first row
        contains ``<th>`` cells, their stripped texts form the first entry
        (the header row); every remaining row contributes the stripped texts
        of its ``<td>`` cells. A table without any ``<tr>`` yields ``[]``.
    """
    def rowgetDataText(tr, coltag='td'):  # td (data) or th (header)
        return [cell.get_text(strip=True) for cell in tr.find_all(coltag)]

    trs = table.find_all('tr')
    if not trs:
        # No rows at all: indexing trs[0] below would raise IndexError.
        return []

    rows = []
    headerow = rowgetDataText(trs[0], 'th')
    if headerow:  # header row goes first and is not read again as data
        rows.append(headerow)
        trs = trs[1:]
    rows.extend(rowgetDataText(tr, 'td') for tr in trs)
    return rows

In [3]:
def get_essential(role, data):
    '''finds first value with given role

    role: text to match against the second cell of each row
    data: iterable of rows (lists), read as [value, role, ...]

    Rows with fewer than two cells are skipped instead of raising
    IndexError, so an empty row cannot abort the lookup.

    returns the first cell of the first matching row,
    or None if nothing is found'''
    for row in data:
        if len(row) > 1 and row[1] == role:
            return row[0]
    return None

In [4]:
def find_money(list_, name):
    '''Look up an amount in a list of sublists.

    Every sublist that contains ``name`` is considered; its last item is
    taken as the amount unless that item is the '–' placeholder. When
    several sublists qualify, the last one wins.

    returns the amount as a string with "$" and "," removed,
    or None when no sublist qualifies'''
    amount = None
    for entry in list_:
        if name not in entry:
            continue
        raw = entry[-1]
        if raw == '–':
            continue
        for symbol in ("$", ","):
            raw = raw.replace(symbol, "")
        amount = raw
    return amount
            
def find_info(list_, name):
    '''searches through a list and returns a specific value from
    sublist that contains given name

    list_: iterable of sublists
    name: item that must be present in a sublist for it to match

    The value returned is the second item of the matching sublist; when
    several sublists match, the last one wins. A matching sublist with
    fewer than two items has no value to report and is skipped instead
    of raising IndexError.

    returns None if nothing is found'''
    output = None
    for sublist in list_:
        if name in sublist and len(sublist) > 1:
            output = sublist[1]

    return output

In [5]:
def _pad_with_none(values, size):
    '''Return a copy of ``values`` extended with None up to ``size`` items.'''
    padded = list(values)
    padded.extend([None] * (size - len(padded)))
    return padded


def get_bom_data(movie_codes, timeout=30):
    '''searches through boxofficemojo.com pages and stores essential movie info

    input:
        movie_codes: iterable of imdb movie codes in a format tt0000000
        timeout: seconds to wait for each HTTP response before giving up on
                 that title (passed to requests); default 30

    return: list of rows. The first row holds the column names; each later
            row holds the values scraped for one title, in the same order.
            Values that are not found are None.

    A title whose page cannot be fetched or parsed is skipped (best effort).
    Only ``Exception`` subclasses are swallowed, so KeyboardInterrupt still
    stops a long run. Skipped titles are reported at DEBUG level through the
    ``logging`` module.'''
    import logging  # function-scope import: nothing outside this function needs it

    log = logging.getLogger(__name__)

    movies_bom = [[
                    'movie_id', 'title', 'year', 'trivia', 'mpaa', 'release_date', 'run_time', 'distributor', 'director',
                    'writer', 'producer', 'composer', 'cinematographer', 'main_actor_1', 'main_actor_2', 'main_actor_3',
                    'main_actor_4', 'budget', 'domestic', 'international', 'worldwide', 'genre_1',
                    'genre_2', 'genre_3', 'genre_4', 'html'
                ]]

    # One session for the whole run so connections to the host are reused
    # instead of being re-opened for every title.
    with requests.Session() as session:
        for movie_code in movie_codes:
            try:
                movie_id = str(movie_code)

                url = 'https://www.boxofficemojo.com/title/' + movie_id + '/credits/'
                html_page = session.get(url, timeout=timeout)
                soup = BeautifulSoup(html_page.text, 'html.parser')

                # Each section becomes a list of its text fragments; '@' is only
                # a temporary separator and parentheses are dropped.
                general_info = [
                    section.get_text('@', strip=True).replace('(', '').replace(')', '').split('@')
                    for section in soup.find_all('div', {'class': 'a-section a-spacing-none'})
                ]

                # get Title year trivia (first section; missing trailing fields become None)
                title, year, trivia = _pad_with_none(general_info[0], 3)[:3]

                # get money
                domestic = find_money(general_info, 'Domestic ')
                international = find_money(general_info, 'International ')
                worldwide = find_money(general_info, 'Worldwide')
                budget = find_money(general_info, 'Budget')

                # get picture rating(mpaa), runtime and genre
                distributor = find_info(general_info, 'Domestic Distributor')
                release_date = find_info(general_info, 'Earliest Release Date')
                if release_date:
                    # keep only the text before the first comma
                    release_date = release_date.split(',')[0]

                mpaa = find_info(general_info, 'MPAA')
                run_time = find_info(general_info, 'Running Time')
                genres = find_info(general_info, 'Genres')

                genres = genres.replace('\n', '').split() if genres else []
                genre_1, genre_2, genre_3, genre_4 = _pad_with_none(genres, 4)[:4]

                # get crew
                crew = soup.find('table', {"id": "principalCrew"})
                essential = tableDataText(crew)
                writer = get_essential('Writer', essential)
                director = get_essential('Director', essential)
                producer = get_essential('Producer', essential)
                composer = get_essential('Composer', essential)
                cinematographer = get_essential('Cinematographer', essential)

                # get main actors
                cast = tableDataText(soup.find('table', {"id": "principalCast"}))
                cast = _pad_with_none([actor[0] for actor in cast], 5)
                # Index 0 is skipped: presumably the table's header row, which
                # tableDataText places first -- TODO confirm against the page.
                main_actor_1, main_actor_2, main_actor_3, main_actor_4 = cast[1:5]

                movies_bom.append([movie_id, title, year, trivia, mpaa, release_date, run_time, distributor,
                                   director, writer, producer, composer, cinematographer, main_actor_1, main_actor_2,
                                   main_actor_3, main_actor_4, budget, domestic, international, worldwide, genre_1,
                                   genre_2, genre_3, genre_4, url
                                   ])
            except Exception as err:
                # Best effort: drop this title and carry on with the next one.
                log.debug('skipping %s: %r', movie_code, err)
                continue

    return movies_bom
    

In [6]:
# Read the IMDb title listing. The preview below shows its columns:
# tconst, primaryTitle, originalTitle, startYear, runtimeMinutes, genres.
df = pd.read_csv('imdb_codes.csv')
df.head()  # preview the first rows

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
0,tt0000009,Miss Jerry,Miss Jerry,1894,45,Romance
1,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897,20,"Documentary,News,Sport"
2,tt0000335,Soldiers of the Cross,Soldiers of the Cross,1900,\N,"Biography,Drama"
3,tt0000502,Bohemios,Bohemios,1905,100,\N
4,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Biography,Crime,Drama"


In [7]:
# Keep the titles whose startYear is 1990 or later and collect their
# tconst ids as a plain list for the scraper.
movie_codes = df.loc[df['startYear'].ge(1990)]
movie_list = movie_codes['tconst'].tolist()
movie_codes.shape

(280738, 6)

In [8]:
import time
start_time = time.time()

film = get_bom_data(movie_list)

# Split the elapsed seconds by hand: strftime('%H:%M:%S') on gmtime() starts
# again at 00 once a run passes 24 hours, which would under-report long runs.
elapsed_seconds = int(time.time() - start_time)
hours, remainder = divmod(elapsed_seconds, 3600)
minutes, seconds = divmod(remainder, 60)
print("--- %02d:%02d:%02d ---" % (hours, minutes, seconds))

--- 19:05:36 ---


In [9]:
# film[0] holds the column names, so it is used for the header and then
# removed from the data rows (the remaining index labels start at 1).
movies = pd.DataFrame(film, columns=film[0]).drop(index=0)
movies.to_csv('Mojo_data_update.csv', index=False)
print(movies.shape)
movies

(223382, 26)


Unnamed: 0,movie_id,title,year,trivia,mpaa,release_date,run_time,distributor,director,writer,...,main_actor_4,budget,domestic,international,worldwide,genre_1,genre_2,genre_3,genre_4,html
1,tt0015724,Dama de noche,1993,,,,1 hr 42 min,,Eva López Sánchez,Eva López Sánchez,...,Regina Orozco,,,,,Drama,Mystery,Romance,Thriller,https://www.boxofficemojo.com/title/tt0015724/...
2,tt0016906,Frivolinas,2014,Film based on variety shows of the 20s 'Arco I...,,,1 hr 20 min,,Arturo Carballo,,...,Miguel Ligero,,,,,Comedy,Musical,,,https://www.boxofficemojo.com/title/tt0016906/...
3,tt0035423,Kate & Leopold,2001,An English Duke from 1876 is inadvertedly drag...,PG-13,December 25,1 hr 58 min,Miramax,James Mangold,Steven Rogers,...,Breckin Meyer,,47121859,28897189,76019048,Comedy,Fantasy,Romance,,https://www.boxofficemojo.com/title/tt0035423/...
4,tt0059900,"Wenn du groß bist, lieber Adam",1990,Adam receives a flashlight with special powers...,,,1 hr 18 min,,Egon Günther,Egon Günther,...,Daisy Granados,,,,,Drama,Fantasy,,,https://www.boxofficemojo.com/title/tt0059900/...
5,tt0062336,El tango del viudo y su espejo deformante,2020,The story of a man whose wife has committed su...,,,1 hr 10 min,,Raoul Ruiz,Raoul Ruiz,...,Shenda Román,,,,,Drama,,,,https://www.boxofficemojo.com/title/tt0062336/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223378,tt9916622,Rodolpho Teóphilo - O Legado de um Pioneiro,2015,,,,57 min,,Angela Gurgel,Angela Gurgel,...,Cecília Cunha,,,,,Documentary,,,,https://www.boxofficemojo.com/title/tt9916622/...
223379,tt9916680,De la ilusión al desconcierto: cine colombiano...,2007,,,,1 hr 40 min,,Luis Ospina,Luis Ospina,...,Hugo Chaparro,,,,,Documentary,,,,https://www.boxofficemojo.com/title/tt9916680/...
223380,tt9916706,Dankyavar Danka,2013,A man falls in love with woman called Taramati...,,,,,Kanchan Nayak,Sudhir Nikam,...,Ashwini Ekbote,,,,,Comedy,,,,https://www.boxofficemojo.com/title/tt9916706/...
223381,tt9916730,6 Gunn,2017,Vidya Sarvade is a topper of Adarsh Vidyalaya....,,,1 hr 56 min,,Kiran Gawade,Kiran Gawade,...,Pranav Raorane,,,,,,,,,https://www.boxofficemojo.com/title/tt9916730/...


In [10]:
# Show the columns from position 9 onward ('writer' through 'html' in the
# column order built by get_bom_data), which the wide display above elides.
movies.iloc[:,9:]

Unnamed: 0,writer,producer,composer,cinematographer,main_actor_1,main_actor_2,main_actor_3,main_actor_4,budget,domestic,international,worldwide,genre_1,genre_2,genre_3,genre_4,html
1,Eva López Sánchez,Gustavo Montiel Pagés,José Elorza,Rodrigo Prieto,Rafael Sánchez Navarro,Cecilia Toussaint,Miguel Córcega,Regina Orozco,,,,,Drama,Mystery,Romance,Thriller,https://www.boxofficemojo.com/title/tt0015724/...
2,,,José Padilla,Ramón de Baños,José López Alonso,Juan Belmonte,María Caballé,Miguel Ligero,,,,,Comedy,Musical,,,https://www.boxofficemojo.com/title/tt0016906/...
3,Steven Rogers,Cathy Konrad,Rolfe Kent,Stuart Dryburgh,Meg Ryan,Hugh Jackman,Liev Schreiber,Breckin Meyer,,47121859,28897189,76019048,Comedy,Fantasy,Romance,,https://www.boxofficemojo.com/title/tt0035423/...
4,Egon Günther,,Wilhelm Neef,Helmut Grewald,Stephan Jahnke,Gerry Wolff,Manfred Krug,Daisy Granados,,,,,Drama,Fantasy,,,https://www.boxofficemojo.com/title/tt0059900/...
5,Raoul Ruiz,Chamila Rodríguez,Jorge Arriagada,Diego Bonacina,Rubén Sotoconil,Claudia Paz,Luis Alarcón,Shenda Román,,,,,Drama,,,,https://www.boxofficemojo.com/title/tt0062336/...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223378,Angela Gurgel,Ana Célia de Oliveira,,Marcelo Alves,Oldair Soares Ammom,José Augusto Bezerra,Renato Casimiro,Cecília Cunha,,,,,Documentary,,,,https://www.boxofficemojo.com/title/tt9916622/...
223379,Luis Ospina,,,Leonardo Giraldo,Ramiro Arbeláez,Sergio Cabrera,Óscar Campo,Hugo Chaparro,,,,,Documentary,,,,https://www.boxofficemojo.com/title/tt9916680/...
223380,Sudhir Nikam,Meghraj Rajebhosale,,Arjun Jadhav,Makarand Anaspure,Anvay Bendre,Prakash Dhotre,Ashwini Ekbote,,,,,Comedy,,,,https://www.boxofficemojo.com/title/tt9916706/...
223381,Kiran Gawade,Ujjwala Gawde,,Suresh Deshmane,Sunil Barve,Archit Deodhar,Bhushan Pradhan,Pranav Raorane,,,,,,,,,https://www.boxofficemojo.com/title/tt9916730/...
