In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
def get_imdb_stuff(movie_codes):
    
    movies_imdb = [['title', 'year', 'mpaa', 'run_time', 'rating', 'ratings_count', 'budget', 'world_gross', 'genre', 'director', 
                   'writer', 'actor', 'keywords', 'summary', 'synopsis']]
    
    for movie_code in movie_codes:
        
        movie_id = str(movie_code)
        
        try:
            
            #synopsis
            synopsis = 'https://www.imdb.com/title/' + movie_id + '/plotsummary'
            page = requests.get(synopsis)
            soup = BeautifulSoup(page.text, 'html.parser')
            
            synopsis = soup.find('ul', id="plot-synopsis-content").get_text(strip=True)
            
            imdb_title = 'https://www.imdb.com/title/' + movie_id + '/' 
            imdb_page = requests.get(imdb_title)
            imdb_soup = BeautifulSoup(imdb_page.text, 'html.parser')
            
            #get title and year
            main = imdb_soup.find('h1', class_='').get_text(strip=True)
            title = main.split('(')[0]
            year = main.split('(')[1].replace(')', '')
            
            
            #get rating
            rating = imdb_soup.find('span', itemprop='ratingValue').text
            ratings_count = imdb_soup.find('span', itemprop='ratingCount').text

            #get summary
            summary = imdb_soup.find('div', class_='summary_text').get_text(strip=True) 
            
            #get cast
            credits = imdb_soup.find_all('div', class_='credit_summary_item')

            credits = [credit.get_text(strip=True) for credit in credits]

            director = credits[0].split(':')[1].split('|')[0]
            writer = credits[1].split(':')[1].split('|')[0]
            actor = credits[2].split(':')[1].split('|')[0]
                
            misc_data = imdb_soup.find_all('div', class_='txt-block')
            misc_data = [line.get_text(strip=True) for line in misc_data]
            
            #get mpaa
            mpaas = ['G', 'PG', 'R', 'PG-13']
            mpaa = None
            for line in misc_data:
                for mpaa_ in mpaas:
                    if mpaa_ in line.split():
                        mpaa = mpaa_
            #budget            
            budget = None
            for line in misc_data:
                if 'Budget' in line:
                    budget = ''.join([ch for ch in line if ch.isdigit()])
                    
            #revenue        
            world_gross = None
            for line in misc_data:
                if 'Worldwide' in line:
                    world_gross = ''.join([ch for ch in line if ch.isdigit()])
            
            #runtime
            run_time = None
            for line in misc_data:
                if 'Runtime' in line:
                    run_time = ''.join([ch for ch in line if ch.isdigit()])
                    
            #genres and keywords
            genres = imdb_soup.find_all('div', class_='see-more inline canwrap')
            genres = [genre.get_text(strip=True) for genre in genres]
            
            keywords = None
            for line in genres:
                if 'Plot Keywords:' in line:
                    keywords = line.replace(':', '|').split('|')[1:-1]
                    
            genre = None
            for line in genres:
                if 'Genres' in line:
                    genre = line.replace(':', '|').split('|')[1:]

            
            movies_imdb.append([title, year, mpaa, run_time, rating, ratings_count, budget, world_gross, genre, director, 
                   writer, actor, keywords, summary, synopsis])
            
        except:
            continue
    
    return movies_imdb
    
    

In [3]:
df = pd.read_csv('imdb_codes.csv')
df.head()

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres
0,tt0000009,Miss Jerry,Miss Jerry,1894,45,Romance
1,tt0000147,The Corbett-Fitzsimmons Fight,The Corbett-Fitzsimmons Fight,1897,20,"Documentary,News,Sport"
2,tt0000335,Soldiers of the Cross,Soldiers of the Cross,1900,\N,"Biography,Drama"
3,tt0000502,Bohemios,Bohemios,1905,100,\N
4,tt0000574,The Story of the Kelly Gang,The Story of the Kelly Gang,1906,70,"Biography,Crime,Drama"


In [4]:
df = df[df['runtimeMinutes'] != '\\N']
df.runtimeMinutes = df.runtimeMinutes.astype('int64')
df = df[df['runtimeMinutes'] >= 80]

In [5]:
movie_codes = df[df['startYear'] >= 1990]
movie_list = list(movie_codes.tconst)
len(movie_list)

145304

In [6]:
import time
start_time = time.time()

film = get_imdb_stuff(movie_list)

print("--- %s ---" % time.strftime('%H:%M:%S', time.gmtime((time.time() - start_time))))

--- 23:34:08 ---


In [7]:
movies = pd.DataFrame(film, columns=film[0])
movies.drop(0, axis=0, inplace=True)
movies.to_csv('imdb_data_update.csv', index=False)
print(movies.shape)
movies


(96695, 15)


Unnamed: 0,title,year,mpaa,run_time,rating,ratings_count,budget,world_gross,genre,director,writer,actor,keywords,summary,synopsis
1,Dama de noche,1993,,,6.2,20,,,"[Drama, Mystery, Romance, Thriller]",Eva López Sánchez,"Eva López Sánchez,David Martin del Campo(novel)","Rafael Sánchez Navarro,Cecilia Toussaint,Migue...","[f rated, older man younger woman relationship...",Add a Plot»,It looks like we don't have a Synopsis for thi...
2,Kate & Leopold,2001,PG-13,118123,6.4,77418,48000000,76019048,"[Comedy, Fantasy, Romance]",James Mangold,"Steven Rogers(story),James Mangold(screenplay)","Meg Ryan,Hugh Jackman,Liev Schreiber","[time travel, brooklyn bridge, bridge, time tr...",An English Duke from 1876 is inadvertedly drag...,"In 1876, Leopold Alexis Elijah Walker Gareth T..."
3,The Woman with the Knife,2010,,80,6.6,10,,,"[Drama, Thriller]",Bassori Timite,Bassori Timite(as Timité Bassori),"Bassori Timite,Mary Vieyra,Danielle Alloh",[sexual],A young man returns from Europe obsessed with ...,It looks like we don't have a Synopsis for thi...
4,"Vojtech, receny sirotek",1990,,80,6.5,24,,,[Drama],Zdenek Tyc,"Jaromir Kacer(screenplay),Jirí Soukup(screenplay)","Petr Forman,Barbora Lukesová,Jana Riháková","[skinny dipping, male nudity, male frontal nud...",Add a Plot»,It looks like we don't have a Synopsis for thi...
5,Ucho,1990,,94,7.8,2367,,,"[Drama, Thriller]",Karel Kachyna,"Karel Kachyna,Jan Procházka(screenplay)","Jirina Bohdalová,Radoslav Brzobohatý,Gustav Op...","[czech new wave, sleeplessness, power failure,...",After coming home from a Party gathering one n...,It looks like we don't have a Synopsis for thi...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96691,Padmavyuhathile Abhimanyu,2019,,130,7.9,264,,,[Drama],Vineesh Aaradya,"Vineesh Aaradya(screenplay),Vineesh Aaradya(st...","Anoop Chandran,Indrans,Sona Nair",,Add a Plot»,It looks like we don't have a Synopsis for thi...
96692,No Apology,2019,,102,5.2,32,,,[Drama],Gilbert Allan,Casey Richards,"Lydia Adair,Owen Bishop,Pierce Briggs",[death],When a group of women struggle to deal with th...,It looks like we don't have a Synopsis for thi...
96693,Paradise,2019,,135,8.2,18,,,"[Crime, Drama]",Kevin Jimenez Bernal,Kevin Jimenez Bernal,"Kevin Jimenez Bernal,Feli Cabrera,Olivier Lukunku",[money],Five years after Diego ended school he finds h...,It looks like we don't have a Synopsis for thi...
96694,Sokagin Çocuklari,2019,,98,6.4,194,,2833,"[Drama, Family]",Ahmet Faik Akinci,"Ahmet Faik Akinci,Kasim Uçkan","Ahmet Faik Akinci,Belma Mamati,Metin Keçeci",,Add a Plot»,It looks like we don't have a Synopsis for thi...


In [8]:
movies.head(40)

Unnamed: 0,title,year,mpaa,run_time,rating,ratings_count,budget,world_gross,genre,director,writer,actor,keywords,summary
1,La clase,2007,,,6.8,17,,,[Drama],José Antonio Varela,"Rafael Pinto(as Rafael Antonio),José Antonio V...","Carolina Riveros,Dario Soto,Laureano Olivares",,Add a Plot»
2,Extract,2009,R,92.0,6.1,43970,8000000.0,10848783.0,"[Comedy, Crime, Romance]",Mike Judge,Mike Judge,"Jason Bateman,Kristen Wiig,Ben Affleck","[gigolo, on the job injury, groin injury, con ...","Joel, the owner of an extract plant, tries to ..."
3,Extraordinary Stories,2008,,,8.0,962,,,"[Drama, Mystery]",Mariano Llinás,Mariano Llinás,"Mariano Llinás,Walter Jakob,Agustín Mendilaharzu",[secret],In this adventurous experiment in storytelling...
4,Hype Nation 3D,2014,,84.0,5.6,17,25000000.0,,"[Drama, Music, Musical]","Alan Calzatti,Christian A. Strickland","Jason Lee(screenplay),Daniel Shin","Cary-Hiroyuki Tagawa,Dennis Oh,Jarell Houston","[b boy, hip hop]",An American dance crew goes head to head with ...
5,Retina,2017,,87.0,5.6,646,,41280.0,"[Drama, Thriller]",Carlos Ferrer,Carlos Ferrer(story and screenplay),"Lindsay Goranson,Gary Swanson,Ron Haxton","[mind control, terrorism, unfaithfulness, adul...",A young woman participates in a medical study....
6,"Yo-rhad, un amico dallo spazio",2006,,,2.9,11,,,[Animation],"Vittorio Rambaldi,Camillo Teti","Gina Basso(novel),Vittorio Rambaldi(novel)","Bianca Alessandra Ara,Giada De Blanck,Annalisa...",[euro trash],Add a Plot»
7,The Hungry Ghosts,2009,R,105.0,5.1,167,,,[Drama],Michael Imperioli,Michael Imperioli,"Steve Schirripa,Aunjanue Ellis,Nick Sandow",,A New York City-set drama of interlocking stor...
8,Chi nasce tondo...,2008,,85.0,6.1,78,1500000.0,33486.0,[Comedy],Alessandro Valori,"Adamo Dionisi(screenplay),Valerio Mastandrea(s...","Valerio Mastandrea,Raffaele Vannoli,Glauco Ono...",,Add a Plot»
9,Get Him to the Greek,2010,R,109114.0,6.4,167098,40000000.0,91720255.0,"[Comedy, Music]",Nicholas Stoller,"Nicholas Stoller,Jason Segel(characters)","Jonah Hill,Russell Brand,Elisabeth Moss","[woman on top, london england, kali character,...",A record company intern is hired to accompany ...
10,The Greatest,2009,R,9996.0,6.6,8477,6000000.0,1344544.0,"[Drama, Romance]",Shana Feste,Shana Feste,"Carey Mulligan,Aaron Taylor-Johnson,Pierce Bro...","[loss, teenage mother, self help group, bare c...",A drama that is centered around a troubled tee...
