## Web scrapping de IMDB

In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd

# Download IMDB's Top 250 data
url = 'http://www.imdb.com/chart/top'
response = requests.get(url)
soup = BeautifulSoup(response.text, 'lxml')

movies = soup.select('td.titleColumn')
links = [a.attrs.get('href') for a in soup.select('td.titleColumn a')]
crew = [a.attrs.get('title') for a in soup.select('td.titleColumn a')]
ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
votes = [b.attrs.get('data-value') for b in soup.select('td.ratingColumn strong')]

imdb = []

# Store each item into dictionary (data), then put those into a list (imdb)
for index in range(0, len(movies)):
    # Seperate movie into: 'place', 'title', 'year'
    movie_string = movies[index].get_text()
    movie = (' '.join(movie_string.split()).replace('.', ''))
    movie_title = movie[len(str(index))+1:-7]
    year = re.search('\((.*?)\)', movie_string).group(1)
    place = movie[:len(str(index))-(len(movie))]
    data = {"movie_title": movie_title,
            "year": year,
            "place": place,
            "star_cast": crew[index],
            "rating": ratings[index],
            "vote": votes[index],
            "link": links[index]}
    imdb.append(data)
df = pd.DataFrame(imdb)
df

Unnamed: 0,movie_title,year,place,star_cast,rating,vote,link
0,Cadena perpetua,1994,1,"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",9.219691125253322,,/title/tt0111161/
1,El padrino,1972,2,"Francis Ford Coppola (dir.), Marlon Brando, Al...",9.147505037059322,,/title/tt0068646/
2,El padrino: Parte II,1974,3,"Francis Ford Coppola (dir.), Al Pacino, Robert...",8.979863610152188,,/title/tt0071562/
3,El caballero oscuro,2008,4,"Christopher Nolan (dir.), Christian Bale, Heat...",8.96968277564464,,/title/tt0468569/
4,12 hombres sin piedad,1957,5,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",8.934682662518401,,/title/tt0050083/
...,...,...,...,...,...,...,...
245,Neon Genesis Evangelion: The End of Evangelion,1997,246,"Hideaki Anno (dir.), Megumi Ogata, Megumi Haya...",8.01841358074252,,/title/tt0169858/
246,Anand,1971,247,"Hrishikesh Mukherjee (dir.), Rajesh Khanna, Am...",8.017917931333276,,/title/tt0066763/
247,El hombre que mató a Liberty Valance,1962,248,"John Ford (dir.), James Stewart, John Wayne",8.017314836407733,,/title/tt0056217/
248,"París, Texas",1984,249,"Wim Wenders (dir.), Harry Dean Stanton, Nastas...",8.017017401075217,,/title/tt0087884/


In [2]:
df.head(10)

Unnamed: 0,movie_title,year,place,star_cast,rating,vote,link
0,Cadena perpetua,1994,1,"Frank Darabont (dir.), Tim Robbins, Morgan Fre...",9.219691125253322,,/title/tt0111161/
1,El padrino,1972,2,"Francis Ford Coppola (dir.), Marlon Brando, Al...",9.147505037059322,,/title/tt0068646/
2,El padrino: Parte II,1974,3,"Francis Ford Coppola (dir.), Al Pacino, Robert...",8.979863610152188,,/title/tt0071562/
3,El caballero oscuro,2008,4,"Christopher Nolan (dir.), Christian Bale, Heat...",8.96968277564464,,/title/tt0468569/
4,12 hombres sin piedad,1957,5,"Sidney Lumet (dir.), Henry Fonda, Lee J. Cobb",8.934682662518401,,/title/tt0050083/
5,La lista de Schindler,1993,6,"Steven Spielberg (dir.), Liam Neeson, Ralph Fi...",8.90934687292862,,/title/tt0108052/
6,El señor de los anillos: El retorno del rey,2003,7,"Peter Jackson (dir.), Elijah Wood, Viggo Morte...",8.885142305156142,,/title/tt0167260/
7,Pulp Fiction,1994,8,"Quentin Tarantino (dir.), John Travolta, Uma T...",8.83843466614236,,/title/tt0110912/
8,"El bueno, el feo y el malo",1966,9,"Sergio Leone (dir.), Clint Eastwood, Eli Wallach",8.78795530841176,,/title/tt0060196/
9,El señor de los anillos: La comunidad del anillo,2001,1,"Peter Jackson (dir.), Elijah Wood, Ian McKellen",8.776510629993203,,/title/tt0120737/
