# Simple Web-Scraping with BeautifulSoup

In this project, I scrape data from TMDB (https://themoviedb.org/) using BeautifulSoup. The data comes from TMDB's list of popular movies. For each movie, I want to collect its name, release date, certification, runtime, genres, director, and overview.

In [36]:
import requests
import urllib.request
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
from urllib.request import urlopen
import json

In [37]:
def links_scraping(first_page, last_page):
    """
    Collect the detail-page link of every movie on a range of TMDB listing pages.

    For each page number n from first_page to last_page (both inclusive) the
    page https://www.themoviedb.org/movie?page=<n> is downloaded, the
    <div id="page_<n>"> container is located, and the first <a> inside every
    element with class "content" is read. (Original author's note: a page
    usually lists 20 movies.)

    Args:
        first_page: number of the first listing page to fetch.
        last_page: number of the last listing page to fetch (inclusive).

    Returns:
        dict mapping each anchor's ``title`` attribute to the absolute URL
        built from its ``href``. Because the title is the key, two movies
        with the same title keep only the link seen last. The dict is empty
        when first_page > last_page.

    Raises:
        Any error raised by urlopen on a network/HTTP failure.
        AttributeError: if a page has no "page_<n>" container or a "content"
            element has no anchor (the site markup is not what this expects).
        KeyError: if an anchor lacks an ``href`` or ``title`` attribute.
    """
    urls = {}
    for page in range(first_page, last_page + 1):
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen('https://www.themoviedb.org/movie?page=' + str(page)) as response:
            # Name the parser explicitly so the result does not depend on
            # which optional parser libraries happen to be installed.
            soup = BeautifulSoup(response, 'html.parser')
        container = soup.find('div', id="page_" + str(page))
        for card in container.find_all(class_='content'):
            anchor_attrs = card.a.attrs
            urls[anchor_attrs['title']] = "https://www.themoviedb.org" + anchor_attrs['href']
    return urls

In [40]:
# Only listing pages 1 and 2 are fetched here. For the full project, 300 pages
# of the TMDB popular-movies listing were scraped, which took a whole night.
urls = links_scraping(first_page=1, last_page=2)

# Persist every collected link to a JSON file so it can be reloaded later
# without scraping the listing pages again.
with open('tmdb_movie_urls.json', 'w') as fp:
    fp.write(json.dumps(urls, indent=4))

In [41]:
# Load the title -> URL mapping from its JSON file. The context manager
# closes the file even if the JSON turns out to be malformed.
with open('tmdb_movie_urls.json', "r") as f:
    urls = json.load(f)

In [49]:
# Load the existing movie data from its JSON file into movie_db. The context
# manager closes the file even if the JSON turns out to be malformed.
with open('tmdb_movie - Copy.json', "r") as f:
    movie_db = json.load(f)

In [50]:
def _extract_movie_fields(soup):
    """Return the fields found on one parsed movie page; missing elements are omitted."""
    fields = {}

    release = soup.find('span', {'class': 'release'})
    if release is not None:
        # Keep only the first two whitespace-separated tokens of the text.
        fields['release'] = " ".join(release.get_text().split()[0:2])

    certification = soup.find('span', {'class': 'certification'})
    if certification is not None:
        # Strip every whitespace character from the text.
        fields['certification'] = "".join(certification.get_text().split())

    runtime = soup.find('span', {'class': 'runtime'})
    if runtime is not None:
        fields['runtime'] = "".join(runtime.get_text().split())

    genres = soup.find('span', {'class': 'genres'})
    if genres is not None:
        # One key per genre link: 'genres1', 'genres2', ...
        for position, genre in enumerate(genres.find_all('a'), start=1):
            fields['genres' + str(position)] = genre.get_text()

    overview = soup.find('div', {'class': 'overview'})
    if overview is not None and overview.p is not None:
        fields['overview'] = overview.p.get_text()

    poster = soup.find('img', {'class': 'poster'})
    if poster is not None and 'src' in poster.attrs:
        fields['img'] = poster.attrs['src']

    # The first <li class="profile"> is taken as the director entry
    # (presumably the first credited person on the page -- verify against
    # the live markup).
    profile = soup.find('li', {'class': 'profile'})
    if profile is not None and profile.a is not None:
        fields['director'] = profile.a.get_text()

    return fields


def page_scraping(urls, movie_db):
    """
    Scrape the detail page of every movie in urls and store the result in movie_db.

    Args:
        urls: dict mapping a movie title to the URL of its detail page (the
            output of the link-collection step).
        movie_db: dict that receives the scraped data. It is modified in
            place: movie_db[title] is reset to a fresh dict for every title
            in urls, then filled with whichever of these keys could be found
            on the page: 'release', 'certification', 'runtime', 'genres1',
            'genres2', ..., 'overview', 'img', 'director'.

    Returns:
        The same movie_db object that was passed in.

    Raises:
        Any error raised by urlopen on a network/HTTP failure. The original
        author notes the server often fails under load; entries scraped
        before the failure stay in movie_db because it is updated in place.

    Some pages lack some of the fields. A missing element is skipped rather
    than treated as an error, so the corresponding key is simply absent.
    """
    for movie in urls:
        movie_db[movie] = {}
        # Close the HTTP response as soon as the document has been parsed.
        with urlopen(urls[movie]) as response:
            # Name the parser explicitly so the result does not depend on
            # which optional parser libraries happen to be installed.
            soup = BeautifulSoup(response, 'html.parser')
        movie_db[movie].update(_extract_movie_fields(soup))
    return movie_db

In [51]:
# Scrape every movie page listed in urls. page_scraping fills movie_db in
# place and returns that same dict, so binding the result keeps the name
# pointing at the same object.
movie_db = page_scraping(urls, movie_db)
print(movie_db)



In [52]:
# Write the scraped movie data to a JSON file, indented by 4 spaces.
with open('tmdb_movie.json', 'w') as fp:
    fp.write(json.dumps(movie_db, indent=4))