In [11]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import math
import numpy as np
import re

In [38]:
def _parse_movie_title(cell_text):
  """Return the bare title from chart-cell text shaped like '10. Some Title (1999)'."""
  text = ' '.join(cell_text.split())
  # Remove the rank prefix whatever its width, so two-digit ranks do not leave
  # a stray leading character behind.
  text = re.sub(r'^\d+\.\s*', '', text)
  # Remove only the trailing '(year)'; dots that belong to the title are kept.
  return re.sub(r'\s*\(\d{4}\)$', '', text)


def _count_oscars(movie_soup):
  """Return how many Oscars a parsed movie page reports as won (0 when none is found)."""
  award_links = movie_soup.find_all(
    class_='ipc-metadata-list-item__label ipc-metadata-list-item__label--link',
    href=re.compile(r'\/title\/tt[0-9]+\/awards\/\?ref_=tt_awd'))
  for award_link in award_links:
    label = ' '.join(award_link.get_text().split())
    # NOTE(review): assumes the label reads like 'Won 3 Oscars'; labels for
    # other awards or nominations count as zero Oscars -- confirm against live markup.
    won = re.match(r'Won (\d+) Oscar', label)
    if won:
      return int(won.group(1))
  return 0


# Scrapes the given URL, selects the desired data and returns it in a Pandas DataFrame
# Based on: https://www.geeksforgeeks.org/scrape-imdb-movie-rating-and-details-using-python/
def scraper(url='http://www.imdb.com/chart/top', limit=20, timeout=30):
  """Scrape a movie chart page and each listed movie's own page.

  Args:
    url: Address of the chart page to scrape.
    limit: Maximum number of chart rows to process (None processes every row).
    timeout: Seconds to wait for each HTTP request before giving up.

  Returns:
    DataFrame with one row per movie and the columns 'movie_title', 'rating',
    'number_of_ratings' and 'number_of_oscars'.

  Raises:
    requests.HTTPError: If the chart page or a movie page answers with an error status.
    ValueError: If the chart page has no 'td.titleColumn' rows, or a row has no movie link.
  """
  from urllib.parse import urljoin

  response = requests.get(url, timeout=timeout)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, "html.parser")

  # Selecting the desired data from the site
  title_cells = soup.select('td.titleColumn')
  ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
  number_of_ratings = [c.attrs.get('data-value') for c in soup.select('td.posterColumn span[name=nv]')]
  if not title_cells:
    raise ValueError('No chart rows found at ' + url + '; the page layout may have changed')

  # Building the DataFrame from a list of dictionaries containing the data.
  # zip() stops at the shortest list, so a short page cannot cause an IndexError.
  records = []
  for cell, rating, votes in zip(title_cells[:limit], ratings, number_of_ratings):
    # Take the link from the row's own title cell so that the movie page always
    # belongs to the same row as the title and the rating.
    link = cell.find('a', href=re.compile('/title/'))
    if link is None:
      raise ValueError('No movie link found in chart row: ' + ' '.join(cell.get_text().split()))

    # Scraping each movie's page for the number of Oscars
    movie_response = requests.get(urljoin(url, link['href']), timeout=timeout)
    movie_response.raise_for_status()
    movie_soup = BeautifulSoup(movie_response.text, "html.parser")

    records.append({"movie_title": _parse_movie_title(cell.get_text()),
                    "rating": round(float(rating), 1),
                    "number_of_ratings": pd.to_numeric(votes),
                    "number_of_oscars": _count_oscars(movie_soup)
                    })
  return pd.DataFrame(records)

# Punishes movies for having fewer ratings
# Finds the maximum number of ratings then subtracts a 0.1 penalty from
# each movie per 100k fewer ratings than the maximum
# Input: DataFrame containing information about the movies
def review_penalizer(movies, step=100000, penalty=0.1):
  """Add an 'adjusted_rating' column to `movies` in place.

  Each movie loses `penalty` rating points for every full `step` ratings it
  has fewer than the most-rated movie in the frame.

  Args:
    movies: DataFrame with 'rating' and 'number_of_ratings' columns.
    step: Size of the ratings gap that costs one penalty.
    penalty: Rating points subtracted per full step.

  Returns:
    None; the frame passed in is modified.
  """
  most_ratings = movies['number_of_ratings'].max()
  full_steps = np.floor((most_ratings - movies['number_of_ratings']) / step)
  # Round away binary floating-point noise (e.g. 9.2 - 0.3 -> 8.899999999999999)
  # so that the stored and exported values are clean decimals.
  noise_free_decimals = 10
  movies['adjusted_rating'] = (movies['rating'] - full_steps * penalty).round(noise_free_decimals)

# Sorts and saves the data to a file with a given name and format
# Input: DataFrame containing information about the movies, the name and format of the output file
def save_data(movies, file_name, format):
  """Rank `movies` by adjusted rating and write the result to disk.

  The frame is modified in place: it is sorted by 'adjusted_rating' from
  highest to lowest, its index is renumbered from 0 and a 1-based 'ranking'
  column is added. `format` is matched case-insensitively against 'csv',
  'json', 'excel' and 'xls'; for any other value a message is printed and
  no file is written.
  """
  movies.sort_values(by=['adjusted_rating'], ascending=False, inplace=True)
  movies.reset_index(inplace=True, drop=True)
  movies['ranking'] = movies.index + 1

  # Requested format -> (writer method of the frame, file extension to append)
  writers = {
    'csv': (movies.to_csv, '.csv'),
    'json': (movies.to_json, '.json'),
    'excel': (movies.to_excel, '.xls'),
    'xls': (movies.to_excel, '.xls'),
  }
  chosen = writers.get(format.lower())
  if chosen is None:
    print('File format not supported')
    return
  write, extension = chosen
  write(file_name + extension)

In [None]:
# Build the movie table; this performs live HTTP requests to IMDb.
df = scraper()
# Adds the 'adjusted_rating' column to df in place (the call returns nothing).
review_penalizer(df)
# Sorts df in place by adjusted rating, adds 'ranking' and writes 'output.csv'.
save_data(df,'output','csv')
# Show the final frame as the cell output.
df