In [1]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import math
import numpy as np
import re

In [71]:
# Based on: https://www.geeksforgeeks.org/scrape-imdb-movie-rating-and-details-using-python/
def scraper(limit=20):
  """Scrape the IMDb Top chart and return its first `limit` movies as a DataFrame.

  One extra request is made per movie to read the Oscar count from the
  movie's own page, so the run time grows with `limit`.

  Args:
    limit: Maximum number of chart rows to scrape, counted from the top of
      the chart. Defaults to 20.

  Returns:
    pandas.DataFrame with one row per movie and the columns movie_title,
    rating, adjusted_rating (initialised to the same value as rating),
    number_of_ratings and number_of_oscars.

  Raises:
    requests.HTTPError: If a request is answered with an error status.
    ValueError: If the chart page does not contain the expected markup.
  """
  base_url = 'http://www.imdb.com'
  # The same headers go to every request so the award label on the movie
  # pages comes back in English; the "Won ..." check below depends on that.
  headers = {'Accept-Language': 'en-US,en;q=0.8'}
  timeout = 30  # seconds; keeps a stalled connection from hanging the notebook

  response = requests.get(base_url + '/chart/top', headers=headers, timeout=timeout)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, "html.parser")

  # Selecting the desired data from the site
  movies = soup.select('td.titleColumn')
  ratings = [b.attrs.get('data-value') for b in soup.select('td.posterColumn span[name=ir]')]
  number_of_ratings = [c.attrs.get('data-value') for c in soup.select('td.posterColumn span[name=nv]')]

  available = min(len(movies), len(ratings), len(number_of_ratings))
  if available == 0:
    raise ValueError('No movies found on the chart page; the page layout may have changed')
  count = max(0, min(limit, available))

  award_class = 'ipc-metadata-list-item__label ipc-metadata-list-item__label--link'
  award_href = re.compile(r'/title/tt[0-9]+/awards/\?ref_=tt_awd')
  # NOTE(review): assumes the label reads "Won N Oscar(s)"; labels for other
  # awards ("Won 1 ...") are deliberately not counted as Oscars.
  oscars_won = re.compile(r'Won (\d+) Oscars?')

  # Building the DataFrame from a list of dictionaries containing the data
  records = []
  for index in range(count):
    # The anchor inside the title cell carries both the clean title (no rank
    # prefix, no year suffix) and the link to the movie's own page.
    title_link = movies[index].find('a')
    if title_link is None or not title_link.get('href'):
      raise ValueError('Chart row %d has no title link' % (index + 1))
    movie_title = title_link.get_text(strip=True)

    # Scraping the movie's page for the number of Oscars. Exactly one value is
    # produced per movie so the counts can never drift out of step with rows.
    movie_response = requests.get(base_url + title_link.get('href'), headers=headers, timeout=timeout)
    movie_response.raise_for_status()
    movie_soup = BeautifulSoup(movie_response.text, "html.parser")
    award_label = movie_soup.find(class_=award_class, href=award_href)
    number_of_oscars = 0
    if award_label is not None:
      won = oscars_won.match(award_label.get_text(' ', strip=True))
      if won:
        number_of_oscars = int(won.group(1))

    rating = round(float(ratings[index]), 1)
    records.append({"movie_title": movie_title,
                    "rating": rating,
                    "adjusted_rating": rating,
                    "number_of_ratings": pd.to_numeric(number_of_ratings[index]),
                    "number_of_oscars": number_of_oscars
                    })
  return pd.DataFrame(records)

def review_penalizer(movies):
  """Penalise movies that have fewer ratings than the most-rated movie.

  For every full 100,000 ratings a movie is short of the maximum, 0.1 is
  subtracted from its rating. The result is written to the
  'adjusted_rating' column of `movies` in place; nothing is returned.

  Args:
    movies: DataFrame with 'rating' and 'number_of_ratings' columns.
  """
  vote_counts = movies['number_of_ratings']
  shortfall = vote_counts.max() - vote_counts
  # Only complete 100k steps count, hence the floor.
  penalty_steps = np.floor(shortfall / 100000)
  movies['adjusted_rating'] = movies['rating'] - penalty_steps / 10
  
def oscar_bonus(row):
  """Add an Oscar bonus to one movie's adjusted rating.

  Bonus by number of Oscars: 1-2 -> 0.3, 3-5 -> 0.5, 6-10 -> 1,
  more than 10 -> 1.5. Any other count leaves the rating untouched.

  Args:
    row: A DataFrame row (or any mapping) with 'number_of_oscars' and
      'adjusted_rating' entries. It is modified in place.

  Returns:
    The same row object, with 'adjusted_rating' updated.
  """
  oscars_won = row['number_of_oscars']
  bonus_tiers = (
      ((1, 2), 0.3),
      ((3, 4, 5), 0.5),
      ((6, 7, 8, 9, 10), 1),
  )
  for counts, bonus in bonus_tiers:
    if oscars_won in counts:
      row['adjusted_rating'] += bonus
      return row
  if oscars_won > 10:
    row['adjusted_rating'] += 1.5
  return row
  
def oscar_calculator(movies):
  """Reward every movie for its Oscars by applying oscar_bonus to each row.

  Args:
    movies: DataFrame containing information about the movies.

  Returns:
    The DataFrame produced by the row-wise apply, with the modified ratings.
  """
  rewarded = movies.apply(oscar_bonus, axis='columns')
  return rewarded

def save_data(movies, file_name, format):
  """Sort the movies by adjusted rating, rank them and write them to a file.

  `movies` is modified in place: it is sorted by 'adjusted_rating' in
  descending order, its index is reset and a 1-based 'ranking' column is
  added. This happens before the format is checked.

  Args:
    movies: DataFrame containing information about the movies.
    file_name: Output path without extension; the extension is appended.
    format: Case-insensitive output format: 'csv', 'json', or
      'excel'/'xlsx' (both write an .xlsx file). For anything else a
      message is printed and no file is written.
  """
  movies.sort_values(by=['adjusted_rating'], ascending=False, inplace=True)
  movies.reset_index(drop=True, inplace=True)
  movies['ranking'] = movies.index + 1

  chosen = format.lower()
  if chosen == 'csv':
    movies.to_csv(file_name + '.csv', index=False)
  elif chosen == 'json':
    movies.to_json(file_name + '.json')
  elif chosen in ('excel', 'xlsx'):
    movies.to_excel(file_name + '.xlsx', index=False)
  else:
    print('File format not supported')

In [None]:
# Run the full pipeline: scrape the chart, apply the rating-count penalty
# (review_penalizer updates df in place and returns nothing), add the Oscar
# bonus, then sort, rank and write the result to output.csv.
df = scraper()
review_penalizer(df)
df = oscar_calculator(df)
save_data(df,'output','csv')

Unit Tests

In [None]:
# Fixture: five fake movies chosen to hit the boundaries of the penalty
# (0, 1, 199,999 and 200,000 ratings short of the maximum) and of the
# Oscar bonus tiers (0, 2, 5, 10 and 14 Oscars).
test_data = {
    'movie_title': ['A','B','C','D','E'],
    'rating': [9.5,9.0,8.7,8.2,5.0],
    'adjusted_rating': [9.5,9.0,8.7,8.2,5.0],
    'number_of_ratings': [2000000,1900000,2100000,1900001,2099999],
    'number_of_oscars': [2,10,0,14,5]
}

test_df = pd.DataFrame(test_data)

# review_penalizer works in place. 'C' has the most ratings (no penalty),
# 'D' is 199,999 short (one full 100k step -> -0.1), 'E' is 1 short (no penalty).
# Compared with a tolerance because the values come out of float arithmetic.
review_penalizer(test_df)
np.testing.assert_almost_equal(test_df.at[2,'adjusted_rating'], 8.7,2, "Incorrect result in review_penalizer")
np.testing.assert_almost_equal(test_df.at[3,'adjusted_rating'], 8.1,2, "Incorrect result in review_penalizer")
np.testing.assert_almost_equal(test_df.at[4,'adjusted_rating'], 5.0,2, "Incorrect result in review_penalizer")
print('review_penalizer tests run successfully')

# oscar_calculator stacks the bonus on top of the penalised rating:
# 'A' 9.4 + 0.3, 'B' 8.8 + 1, 'E' 5.0 + 0.5.
test_df = oscar_calculator(test_df)
np.testing.assert_almost_equal(test_df.at[0,'adjusted_rating'], 9.7,2, "Incorrect result in oscar_calculator")
np.testing.assert_almost_equal(test_df.at[1,'adjusted_rating'], 9.8,2, "Incorrect result in oscar_calculator")
np.testing.assert_almost_equal(test_df.at[4,'adjusted_rating'], 5.5,2, "Incorrect result in oscar_calculator")
print('oscar_calculator tests run successfully')

# Round trip through every supported format. A dedicated file name is used so
# the tests never overwrite the real output.* files written by the pipeline.
# save_data sorts and ranks test_df in place, so it stays comparable with
# what is read back at the end of the chain.
save_data(test_df,'test_output','csv')
test_df2 = pd.read_csv('test_output.csv')
save_data(test_df2,'test_output','json')
test_df3 = pd.read_json('test_output.json')
save_data(test_df3,'test_output','excel')
test_df4 = pd.read_excel('test_output.xlsx')
try:
  pd.testing.assert_frame_equal(test_df,test_df4)
except AssertionError as err:
  # Fail loudly like the assertions above instead of only printing.
  raise AssertionError('Error with save_data') from err
print('save_data tests run successfully')