In [58]:
from bs4 import BeautifulSoup
import requests
import re
import pandas as pd
import math
import numpy as np

In [70]:
def scraper(url='http://www.imdb.com/chart/top', limit=20, timeout=10):
    """Scrape a chart page, select the desired data and return it as a DataFrame.

    Based on: https://www.geeksforgeeks.org/scrape-imdb-movie-rating-and-details-using-python/

    Parameters
    ----------
    url : str
        Address of the chart page to download.
    limit : int or None
        Maximum number of chart rows to return, in page order.
        ``None`` returns every row found.
    timeout : float
        Seconds to wait for the server before ``requests`` gives up.

    Returns
    -------
    pandas.DataFrame
        One row per movie with the columns ``movie_title``, ``rating``
        (rounded to one decimal) and ``number_of_ratings``. Holds fewer than
        ``limit`` rows -- possibly none -- when the page yields fewer
        matching entries.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status code.
    """
    response = requests.get(url, timeout=timeout)
    # Fail with the real HTTP error instead of parsing an error page.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Selecting the desired data from the site
    title_cells = soup.select('td.titleColumn')
    ratings = [tag.attrs.get('data-value')
               for tag in soup.select('td.posterColumn span[name=ir]')]
    vote_counts = [tag.attrs.get('data-value')
                   for tag in soup.select('td.posterColumn span[name=nv]')]

    # Building the DataFrame from a list of dictionaries containing the data.
    # zip() stops at the shortest sequence, so a page with fewer rows than
    # `limit` gives a shorter frame rather than an IndexError.
    records = []
    for cell, rating, votes in zip(title_cells[:limit], ratings, vote_counts):
        # Collapse all whitespace runs: "10.\n   Title\n(1966)" -> "10. Title (1966)"
        text = ' '.join(cell.get_text().split())
        # Drop the leading rank ("10. ") by pattern rather than by digit count,
        # and remove only the rank's dot so titles keep their own punctuation.
        title = re.sub(r'^\d+\.\s*', '', text)
        # Drop the trailing release year, e.g. " (1966)".
        title = re.sub(r'\s*\(\d{4}\)$', '', title)
        records.append({
            "movie_title": title,
            "rating": round(float(rating), 1),
            "number_of_ratings": pd.to_numeric(votes),
        })

    # Naming the columns keeps the schema stable even when no rows were found.
    return pd.DataFrame(records,
                        columns=["movie_title", "rating", "number_of_ratings"])

def review_penalizer(movies, votes_per_step=100000, penalty_per_step=0.1):
    """Penalize movies for having fewer ratings than the most-rated movie.

    Finds the maximum number of ratings in ``movies`` and subtracts
    ``penalty_per_step`` from each movie's rating for every full
    ``votes_per_step`` ratings it has fewer than that maximum. With the
    defaults this is a 0.1 penalty per 100k fewer ratings.

    Parameters
    ----------
    movies : pandas.DataFrame
        Must contain the columns ``rating`` and ``number_of_ratings``.
        The frame is modified in place: an ``adjusted_rating`` column is
        added (or overwritten).
    votes_per_step : int or float
        Size of one penalty step, in number of ratings. Must be positive.
    penalty_per_step : float
        Rating points subtracted per full step.

    Returns
    -------
    pandas.DataFrame
        The same ``movies`` object, returned for convenience.

    Raises
    ------
    ValueError
        If ``votes_per_step`` is not positive.
    """
    if votes_per_step <= 0:
        raise ValueError("votes_per_step must be positive")

    most_votes = movies['number_of_ratings'].max()
    # Only completed steps count, hence the floor: 299,999 fewer ratings
    # is two steps, 300,000 fewer is three.
    steps_behind = np.floor((most_votes - movies['number_of_ratings']) / votes_per_step)
    movies['adjusted_rating'] = movies['rating'] - steps_behind * penalty_per_step
    return movies

In [None]:
# Download the chart and build the DataFrame of titles, ratings and rating counts.
df = scraper()
# Adds the 'adjusted_rating' column to df in place.
review_penalizer(df)
# Bare last expression so the notebook renders the resulting table.
df