<a href="https://colab.research.google.com/github/Imppel-9704/scrape_rottentomatoes_top300/blob/main/bs4_rottentomatoes_top300.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [44]:
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

# Module-level lists that scrape_table() fills with the rank labels, movie
# titles and movie page links it finds, position-aligned with each other.
rank, movie, link = [], [], []

def call_bs4(url="https://editorial.rottentomatoes.com/guide/best-movies-of-all-time/", timeout=30):
  """Download a page and return it parsed as a BeautifulSoup tree.

  Args:
    url: Page to fetch. Defaults to the "best movies of all time" guide.
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests would wait indefinitely on a stalled connection.

  Returns:
    BeautifulSoup object built from the response body with "html.parser".

  Raises:
    requests.HTTPError: If the server answers with a 4xx/5xx status.
    requests.RequestException: On connection problems or timeout.
  """
  # A browser-like User-Agent is sent with the request.
  headers = {"User-Agent": "Mozilla/5.0"}
  response = requests.get(url, headers=headers, timeout=timeout)
  response.raise_for_status()
  return BeautifulSoup(response.text, "html.parser")

def scrape_table():
  """Scrape the guide page and return the ranked movie list.

  Every <table class="aligncenter"> on the page is scanned for rank cells
  and title anchors. The three collected sequences are paired up by
  position, so the result relies on each rank cell having exactly one
  matching title anchor in document order.

  Side effect: the module-level lists `rank`, `movie` and `link` are
  overwritten with the values from this call. They are replaced only after
  the whole page has been processed, so a repeated or previously interrupted
  call cannot leave stale entries that would shift the pairing.

  Returns:
    dict mapping rank text -> {"title": str, "link": str or None}.
  """
  soup = call_bs4()
  tables = soup.find_all("table", class_="aligncenter")

  ranks, titles, links = [], [], []
  for table in tables:
    # Rank cells are identified by their exact inline style string.
    cells = table.find_all("td", attrs={"style": "width: 10%; height: 23px; text-align: center;"})
    ranks.extend(cell.text.strip() for cell in cells)

    anchors = table.find_all("a", class_="title")
    titles.extend(anchor.text.strip() for anchor in anchors)
    links.extend(anchor.get("href") for anchor in anchors)

  # Publish the fresh results in place so existing references to the
  # module-level lists keep working.
  rank[:] = ranks
  movie[:] = titles
  link[:] = links

  return {
      r: {"title": t, "link": l} for r, t, l in zip(ranks, titles, links)
  }

def extract_from_data(data):
  for rank, item in list(data.items()):
    response = requests.get(item['link'])
    if response.status_code != 200:
      return None
    soup = BeautifulSoup(response.text, "html.parser")
    details = soup.find_all("div", class_= "media-scorecard no-border")

    for d in details:
      script = d.find('script', {'data-json': 'mediaScorecard', 'id': 'media-scorecard-json'})

      if script: # check if script is not null
        # Convert text to dictionary
        data_dict = json.loads(script.text.strip())
        # get audience score
        audience_score = data_dict.get('audienceScore', {})
        item['description'] = data_dict.get('description')
        item['audience_like'] = audience_score.get('likedCount')
        item['audience_notlike'] = audience_score.get('notLikedCount')
        item['total_audience_review'] = audience_score.get('reviewCount')
        item['audience_score'] = audience_score.get('score')
        item['audience_sentiment'] = audience_score.get('sentiment')
        item['audience_score_percent'] = audience_score.get('scorePercent')
        # Get critics score
        critics_score = data_dict.get('criticsScore', {})
        item['critic_like'] = critics_score.get('likedCount')
        item['critic_notlike'] = critics_score.get('notLikedCount')
        item['total_critic_review'] = critics_score.get('reviewCount')
        item['critic_score'] = critics_score.get('score')
        item['critic_sentiment'] = critics_score.get('sentiment')
        item['critic_score_percent'] = critics_score.get('scorePercent')
    crews = soup.find('div', {'data-modulecastcrewmanager': 'container'})
    if crews:
      director = crews.find("p", class_= "name")
      item['director'] = director.text.strip() if director else ""

    information = soup.find_all("div", class_= "media-hero-wrap")
    for inf in information:
      text = inf.find_all("rt-text", slot= "metadataProp")
      item['metadata'] = [md.text for md in text]
      allgenre = inf.find_all("rt-text", slot= "metadataGenre")
      item['genre'] = [g.text for g in allgenre]
  return data

def transform_data(data):
  """Flatten the scraped entries and return them as a DataFrame.

  For each entry that has a 'metadata' list, its last three values are
  stored as 'rate', 'released date' and 'duration' (in that order), the
  'genre' list is joined into one comma-separated string, and 'metadata'
  is removed. Entries are modified in place.

  Args:
    data: dict mapping rank -> dict of scraped fields.

  Returns:
    pandas.DataFrame with one row per entry; the rank is in column "no".
  """
  for item in data.values():
    if "metadata" not in item:
      continue
    # Left-pad with three blanks so the unpacking always receives exactly
    # three values, even when the page exposed fewer (or no) metadata fields.
    padded = ["", "", ""] + list(item['metadata'])
    item['rate'], item['released date'], item['duration'] = padded[-3:]
    item['genre'] = ', '.join(item.get('genre', []))
    item.pop("metadata")

  df = pd.DataFrame.from_dict(data, orient='index').reset_index()
  df.rename(columns={"index": "no"}, inplace=True)
  return df

def save_to_csv(df, path="top300_rottentomatoes.csv"):
  """Write the DataFrame to a CSV file without the index column.

  Args:
    df: pandas.DataFrame to write.
    path: Destination file. Defaults to "top300_rottentomatoes.csv" in the
      current working directory.
  """
  df.to_csv(path, index=False)

def main():
  """Run the pipeline: scrape the ranking, enrich each movie, tabulate, write the CSV."""
  # Each stage consumes the previous stage's return value.
  save_to_csv(transform_data(extract_from_data(scrape_table())))

# Run the scraping pipeline only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
  main()