# IMDB
For each movie we obtained: the movie title, IMDB rating, plot description, budget, box office gross, running time, and opening weekend gross.

In [1]:
import pandas as pd
import numpy as np
import requests
from requests import get
from bs4 import BeautifulSoup
import re

In [None]:
# Parallel accumulators for the IMDB scrape: entry i of every list belongs to
# the same movie. Each name gets its own, independent empty list.
title, years, runtimes = [], [], []
imdb_rating, votes = [], []
budgets, weekend_gross, gross = [], [], []
plot_sums, release_date, genres = [], [], []

In [2]:
# HTTP headers passed to every requests.get() call in this notebook: an
# English language preference and a browser-like User-Agent string.
headers = {
    'Accept-Language': 'en-US, en;q=0.5',
    'User-Agent': 'Mozilla/5.0',
}

In [None]:
# Values for the `start=` query parameter of the IMDB search URL:
# 1, 51, 101, ..., 651 (14 values, stepping by 50).
# NOTE(review): the array has only 14 entries, so any slice that starts past
# index 13 is empty.
pages = np.arange(1, 680, 50)

# Genre slugs substituted into the `genres=` query parameter. All slugs are
# lower-case ("sci-fi" was previously the only mixed-case entry).
Genres = [
    "action",
    "adventure",
    "animation",
    "biography",
    "comedy",
    "crime",
    "drama",
    "family",
    "fantasy",
    "film-noir",
    "history",
    "horror",
    "music",
    "musical",
    "mystery",
    "romance",
    "sci-fi",
    "sport",
    "thriller",
    "war",
    "western",
]

In [None]:
def _node_text(node, default='-'):
  """Return ``node.text``, or ``default`` when the lookup found nothing (``None``)."""
  return default if node is None else node.text


def _boxoffice_value(detail, testid):
  """Return the <label> text inside the <li> whose data-testid is ``testid``, or '-'."""
  item = detail.find('li', attrs={'data-testid': testid})
  return '-' if item is None else _node_text(item.label)


def requestpage(genre, page):
  """Scrape one IMDB search-result page and record every movie listed on it.

  For each result card the function reads the title, year, runtime, rating,
  vote count, plot summary and genre text from the listing, then fetches the
  movie's own page for budget, opening-weekend gross, worldwide gross and
  release date. Fields that cannot be found are recorded as '-'.

  Args:
    genre: genre slug placed in the ``genres=`` query parameter.
    page: value placed in the ``start=`` query parameter (converted with str()).

  Returns:
    None. One entry per movie is appended to each of the module-level lists
    ``title``, ``years``, ``runtimes``, ``imdb_rating``, ``votes``,
    ``plot_sums``, ``genres``, ``budgets``, ``weekend_gross``, ``gross`` and
    ``release_date``.

  Raises:
    Exceptions raised by ``requests.get`` (including timeouts) propagate.
  """
  search_url = ('https://www.imdb.com/search/title/?title_type=feature&genres=' + genre
                + '&start=' + str(page) + '&explore=genres&ref_=adv_nxt')
  # A timeout keeps a stalled connection from hanging the whole scrape.
  listing = BeautifulSoup(requests.get(search_url, headers=headers, timeout=30).text, 'html.parser')

  for container in listing.find_all('div', class_='lister-item mode-advanced'):
    name = container.h3.a.text
    year = _node_text(container.h3.find('span', class_='lister-item-year'))
    runtime = _node_text(container.find('span', class_='runtime'))

    rating_node = container.find('strong')
    rating = float(rating_node.text) if rating_node is not None else '-'

    # Store the text of the first name="nv" span, not the bs4 Tag object itself.
    vote = _node_text(container.find('span', attrs={'name': 'nv'}))

    # The plot summary is read from the second "text-muted" paragraph of the
    # card; fall back to '-' when the card has fewer than two.
    muted = container.find_all('p', class_='text-muted')
    plot_sum = muted[1].text if len(muted) > 1 else '-'

    movie_genres = _node_text(container.find('span', class_='genre'))

    # Follow the title link to the movie's own page for the box-office details.
    href = container.h3.a.get('href')
    detail = BeautifulSoup(
        requests.get('https://www.imdb.com' + href, headers=headers, timeout=30).text,
        'html.parser')

    budget = _boxoffice_value(detail, 'title-boxoffice-budget')
    opening_weekend_gross = _boxoffice_value(detail, 'title-boxoffice-openingweekenddomestic')
    global_gross = _boxoffice_value(detail, 'title-boxoffice-cumulativeworldwidegross')

    date = '-'
    date_item = detail.find('li', attrs={'data-testid': 'title-details-releasedate'})
    if date_item is not None:
      date = _node_text(date_item.find(
          'a',
          class_='ipc-metadata-list-item__list-content-item ipc-metadata-list-item__list-content-item--link'))

    # Append only after every field has been gathered, so an exception while
    # parsing a movie cannot leave the lists with different lengths.
    title.append(name)
    years.append(year)
    runtimes.append(runtime)
    imdb_rating.append(rating)
    votes.append(vote)
    plot_sums.append(plot_sum)
    genres.append(movie_genres)
    budgets.append(budget)
    weekend_gross.append(opening_weekend_gross)
    gross.append(global_gross)
    release_date.append(date)


In [None]:
# Scrape result pages 126..130 (zero-based page index, 50 results apart) of
# every genre. The `start=` offsets are computed directly instead of slicing
# `pages`: as defined in this notebook `pages` has only 14 entries, so
# `pages[126:131]` is empty and the loop body would never run.
for genre in Genres:
  for page in range(1 + 126 * 50, 1 + 131 * 50, 50):  # 6301, 6351, ..., 6501
    requestpage(genre, page)
    print(genre, page)

In [None]:
# Number of entries collected in the `years` list so far.
len(years)

4750

In [None]:
# Combine the scraped lists into one table, one column per list, then show
# the first rows as the cell output.
movies = pd.DataFrame(dict(
    movie=title,
    year=years,
    runtime=runtimes,
    imdb_rating=imdb_rating,
    release_date=release_date,
    plot_summary=plot_sums,
    genres=genres,
    budget=budgets,
    vote=votes,
    gross_earning=gross,
    opening_weekend_gross=weekend_gross,
))

movies.head()

In [None]:
# Remove rows that exactly duplicate an earlier row, modifying `movies` in place.
movies.drop_duplicates(inplace=True)

In [None]:
# Print a summary of the table: columns, non-null counts and dtypes.
movies.info()

In [None]:
# Write the table to a CSV file in the working directory (pandas also writes
# the row index as the first column by default).
movies.to_csv('imdb126_131.csv')

In [None]:
# Send the exported CSV from the Colab runtime to the browser as a download.
from google.colab import files
files.download("imdb126_131.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# IMDB
Obtain: which actors and directors have been involved in Best Picture films

In [11]:
# Parallel accumulators for the Best Picture scrape: one (year, name, role)
# entry per credited person. Each name gets its own empty list.
years, names, roles = [], [], []

In [None]:
# Fetch the IMDB list of Best Picture winners (newest first), then visit each
# film's page and record its listed actors and directors together with the
# film's year in the `years` / `names` / `roles` lists.
best_pictures = requests.get('https://www.imdb.com/search/title/?count=100&groups=oscar_best_picture_winners&sort=year%2Cdesc&ref_=nv_ch_osc', headers = headers, timeout=30)
best_pictures = BeautifulSoup(best_pictures.text, 'html.parser')
pictures = best_pictures.find_all('div', class_='lister-item mode-advanced')
for picture in pictures:
  year = picture.h3.find('span', class_='lister-item-year').text
  print(year)
  # Named `href` rather than `id` so the builtin id() is not overwritten for
  # the rest of the notebook session.
  href = picture.h3.a.get('href')
  detail = BeautifulSoup(requests.get('https://www.imdb.com' + href, headers = headers, timeout=30).text, 'html.parser')
  section = detail.find('section', {'data-testid':'title-cast'})
  if section is None:
    # No cast section on this page: nothing to record for this film.
    continue
  actors = section.find_all('a', {'data-testid':'title-cast-item__actor'})
  for actor in actors:
    actor = actor.get_text()
    years.append(year)
    names.append(actor)
    roles.append('actor')
  # Every link inside the first metadata list item of the cast section is
  # recorded as a director (presumably that item is the "Director" row -
  # confirm against the page layout).
  li = section.find('li', class_='ipc-metadata-list__item')
  if li is None:
    continue
  directors = li.find_all('a')
  for director in directors:
    director = director.text
    print(director)
    years.append(year)
    names.append(director)
    roles.append('director')

In [14]:
# One row per credited person: film year, person name, and role
# ('actor' or 'director').
picture_involve = pd.DataFrame(dict(year=years, name=names, role=roles))

In [None]:
# Show the first rows of the table as the cell output.
picture_involve.head()

In [15]:
# Write the table to a CSV file in the working directory (the row index is
# written as the first column by default).
picture_involve.to_csv('picture_involve.csv')

# Rotten Tomatoes
obtain: critic score, audience score, runtime, MPAA rating, studio, theater release date, DVD release date, list of genres, abridged list of cast, and abridged list of directors

In [None]:
from google.colab import drive
# Mount Google Drive under /content/gdrive, the directory the later cells read
# from and copy to (the previous mount point '/conten/gdrive' was a typo).
drive.mount('/content/gdrive')

In [None]:
# Load an IMDB table from the mounted Drive folder (presumably a batch
# exported by the IMDB scraping cells - confirm the file's origin).
imdb = pd.read_csv('/content/gdrive/movie/imdb131_136.csv')

In [None]:
def _rotten_slug(movie_title):
  """Turn a movie title into the lower-case, underscore-separated slug used in the Rotten Tomatoes URL."""
  slug = (movie_title.replace(' - ', '_').replace('-', '_').replace(': ', '_')
          .replace(' ', '_').replace('&', 'and').lower())
  # The hyphen goes last in the character class so it is a literal. Written as
  # `0-9-_` it formed the range '9'..'_' and let : ; < = > ? @ [ \ ] ^ through.
  return re.sub(r'[^A-Za-z0-9_-]+', '', slug)


# Create the result columns up front so the per-row progress print below can
# always select them, even before any page has supplied a value.
for column in ('audience_score', 'tomatometer_score', 'rotten_info', 'cast'):
  if column not in imdb.columns:
    imdb[column] = None

# For every movie, guess its Rotten Tomatoes URL from the title, fetch the
# page and copy the scores, info block and cast block into `imdb`.
for i, idx in enumerate(imdb.index):
  name = _rotten_slug(imdb.loc[idx, 'movie'])
  typ = 'm'
  URL = f'https://www.rottentomatoes.com/{typ}/{name}/'
  print(URL)
  page = requests.get(URL, headers = headers, timeout=30)

  soup = BeautifulSoup(page.content, 'html.parser')
  section = soup.find(id='topSection')
  if section is None: # Error handling for titles that cannot be found
    continue
  rat_percentage = section.find('score-board')
  if rat_percentage is not None:  # some pages may lack the score board element
    imdb.loc[idx,'audience_score'] = rat_percentage.attrs.get("audiencescore")
    imdb.loc[idx,'tomatometer_score'] = rat_percentage.attrs.get("tomatometerscore")
  info = soup.find('ul', class_='content-meta info')
  if info is not None:
    imdb.loc[idx,'rotten_info'] = info.get_text()
  cast = soup.find('div', class_='castSection')
  if cast is not None:
    imdb.loc[idx,'cast'] = cast.get_text()
  print(imdb.loc[idx,['audience_score','tomatometer_score', 'rotten_info', 'cast']])
  print(f'Progress: {round(i/len(imdb)*100,2)}%')

In [None]:
# Show the first rows of the table as the cell output.
imdb.head()

In [None]:
# Print a summary of the table: columns, non-null counts and dtypes.
imdb.info()

In [None]:
# Write `imdb`, including any Rotten Tomatoes columns added to it, to a CSV
# file in the working directory.
imdb.to_csv('ir131.csv')

In [None]:
# Shell escape: copy the exported CSV into the Drive folder.
!cp -r 'ir131.csv' '/content/gdrive/movie'