#### **Genre descriptors for movies from 2000-2003 Box Office Mojo**


In [0]:
# make some preparation for packages
# package use
import requests
import urllib
import urllib.request
import time
import re
from bs4 import BeautifulSoup
from IPython.core.display import HTML

import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")
import copy
import datetime 
from collections import Counter

In [0]:
# Box Office Mojo yearly chart pages, one URL per year from 2000 through 2003
(
  year2000_link,
  year2001_link,
  year2002_link,
  year2003_link,
) = (
  "https://www.boxofficemojo.com/year/2000/?ref_=bo_yl_table_21",
  "https://www.boxofficemojo.com/year/2001/?ref_=bo_yl_table_21",
  "https://www.boxofficemojo.com/year/2002/?ref_=bo_yl_table_21",
  "https://www.boxofficemojo.com/year/2003/?ref_=bo_yl_table_21",
)

In [0]:
# set up dataframe to record
def read_boxOffice(year_link, num_movies=200):
  """Scrape the movie names listed on a Box Office Mojo yearly chart page.

  Parameters
  ----------
  year_link : str
      URL of the yearly chart page to download.
  num_movies : int, default 200
      Maximum number of names to keep, taken in page order.

  Returns
  -------
  pandas.DataFrame
      A frame with a single 'Name' column and a default integer index. It
      holds ``num_movies`` rows, or fewer when the page contains fewer
      "/release/" links.

  Raises
  ------
  requests.HTTPError
      If the page answers with an HTTP error status.
  """
  response = requests.get(year_link, timeout=30)
  # fail loudly on an error page instead of parsing it as if it were the chart
  response.raise_for_status()
  soup = BeautifulSoup(response.text, "html.parser")
  # keep the anchors whose href contains "/release/"; their text is the movie name
  release_links = soup.find_all("a", href=re.compile("/release/"))
  names = [
    anchor.get_text().replace('\n', '')
    for anchor in release_links[:num_movies]
  ]
  # building the column in one go avoids chained-indexing assignment, and the
  # slice above means a short page yields a short frame instead of IndexError
  return pd.DataFrame({'Name': names})

In [44]:
# Scrape each year's chart page into its own frame of movie names ('Name' column).
year2000_frame = read_boxOffice(year2000_link)
year2001_frame = read_boxOffice(year2001_link)
year2002_frame = read_boxOffice(year2002_link)
year2003_frame = read_boxOffice(year2003_link)
# Preview the first five 2003 titles as the cell output.
year2003_frame.head(5)

Unnamed: 0,Name
0,Finding Nemo
1,Pirates of the Caribbean: The Curse of the Bla...
2,The Matrix Reloaded
3,The Lord of the Rings: The Return of the King
4,Bruce Almighty


##### **GOOGLE SEARCH: Required Function and its parameters**

search(query, tld='com', lang='en', num=10, start=0, stop=None, pause=2.0)
- query : query string that we want to search for.
- tld : Top-level domain; it selects which Google site the query is sent to, e.g. google.com, google.in, or some other domain.
- lang : lang stands for language.
- num : Number of results we want.
- start : First result to retrieve.
- stop : Last result to retrieve. Use None to keep searching forever.
- pause : Lapse to wait between HTTP requests. A lapse that is too short may cause Google to block your IP. A longer lapse makes the program slower, but it is the safer and better option.
- Return : Generator (iterator) that yields found URLs. If the stop parameter is None the iterator will loop forever.


In [60]:
# google search find IMDB pages 
from googlesearch import search 
def search_link(name_string):
  """Find the IMDB title page for a movie through a Google search.

  Parameters
  ----------
  name_string : str
      Movie name; " IMDB" is appended to form the search query.

  Returns
  -------
  str or float
      The first returned URL that contains "www.imdb.com/title/", or
      ``np.nan`` when none of the inspected results does.
  """
  search_string = name_string + " " + "IMDB"
  max_results = 5
  # `stop` is the last result the generator yields, so it has to match `num`;
  # with stop=1 only the top hit was ever inspected and the loop was pointless
  for url in search(search_string, tld="co.in", num=max_results, stop=max_results, pause=2):
    # the first IMDB title URL wins
    if "www.imdb.com/title/" in url:
      return url
  return np.nan
# Try the lookup on one known title; the returned URL is shown as the cell output.
search_link("How the Grinch Stole Christmas")

'https://www.imdb.com/title/tt0170016/'

In [0]:
# fillin IMDB address
def IMDB_fillin(frame_tofill):
  """Add an 'IMDB_link' column holding the search result for every movie name.

  Parameters
  ----------
  frame_tofill : pandas.DataFrame
      Frame with a 'Name' column. It is modified in place.

  Returns
  -------
  pandas.DataFrame
      The same ``frame_tofill`` object, now carrying 'IMDB_link'. Each entry
      is whatever ``search_link`` returned for that row's name.
  """
  links = []
  # iterate by position so the lookup does not depend on the frame's index labels
  for i, name in enumerate(frame_tofill['Name']):
    links.append(search_link(name))
    # check progress
    if i % 20 == 0:
      print(i)
  # one whole-column assignment replaces the per-cell chained-indexing writes
  frame_tofill['IMDB_link'] = links
  return frame_tofill
# year 2000: look up an IMDB link for every scraped title
# (IMDB_fillin adds the column to year2000_frame itself and returns that frame)
year2000_fill = IMDB_fillin(year2000_frame)

In [0]:
# year 2001 to 2003: same IMDB link lookup as for 2000, one frame per year
year2001_fill = IMDB_fillin(year2001_frame)
year2002_fill = IMDB_fillin(year2002_frame)
year2003_fill = IMDB_fillin(year2003_frame)

In [73]:
# report, per year, how many titles ended up without an IMDB link
_blank_report = (
  ("year 2000 blanks: ", year2000_fill),
  ("year 2001 blanks: ", year2001_fill),
  ("year 2002 blanks: ", year2002_fill),
  ("year 2003 blanks: ", year2003_fill),
)
for _label, _filled in _blank_report:
  print(_label, int(_filled['IMDB_link'].isna().sum()))

year 2000 blanks:  1
year 2001 blanks:  0
year 2002 blanks:  0
year 2003 blanks:  0


In [76]:
# Preview the first five 2000 rows to eyeball the attached IMDB links.
year2000_fill.head(5)

Unnamed: 0,Name,IMDB_link
0,How the Grinch Stole Christmas,https://www.imdb.com/title/tt0170016/
1,Mission: Impossible II,https://www.imdb.com/title/tt0120755/
2,Gladiator,https://www.imdb.com/title/tt0172495/
3,The Perfect Storm,https://www.imdb.com/title/tt0177971/
4,Meet the Parents,https://www.imdb.com/title/tt0212338/


In [0]:
# fill in the one link the search missed; .loc writes into the frame itself,
# whereas frame[mask]['IMDB_link'] = ... only changes a temporary copy
year2000_fill.loc[year2000_fill['IMDB_link'].isna(), 'IMDB_link'] = "https://www.imdb.com/title/tt0193925/"

In [77]:
# Preview the first five 2001 rows to eyeball the attached IMDB links.
year2001_fill.head(5)

Unnamed: 0,Name,IMDB_link
0,Harry Potter and the Sorcerer's Stone,https://www.imdb.com/title/tt0241527/
1,Shrek,https://www.imdb.com/title/tt0126029/
2,"Monsters, Inc.",https://www.imdb.com/title/tt0198781/
3,Rush Hour 2,https://www.imdb.com/title/tt0266915/
4,The Mummy Returns,https://www.imdb.com/title/tt0209163/


In [78]:
# Preview the first five 2002 rows to eyeball the attached IMDB links.
year2002_fill.head(5)

Unnamed: 0,Name,IMDB_link
0,Spider-Man,https://www.imdb.com/title/tt0145487/
1,Star Wars: Episode II - Attack of the Clones,https://www.imdb.com/title/tt0121765/
2,Harry Potter and the Chamber of Secrets,https://www.imdb.com/title/tt0295297/
3,Signs,https://www.imdb.com/title/tt0286106/
4,My Big Fat Greek Wedding,https://www.imdb.com/title/tt0259446/


In [79]:
# Preview the first five 2003 rows to eyeball the attached IMDB links.
year2003_fill.head(5)

Unnamed: 0,Name,IMDB_link
0,Finding Nemo,https://www.imdb.com/title/tt0266543/
1,Pirates of the Caribbean: The Curse of the Bla...,https://www.imdb.com/title/tt0325980/
2,The Matrix Reloaded,https://www.imdb.com/title/tt0234215/
3,The Lord of the Rings: The Return of the King,https://www.imdb.com/title/tt0167260/
4,Bruce Almighty,https://www.imdb.com/title/tt0315327/


In [0]:
# fillin descriptors for movies each year
import pandas as pd
import pickle
import glob
import numpy as np

import warnings
warnings.filterwarnings("ignore")

from collections import Counter
import requests
import urllib
import urllib.request
import time
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from IPython.core.display import HTML
import copy
from collections import Counter

In [0]:
# check if link available before scraping
def _imdb_genres(url):
  """Return the genre slugs found in the first 'subtext' div of the page at `url` (possibly an empty list)."""
  response = requests.get(url, timeout=30)
  response.raise_for_status()
  soup = BeautifulSoup(response.text, "html.parser")
  subtext_blocks = soup.find_all("div", {"class": "subtext"})
  if not subtext_blocks:
    # page layout without a subtext block: nothing to extract
    return []
  # each match is the text between "genres=" and the following "&amp"
  return re.findall(r'genres=(.*?)&amp', str(subtext_blocks[0]))

def descriptor_fillin(frame_tofill):
  """Add up to three genre descriptors per movie, scraped from its IMDB page.

  Parameters
  ----------
  frame_tofill : pandas.DataFrame
      Frame with an 'IMDB_link' column. It is modified in place.

  Returns
  -------
  pandas.DataFrame
      The same ``frame_tofill`` object with columns 'descriptor1',
      'descriptor2' and 'descriptor3'. Slots without a genre hold NaN; this
      covers rows with no link, pages without genres, and failed requests.
      Genres beyond the third are dropped.
  """
  descriptor_cols = ['descriptor1', 'descriptor2', 'descriptor3']
  rows = []
  for position, url in enumerate(frame_tofill['IMDB_link']):
    genres = []
    # a missing link is NaN rather than a string, so there is nothing to request
    if isinstance(url, str):
      try:
        genres = _imdb_genres(url)
      except requests.RequestException as err:
        # best effort: report the failure and leave this row's descriptors empty
        print("row", position, "skipped:", err)
    # pad or truncate to exactly one value per descriptor column
    padded = (genres + [np.nan] * len(descriptor_cols))[:len(descriptor_cols)]
    rows.append(padded)
    # check progress
    if position % 50 == 0:
      print(position)
  # whole-column assignment instead of per-cell chained-indexing writes
  for slot, col in enumerate(descriptor_cols):
    frame_tofill[col] = [row[slot] for row in rows]
  return frame_tofill

In [0]:
# Scrape genre descriptors for 2001-2003; each call fetches every IMDB link in the frame.
year2001_done = descriptor_fillin(year2001_fill)
year2002_done = descriptor_fillin(year2002_fill)
year2003_done = descriptor_fillin(year2003_fill)

In [96]:
# Scrape genre descriptors for 2000; the numbers printed below are the progress counter.
year2000_done = descriptor_fillin(year2000_fill)

0
50
100
150


In [0]:
# fill in the one missing link; .loc writes into the frame itself, whereas
# frame[mask]['IMDB_link'] = ... only changes a temporary copy
# NOTE(review): descriptor_fillin has already run on this frame, so this row's
# descriptor columns stay empty unless it is scraped again
year2000_done.loc[year2000_done['IMDB_link'].isna(), 'IMDB_link'] = "https://www.imdb.com/title/tt0193925/"

In [101]:
# Write the 2000 results to year2000.csv in the working directory.
year2000_done.to_csv("year2000.csv")
# Preview the first ten rows, including the scraped descriptors.
year2000_done.head(10)

Unnamed: 0,Name,IMDB_link,descriptor1,descriptor2,descriptor3
0,How the Grinch Stole Christmas,https://www.imdb.com/title/tt0170016/,comedy,family,fantasy
1,Mission: Impossible II,https://www.imdb.com/title/tt0120755/,action,adventure,thriller
2,Gladiator,https://www.imdb.com/title/tt0172495/,action,adventure,drama
3,The Perfect Storm,https://www.imdb.com/title/tt0177971/,action,adventure,drama
4,Meet the Parents,https://www.imdb.com/title/tt0212338/,comedy,romance,
5,X-Men,https://www.imdb.com/title/tt0120903/,action,adventure,sci-fi
6,Scary Movie,https://www.imdb.com/title/tt0175142/,comedy,,
7,What Lies Beneath,https://www.imdb.com/title/tt0161081/,drama,fantasy,horror
8,Dinosaur,https://www.imdb.com/title/tt0130623/,animation,adventure,family
9,Erin Brockovich,https://www.imdb.com/title/tt0195685/,biography,drama,


In [0]:
# write the remaining years out, one CSV file per year
for _done_frame, _csv_name in (
  (year2001_done, "year2001.csv"),
  (year2002_done, "year2002.csv"),
  (year2003_done, "year2003.csv"),
):
  _done_frame.to_csv(_csv_name)