<a href="https://colab.research.google.com/github/MalikHasnat1999/Web-Scraping/blob/master/Disney_Dataset_Creation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [None]:
# import libraries
import requests
from bs4 import BeautifulSoup

## Load the webpage

In [None]:
# Download the article used to prototype the infobox parser. The timeout
# keeps the cell from hanging forever on a stalled connection.
r = requests.get("https://en.wikipedia.org/wiki/Tron:_Legacy", timeout=30)
# Stop here with a clear HTTP error instead of parsing an error page below.
r.raise_for_status()

# Parse the HTML. Naming the parser explicitly silences bs4's
# GuessedAtParserWarning and gives the same tree in every environment.
soup = BeautifulSoup(r.content, "html.parser")
# Pretty-printed markup, useful for inspecting the page structure by eye.
contents = soup.prettify()
# print(contents)

### Task#1: Creating Dictionary of info box

In [None]:
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")


def get_content_value(td):
  """Return the text held by an infobox data cell.

  A cell that contains list items yields a list with one cleaned string per
  ``<li>``; any other cell yields a single cleaned string. Cleaning joins the
  text fragments with a space, strips surrounding whitespace and replaces
  non-breaking spaces with ordinary spaces.

  Note: a later cell in this notebook defines a function with the same name.
  """
  def cleaned(node):
    return node.get_text(" ", strip=True).replace("\xa0", " ")

  if not td.find("li"):
    return cleaned(td)
  return [cleaned(item) for item in td.find_all("li")]


# Build a dictionary from the infobox rows of the page loaded above:
# row 0 holds the film title, row 1 is skipped, and every later row
# contributes one "header text -> cell content" entry.
movie_info = {}
for position, row in enumerate(info_rows):
  if position == 1:
    continue
  heading = row.th.get_text(" ", strip=True)
  if position == 0:
    movie_info["title"] = heading
  else:
    movie_info[heading] = get_content_value(row.find("td"))

movie_info

### Task#2: Get info box for all movies

In [None]:
# Download the list of Walt Disney Pictures films. The timeout keeps the
# cell from hanging forever on a stalled connection.
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films", timeout=30)
# Stop here with a clear HTTP error instead of parsing an error page below.
r.raise_for_status()

# Parse the HTML. Naming the parser explicitly silences bs4's
# GuessedAtParserWarning and gives the same tree in every environment.
soup = BeautifulSoup(r.content, "html.parser")
# Pretty-printed markup, useful for inspecting the page structure by eye.
contents = soup.prettify()
# print(contents)

In [None]:
movies = soup.select(".wikitable.sortable i a")
movies[8]['href']

In [None]:

def get_content_value(td):
  """Return the text held by an infobox data cell.

  Args:
    td: The ``<td>`` element of an infobox row.

  Returns:
    list[str] | str: one string per ``<li>`` when the cell contains list
    items; one string per text fragment when the cell separates values with
    ``<br>``; otherwise the whole cell text as a single string. In every
    case non-breaking spaces are replaced with ordinary spaces.
  """
  def clean(text):
    return text.replace("\xa0", " ")

  if td.find("li"):
    return [clean(li.get_text(" ", strip=True)) for li in td.find_all("li")]
  if td.find("br"):
    # This branch used to skip the non-breaking-space replacement, so values
    # such as "85\xa0minutes" could not be split on " " by later cleaning.
    return [clean(text) for text in td.stripped_strings]
  return clean(td.get_text(" ", strip=True))

def clean_tags(soup):
  """Delete every ``<sup>`` and ``<span>`` element from ``soup`` in place.

  Intended to drop reference markers such as [1], [2]; note that all
  ``<span>`` elements are removed as well. Nothing is returned.
  """
  unwanted = soup.find_all(['sup', 'span'])
  for node in unwanted:
    node.decompose()

def get_info_box(url):
  """Scrape the infobox of a Wikipedia film article into a dictionary.

  Args:
    url: Full URL of the article to download.

  Returns:
    dict: ``"title"`` taken from the header of the first infobox row, plus
    one entry per later row that has both a header (``<th>``) and a data
    (``<td>``) cell, keyed by the header text with the value produced by
    ``get_content_value``.

  Raises:
    requests.RequestException: the download failed, timed out or returned
      an HTTP error status.
    ValueError: the page has no ``infobox vevent`` element, or its first
      row carries no title header.
  """
  r = requests.get(url, timeout=30)
  r.raise_for_status()

  # Name the parser explicitly so results do not depend on which optional
  # parsers happen to be installed.
  soup = BeautifulSoup(r.content, "html.parser")
  info_box = soup.find(class_="infobox vevent")
  if info_box is None:
    raise ValueError(f"No infobox found at {url}")
  info_rows = info_box.find_all("tr")

  # Remove <sup>/<span> elements before any text is read from the rows.
  clean_tags(soup)

  movie_info = {}
  for index, tr in enumerate(info_rows):
    header = tr.find("th")
    if header is None:
      # Rows without a header (e.g. the poster image) carry no key.
      continue
    if index == 0:
      movie_info["title"] = header.get_text(" ", strip=True)
      continue
    data = tr.find("td")
    if data is None:
      # Header-only rows have no value; skip them instead of failing the
      # whole film.
      continue
    movie_info[header.get_text(" ", strip=True)] = get_content_value(data)

  if "title" not in movie_info:
    # Later cells index every record by 'title', so fail early and clearly.
    raise ValueError(f"Infobox at {url} has no title row")

  return movie_info


In [None]:
get_info_box("https://en.wikipedia.org/wiki/One_Little_Indian_(film)")

In [None]:
# Fetch the film list and collect every italicised link inside the sortable
# wikitables; each link points at one film article.
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = BeautifulSoup(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org"
movie_info_list = []
for index, movie in enumerate(movies):
  # Progress marker on every tenth link.
  if not index % 10:
    print(index)
  try:
    relative_path = movie['href']
    full_path = base_path + relative_path
    # Not used below; the lookup raises KeyError for links without a title
    # attribute, which sends them to the except branch.
    title = movie['title']
    info = get_info_box(full_path)
  except Exception as e:
    # Best effort: report the film that failed and carry on with the rest.
    print(movie.get_text(), e, sep="\n")
  else:
    movie_info_list.append(info)
  

In [None]:
len(movie_info_list)

### Task#3: Clean our data

* ~~Clean up references [1], [2] etc~~
* Convert running time into an integer
* Convert Dates into DateTime object
* ~~Split up long strings~~
* Convert Budget & Box office to number


#### Convert running time to an integer

In [None]:
[movie.get('Running time', 'N/A') for movie in movie_info_list]

In [None]:
def minute_to_integer(running_time):
  """Convert an infobox running time such as "85 minutes" into an int.

  Args:
    running_time: A string, a list of strings (only the first entry is
      used), ``None``, or the sentinel ``'N/A'`` for a missing value.

  Returns:
    int | None: the leading whole number of the text, or ``None`` when the
    value is missing, the list is empty, or the text does not start with an
    integer.
  """
  if running_time is None or running_time == 'N/A':
    return None

  if isinstance(running_time, list):
    if not running_time:
      return None
    running_time = running_time[0]

  # split() without an argument breaks on any whitespace, including the
  # non-breaking spaces Wikipedia puts between a number and its unit.
  tokens = running_time.split()
  if not tokens:
    return None
  try:
    return int(tokens[0])
  except ValueError:
    # e.g. "approx. 90 minutes": no usable leading integer.
    return None



for movie in movie_info_list:
  movie["Running time (int)"] = minute_to_integer(movie.get('Running time', 'N/A'))


In [None]:
movie_info_list[0]

In [None]:
[movie.get('Running time (int)', 'N/A') for movie in movie_info_list]

#### Convert Budget & Box office to number


In [None]:
[movie.get('Budget', 'N/a') for movie in movie_info_list]

TODO

* Given either a string or a list of strings as input, return a number (int or float) equal to the monetary value

* money_conversion("$12.2 million") --> 12200000  ## Word Syntax

* money_conversion("$790,000") --> 790000  ## Value Syntax

In [None]:
import re

# Scale words that may follow a dollar figure, e.g. "$12.2 million".
amounts = r"thousand|million|billion"
# Digits with optional ",ddd" thousands groups and an optional decimal part.
number = r"\d+(,\d{3})*(\.\d+)?"

# "$790,000"
value_re = rf"\${number}"
# "$12.2 million", "$100-200 million", "$100–200 million", "$40 to 50 million"
word_re = rf"\${number}(-|–|\sto\s)?({number})?\s({amounts})"


def word_to_value(word):
  """Return the multiplier for a lower-case scale word.

  Raises:
    KeyError: ``word`` is not "thousand", "million" or "billion".
  """
  value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
  return value_dict[word]


def parse_value_syntax(string):
  """Return the first number found in ``string`` as a float.

  Thousands separators are removed before conversion. ``string`` must
  contain a number; ``money_conversion`` only passes text that already
  matched ``value_re``.
  """
  value_string = re.search(number, string).group()
  return float(value_string.replace(",", ""))


def parse_word_syntax(string):
  """Return the first number in ``string`` multiplied by its scale word.

  For a range such as "$100-200 million" the lower bound is used. The scale
  word is matched case-insensitively so "Million" works like "million".
  """
  value = parse_value_syntax(string)
  word = re.search(amounts, string, flags=re.I).group().lower()
  return value * word_to_value(word)


def money_conversion(money):
  """Convert an infobox money entry into a number of dollars.

  Args:
    money: A string, a list of strings (only the first entry is used),
      ``None``, or the sentinel ``'N/A'`` for a missing value.

  Returns:
    float | None: the amount in dollars, or ``None`` when the value is
    missing, the list is empty, or no dollar amount is found.
  """
  if money is None or money == 'N/A':
    return None
  if isinstance(money, list):
    if not money:
      return None
    money = money[0]

  # Try the word form first: "$12.2 million" also matches value_re, which
  # would wrongly yield 12.2. The match is case-insensitive so capitalised
  # scale words are not mistaken for plain values.
  word_syntax = re.search(word_re, money, flags=re.I)
  if word_syntax:
    return parse_word_syntax(word_syntax.group())

  value_syntax = re.search(value_re, money)
  if value_syntax:
    return parse_value_syntax(value_syntax.group())

  return None


In [None]:
for movie in movie_info_list:
  movie['Budget (float)'] = money_conversion(movie.get('Budget', 'N/A'))
  movie['Box office (float)'] = money_conversion(movie.get('Box office', 'N/A'))

In [None]:
movie_info_list[1]

#### Convert Dates to DateTime object

In [None]:
[movie.get('Release date', 'N/A') for movie in movie_info_list]

In [None]:
from datetime import datetime

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
  """Return the text before the first "(" with surrounding whitespace removed.

  Example: "June 22, 1950 (US)" becomes "June 22, 1950".
  """
  return date.split("(")[0].strip()

def date_conversion(date):
  """Convert an infobox release date into a ``datetime``.

  Args:
    date: A string, a list of strings (only the first entry is used),
      ``None``, or the sentinel ``'N/A'`` for a missing value.

  Returns:
    datetime | None: the parsed date, or ``None`` when the value is missing,
    the list is empty, or the text matches none of the supported formats
    ("June 22, 1950", "22 June 1950", "1950").
  """
  if isinstance(date, list):
    if not date:
      return None
    date = date[0]
  if date is None or date == 'N/A':
    return None
  date_str = clean_date(date)
  # The last entry used to be "Y" (no percent sign), which only matches the
  # literal letter Y, so year-only dates were never parsed.
  fmts = ["%B %d, %Y", "%d %B %Y", "%Y"]
  for fmt in fmts:
    try:
      return datetime.strptime(date_str, fmt)
    except ValueError:
      # Wrong format for this text; try the next one.
      continue
  return None

In [None]:
for movie in movie_info_list: 
  movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [None]:
movie_info_list[2]

In [None]:
import pickle

def save_data_pickle(name, data):
  """Pickle ``data`` into the file at path ``name``.

  The file is opened in binary write mode, so an existing file at that path
  is overwritten.
  """
  with open(name, mode='wb') as sink:
    pickle.Pickler(sink).dump(data)

In [None]:
import pickle

def load_data_pickle(name):
  """Read the file at path ``name`` and return the object pickled in it.

  Unpickling can execute arbitrary code, so only load files you created
  yourself.
  """
  with open(name, mode='rb') as source:
    restored = pickle.Unpickler(source).load()
  return restored

In [None]:
save_data_pickle('disney_movie_cleaned_data.pickle', movie_info_list)

### Task#4: Attach IMDb/Rotten Tomatoes/Metascore ratings using the OMDb API

In [None]:
import requests
import os
import urllib

# http://www.omdbapi.com/?apikey=[yourkey]&

def get_omdb_info(title, api_key=None):
  """Look up a film by title in the OMDb API and return the decoded JSON.

  Args:
    title: Film title, sent as the ``t`` query parameter.
    api_key: OMDb API key. When omitted, the ``OMDB_API_KEY`` environment
      variable is used, falling back to the key that was previously embedded
      in this notebook.

  Returns:
    The parsed JSON body of the response.

  Raises:
    requests.RequestException: the request failed or timed out.
  """
  # NOTE(review): the fallback below is a credential committed to the
  # repository. Rotate it and supply the key through OMDB_API_KEY instead.
  api_key = api_key or os.environ.get("OMDB_API_KEY", "2704f42a")
  base_url = "http://www.omdbapi.com/"
  parameters = {'apikey': api_key, 't': title}
  # Let requests build the query string. The previous code called
  # urllib.parse.urlencode after a bare `import urllib`, which only works
  # when some other module has already imported urllib.parse.
  return requests.get(base_url, params=parameters, timeout=30).json()

def get_rotten_tomatoes_score(omdb_info):
  """Return the Rotten Tomatoes rating value from an OMDb response.

  Scans the entries under the ``'Ratings'`` key and returns the ``'Value'``
  of the first one whose ``'Source'`` is ``'Rotten Tomatoes'``; returns
  ``None`` when there is no such entry or no ``'Ratings'`` key.
  """
  matching_values = (
      entry['Value']
      for entry in omdb_info.get('Ratings', [])
      if entry['Source'] == 'Rotten Tomatoes'
  )
  return next(matching_values, None)

get_omdb_info

In [None]:
# Look up each scraped film in OMDb by title and copy its three rating
# fields onto the record; a field missing from the response is stored as None.
for movie in movie_info_list:
  omdb_info = get_omdb_info(movie['title'])
  movie['imdb'] = omdb_info.get('imdbRating')
  movie['metascore'] = omdb_info.get('Metascore')
  movie['rotten_tomatoes'] = get_rotten_tomatoes_score(omdb_info)

In [None]:
movie_info_list[2]

### Task#5: Save data in CSV/JSON format


In [None]:
# changing the datetime object to string to save the file
for movie in movie_info_list:
  current_date = movie['Release date (datetime)']
  if current_date:
    movie['Release date (datetime)'] = current_date.strftime('%B %d, %Y')
  else:
    movie['Release date (datetime)'] = None

In [None]:
import json

def save_data(name, data):
  """Write ``data`` as JSON to the file at path ``name``.

  The file is written as UTF-8 with a two-space indent, and non-ASCII
  characters are kept as they are rather than escaped.
  """
  dump_options = {"ensure_ascii": False, "indent": 2}
  with open(name, mode='w', encoding='utf-8') as out_file:
    json.dump(data, out_file, **dump_options)

In [None]:
save_data('disney_data_final.json', movie_info_list)

In [None]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

df.info()

In [None]:
df.to_csv('disney_data_final.csv')