### Disney Dataset Creation (winth Python BeautifulSoup)
#### Scrape and clean a list of Disney wiki pages to create a dataset to further analyze

### Task 1: Get Info Box (store in Python dict)
#### Import libs

In [8]:
! pip install requests
! pip install bs4
from bs4 import BeautifulSoup as bs
import requests
from pprint import pprint



#### Load the webpage

In [72]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")
# Next, convert r to a BS obj
soup = bs(r.content) # make BS obj

# Print out the HTML
contents = soup.prettify()
#  print(contents)

In [73]:
info_box = soup.find(class_="infobox vevent") #  we find a sign with all the information about the film.
#print(info_box.prettify())
info_rows = info_box.find_all("tr") # получаем список всех блоков tr в таблице "infobox vevent"
#for row in info_rows:
#    print(row.prettify()) #  проходимся и красиво печатаем

In [11]:
movie_info = {}
def get_content_value(row_data): 
    # обрабатываем спец случай с несколькими значениями
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", ' ') for li in row_data.find_all("li")]
    else:
        return row_data.get_text(strip=True).replace("\xa0", ' ')

# Заполняем основной словарь полученной с вики информацией, нужно чистить
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info["Title"] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find('th').get_text(" ", strip=True)
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value
        

        
pprint(movie_info)

{'Box office': '$1.067 billion[1]',
 'Budget': '$200 million[1]',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Country': 'United States',
 'Directed by': 'Lee Unkrich',
 'Distributed by': 'Walt Disney StudiosMotion Pictures',
 'Edited by': 'Ken Schretzmann',
 'Language': 'English',
 'Music by': 'Randy Newman',
 'Produced by': 'Darla K. Anderson',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
                  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes[1]',
 'Screenplay by': 'Michael Arndt',
 'Starring': ['Tom Hanks',
              'Tim Allen',
              'Joan Cusack',
              'Don Rickles',
              'Wallace Shawn',
              'John Ratzenberger',
              'Estelle Harris',
              'Ned Beatty',
              'Michael Keaton',
              'Jodi Benson',
              'John Morris'],
 'Story by': ['John Lassete

### Task 2: Get info box for all movies

In [70]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
# Next, convert r to a BS obj
soup = bs(r.content) # make BS obj

# Print out the HTML
contents = soup.prettify()
#print(contents)

In [14]:
def get_content_value(row_data): 
    # обрабатываем спец случай с несколькими значениями
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", ' ') for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(strip=True).replace("\xa0", ' ')

def clean_tags(soup):
    for tag in soup.find_all(["sup","span"]):
        tag.decompose()

def get_info_box(url: str):
    r = requests.get(url)
    soup = bs(r.content)
    
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info["Title"] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find("th")
            if header:
                content_key = row.find('th').get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
    return movie_info


#### Be careful when starting this cell, you have to choose how many pages you need.

In [16]:

r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"
MOVIE_INFO_LIST = []

for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        href = movie["href"]
        full_path = base_path + href
        title = movie["title"]
        MOVIE_INFO_LIST.append(get_info_box(full_path))    
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
The London Connection
'NoneType' object has no attribute 'find'
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
Strange World
'NoneType' object has no attribute 'find_all'
490
500
Sister Act 3
'NoneType' object has no attribute 'find'
Tower of Terror
'NoneType' object has no attribute 'find_all'
Tron: Ares
'NoneType' object has no attribute 'find'


In [17]:
len(MOVIE_INFO_LIST)

502

#### Save/Reload Movie Data

In [34]:
import json

def save_data(title: str, data) -> None:
    """Saving your data into json file named 'title'
        title - name of file winth format(.txt,.json..)
        data - your data)
    """
    with open(title, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [75]:
import json

def load_data(title: str):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [20]:
save_data("disney_data_сleaned.json", MOVIE_INFO_LIST)

### Task 3: Clean our data!

In [21]:
movie_info_list = load_data("disney_data.json")

#### Subtasks
- Clean up ref [2][3].. -  <font color="green">Check</font>
- Convert running time into an int-  <font color="green">Check</font>
- Convert dates into datetime obj-  <font color="green">Check</font>
- Split up long strings-  <font color="green">Check</font>
- Convert Budget & Box office to numbers

In [22]:
#  Clean Up references (remove [1],[2],[3]...)
#  Done wiht (def clean_tags() funk)

In [23]:
#  Split up the long strings
#  Done wintgh (added elif row_data.find("br") statment in get_content_value() funk)

In [34]:
#  Convert running time into an int
#  Done
def minute_to_int(running_time):
    if running_time == 'N/A':
        return None
    if isinstance(running_time, list):
        return int(running_time[0].split(' ')[0])
    else: #  is a string
        return int(running_time.split(' ')[0])
    
for movie in movie_info_list:
        movie["Running time (int)"] = minute_to_int(movie.get("Running time", 'N/A'))

In [37]:
#  print([movie.get("Running time (int)", "N\A") for movie in movie_info_list])

In [60]:
# Convert dates into DateTime objs

from datetime import datetime
dates = [movie.get("Release date", "N\A") for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == 'N\A':
        return None
        
    date_str = clean_date(date)
    
    frms = ["%B %d, %Y", "%d %B %Y"]
    for fmt in frms:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None

In [64]:
for movie in movie_info_list:
        movie["Release date (datetime)"] = date_conversion(movie.get("Release date","N/A"))

#### Use pickle for save/load data

In [15]:
import pickle
def save_data_pickle(name, data):
    with open(name,  'wb') as f:
        pickle.dump(data, f)

In [16]:
import pickle
def load_data_pickle(name):
    with open(name, "rb") as f:
        return pickle.load(f)

In [17]:
save_data_pickle("disney_data_сleaned_formated.pickle", movie_info_list)

NameError: name 'movie_info_list' is not defined

### Task 4: Attach IMDB/Rotten Tomatoes/Metascore scores

In [18]:
movie_info_list = load_data_pickle("disney_data_сleaned_formated.pickle")

In [19]:
movie_info_list[-60]

{'Title': 'A Wrinkle in Time',
 'Directed by': 'Ava DuVernay',
 'Screenplay by': ['Jennifer Lee', 'Jeff Stockwell'],
 'Based on': ['A Wrinkle in Time', 'by', "Madeleine L'Engle"],
 'Produced by': ['Jim Whitaker', 'Catherine Hand'],
 'Starring': ['Oprah Winfrey',
  'Reese Witherspoon',
  'Mindy Kaling',
  'Gugu Mbatha-Raw',
  'Michael Peña',
  'Zach Galifianakis',
  'Chris Pine'],
 'Cinematography': 'Tobias A. Schliessler',
 'Edited by': 'Spencer Averick',
 'Music by': 'Ramin Djawadi',
 'Production companies': ['Walt Disney Pictures', 'Whitaker Entertainment'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['February 26, 2018 ( El Capitan Theatre )',
  'March 9, 2018 (United States)'],
 'Running time': '109 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$100–130 million',
 'Box office': '$133.4 million',
 'Running time (int)': 109,
 'Release date (datetime)': datetime.datetime(2018, 2, 26, 0, 0)}

#### I will use the OMDB api

In [20]:
#  http://www.omdbapi.com/?apikey=[yourkey]&

In [21]:
import requests
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": os.environ['OMDB_API_KEY'], "t":title }
    parameters_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + parameters_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    return get_omdb_info("A Wrinkle in Time")["Ratings"][1]["Value"]

In [26]:
from IPython.display import clear_output
for index, movie in enumerate(movie_info_list):
    if index % 10 == 0:
        clear_output(wait=True)
        print(index)
    title = movie["Title"]
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get("imdbRating", None)
    movie['Metascore'] = omdb_info.get("Metascore", None)
    movie["rotten tomatoes"] = get_rotten_tomato_score(omdb_info)

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500


In [28]:
save_data_pickle("disney_data_сleaned_formated.pickle", movie_info_list)

### Task 5: Save data as JSON & CSV

In [29]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [31]:
movie_info_copy[49]["Release date (datetime)"] # it's main issue

datetime.datetime(1960, 12, 21, 0, 0)

In [32]:
for movie in movie_info_copy:
    current_data = movie["Release date (datetime)"]
    if current_data:
        movie["Release date (datetime)"] = current_data.strftime("%B %d,%Y")
    else:
        movie["Release date (datetime)"] = None

In [33]:
movie_info_copy[49]["Release date (datetime)"]

'December 21,1960'

In [35]:
save_data("disney_data_final.json", movie_info_copy)

#### Convert data to CSV

In [36]:
import pandas as pd
df = pd.DataFrame(movie_info_list)


Unnamed: 0,Title,Production company,Distributed by,Release date,Running time,Country,Language,Box office,Running time (int),Release date (datetime),...,Production companies,Color process,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified,Original title,Layouts by
0,Academy Award Review of,Walt Disney Productions,RKO Radio Pictures,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,1937-05-19,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre )]",83 minutes,United States,English,$418 million,83.0,1937-12-21,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,1940-02-07,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),126.0,1940-11-13,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,"[June 27, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,1941-06-27,...,,,,,,,,,,


In [39]:
df.to_csv("disney_movie_data_final.csv")