## Data scraping, loading to/from JSON file, data cleaning

### Task #1: Get Wikipedia movie info box (store in Python dictionary)

**Import necessary libraries**

In [1]:
from bs4 import BeautifulSoup as bs
import requests

**Load the webpage**

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Doctor_Strange_in_the_Multiverse_of_Madness")

# Convert webpage content to a BeautifulSoup object

soup = bs(r.content)

# Print out the HTML

contents = soup.prettify()
print(contents)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-feature-night-mode-disabled skin-night-mode-clientpref-0 vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Doctor Strange in the Multiverse of Madness - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1

In [3]:
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")

for row in info_rows:
    print(row.prettify())

<tr>
 <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
  Doctor Strange in the
  <br/>
  <span class="nowrap">
   Multiverse of Madness
  </span>
 </th>
</tr>

<tr>
 <td class="infobox-image" colspan="2">
  <span class="mw-default-size" typeof="mw:File/Frameless">
   <a class="mw-file-description" href="/wiki/File:Doctor_Strange_in_the_Multiverse_of_Madness_poster.jpg">
    <img class="mw-file-element" data-file-height="384" data-file-width="259" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/thumb/1/17/Doctor_Strange_in_the_Multiverse_of_Madness_poster.jpg/220px-Doctor_Strange_in_the_Multiverse_of_Madness_poster.jpg" srcset="//upload.wikimedia.org/wikipedia/en/1/17/Doctor_Strange_in_the_Multiverse_of_Madness_poster.jpg 1.5x" width="220"/>
   </a>
  </span>
  <div class="infobox-caption">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th class="infobox-label" scope="row" style="white-space: nowrap; padding-r

In [4]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

movie_info = {}
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text()
        content_value = get_content_value(row.find("td"))
        movie_info[content_key] = content_value

movie_info

{'title': 'Doctor Strange in the Multiverse of Madness',
 'Directed by': 'Sam Raimi',
 'Written by': 'Michael Waldron',
 'Based on': 'Marvel Comics',
 'Produced by': 'Kevin Feige',
 'Starring': ['Benedict Cumberbatch',
  'Elizabeth Olsen',
  'Chiwetel Ejiofor',
  'Benedict Wong',
  'Xochitl Gomez',
  'Michael Stuhlbarg',
  'Rachel McAdams'],
 'Cinematography': 'John Mathieson',
 'Edited by': ['Bob Murawski', 'Tia Nolan'],
 'Music by': 'Danny Elfman',
 'Productioncompany': 'Marvel Studios',
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['May 2, 2022 ( 2022-05-02 ) ( Dolby Theatre )',
  'May 6, 2022 ( 2022-05-06 ) (United States)'],
 'Running time': '126 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$294.5 million [2]',
 'Box office': '$955.8 million [3] [4]'}

### Task #2: Get info box for all movies

In [5]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Convert webpage content to a BeautifulSoup object

soup = bs(r.content)

# Print out the HTML

contents = soup.prettify()
print(contents)

<!DOCTYPE html>
<html class="client-nojs vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-feature-main-menu-pinned-disabled vector-feature-limited-width-clientpref-1 vector-feature-limited-width-content-enabled vector-feature-custom-font-size-clientpref-0 vector-feature-client-preferences-disabled vector-feature-client-prefs-pinned-disabled vector-feature-night-mode-disabled skin-night-mode-clientpref-0 vector-toc-available" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Walt Disney Pictures films - Wikipedia
  </title>
  <script>
   (function(){var className="client-js vector-feature-language-in-header-enabled vector-feature-language-in-main-page-header-disabled vector-feature-sticky-header-disabled vector-feature-page-tools-pinned-disabled vector-feature-toc-pinned-clientpref-1 vector-f

In [6]:
# Testing getting link for one movie within the list

movies = soup.select(".wikitable.sortable")
movies[0].a['href'] #Getting the "href" from element a

'/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)'

In [75]:
def get_content_value(row_data):
    # If row data has list elements
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    # If row data has elements supposed to be lists, but they're not
    elif row_data.find("br"):
        # Then get the text from the HTML by stripping the strings
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all("sup"):
        tag.decompose()
        
def remove_extra_dates(soup):
    for date in soup.select("span"):
        date.decompose()

def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)
    remove_extra_dates(soup)
    
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
    #   Commenting this out to account for the NoneType errors:
    #   elif index == 1:
    #   continue
        else:
            # Adding the following line to account for the NoneType errors:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
            
    return movie_info

In [82]:
get_info_box("https://en.wikipedia.org/wiki/Spirited_Away")

AttributeError: 'NoneType' object has no attribute 'find'

In [76]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable")

for index, movie in enumerate(movies):
    relative_path = movie.a['href']
    title = movie.a['title']
    
    print(relative_path)
    print(title)
    print("")

0
/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)
Snow White and the Seven Dwarfs (1937 film)

/wiki/Cinderella_(1950_film)
Cinderella (1950 film)

/wiki/Toby_Tyler_or_10_Weeks_with_a_Circus_(film)
Toby Tyler or 10 Weeks with a Circus (film)

/wiki/King_of_the_Grizzlies
King of the Grizzlies

/wiki/Midnight_Madness_(1980_film)
Midnight Madness (1980 film)

/wiki/DuckTales_the_Movie:_Treasure_of_the_Lost_Lamp
DuckTales the Movie: Treasure of the Lost Lamp

/wiki/The_Tigger_Movie
The Tigger Movie

/wiki/Alice_in_Wonderland_(2010_film)
Alice in Wonderland (2010 film)

/wiki/Timmy_Failure:_Mistakes_Were_Made
Timmy Failure: Mistakes Were Made

/wiki/Young_Woman_and_the_Sea
Young Woman and the Sea

10
/wiki/Out_of_My_Mind_(film)
Out of My Mind (film)



KeyError: 'title'

In [14]:
# Understand why some movies are throwing an error

r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i")

for index, movie in enumerate(movies):
    try:
        relative_path = movie.a['href']
        title = movie.a['title']
    
    except Exception as e:
        print(movie.get_text())
        print(e)
        
# From the output, it looks like some of the movie titles are not linked to in Wikipedia
# Other issues include: table headers not used in some table row elements

Escape from the Dark
'NoneType' object is not subscriptable
Trail of the Panda
'NoneType' object is not subscriptable
Growing Up Wild
'NoneType' object is not subscriptable
Expedition China
'NoneType' object is not subscriptable
29 Dates
'NoneType' object is not subscriptable
Aloha Rodeo
'NoneType' object is not subscriptable
Knights
'NoneType' object is not subscriptable
Merlin
'NoneType' object is not subscriptable
Penelope
'NoneType' object is not subscriptable
Sadé
'NoneType' object is not subscriptable
Society of Explorers and Adventurers
'NoneType' object is not subscriptable
Spooked
'NoneType' object is not subscriptable
Cruella
'NoneType' object is not subscriptable
Jungle Cruise
'NoneType' object is not subscriptable
Mulan
'NoneType' object is not subscriptable
Pirates of the Caribbean
'NoneType' object is not subscriptable
Prince Anders
'NoneType' object is not subscriptable
Pirates of the Caribbean
'NoneType' object is not subscriptable
Frozen
'NoneType' object is not subscr

In [79]:
# Removing the movies from our list that do not have a linked Wikipedia page

r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
# Instead of selecting italicized titles, select italicized + linked titles only:
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        movie_info_list.append(get_info_box(full_path))
    
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
Mighty Ducks the Movie: The First Face-Off
'NoneType' object has no attribute 'find'
230
240
250
260
270
Spirited Away
'NoneType' object has no attribute 'find'
280
290
300
310
Howl's Moving Castle
'NoneType' object has no attribute 'find'
320
330
340
350
360
370
Ponyo
'NoneType' object has no attribute 'find'
380
Tales from Earthsea
'NoneType' object has no attribute 'find'
390
400
The Secret World of Arrietty
'NoneType' object has no attribute 'find'
410
420
430
440
450
460
470
480
490
500
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
510
520
530
Toy Story 5
'NoneType' object has no attribute 'find'
540
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Big Thunder Mountain Railroad
'NoneType' object has no attribute 'find_all'
Keeper of the Lost Citie

In [22]:
movie_info_list[0]

{'title': 'Snow White and the Seven Dwarfs',
 'Directed by': ['David Hand',
  'Perce Pearce',
  'William Cottrell',
  'Larry Morey',
  'Wilfred Jackson',
  'Ben Sharpsteen'],
 'Story by': ['Ted Sears',
  'Richard Creedon',
  'Otto Englander',
  'Dick Rickard',
  'Earl Hurd',
  'Merrill De Maris',
  'Dorothy Ann Blank',
  'Webb Smith'],
 'Based on': '" Snow White " by the Brothers Grimm',
 'Produced by': 'Walt Disney',
 'Starring': ['Adriana Caselotti',
  'Roy Atwell',
  'Pinto Colvig',
  'Otis Harlan',
  'Scotty Mattraw',
  'Billy Gilbert',
  'Eddie Collins'],
 'Music by': ['Frank Churchill', 'Leigh Harline', 'Paul Smith'],
 'Productioncompany': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['December 21, 1937 ( 1937-12-21 ) ( Carthay Circle Theatre )',
  'February 4, 1938 ( 1938-02-04 ) (United States)'],
 'Running time': '83 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$1.5 million [1]',
 'Box office': '$418 millio

In [80]:
len(movie_info_list)

546

#### Save/Reload Movie Data

In [160]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [161]:
import json

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [159]:
save_data("disney_movies_data_cleaned.json", movie_info_list)

TypeError: Object of type datetime is not JSON serializable

In [85]:
# Load JSON file data back into the movie_info_list dictionary

movie_info_list = load_data("disney_movies_data_cleaned.json")

### Task #3: Data cleaning

In order to run analysis on the data, it must be cleaned and without any errors:

1. Remove references "[1]" from data
2. Split up remaining strings of names into lists
3. Investigate "'NoneType' object has no attribute" errors for some movies
4. Running time: convert from string to integer
5. Convert dates to datetime objects

### Subtasks

#### 1. Remove references "[1]" from data

In [None]:
'''

This task has been completed in In [55] using the clean_tags()
and remove_extra_dates() functions

'''

#### 2. Split up remaining strings of names into lists

In [None]:
'''

This task has been completed in In [55] lines 6 and 8

'''

#### 3. Investigate 'NoneType' object has no attribute errors for some movies

In [None]:
'''

This task has been completed in In [75] line 29 onwards

'''

#### 4. Running time: convert from string to integer

In [90]:
movie_info_list[-10]

{'title': 'Inspector Gadget',
 'Directed by': 'David Kellogg',
 'Screenplay by': ['Kerry Ehrin', 'Zak Penn', 'Audrey Wells (uncredited)'],
 'Story by': ['Kerry Ehrin', 'Dana Olsen'],
 'Based on': ['Inspector Gadget',
  'by',
  'Bruno Bianchi',
  'Jean Chalopin',
  'Andy Heyward'],
 'Produced by': ['Roger Birnbaum', 'Andy Heyward', 'Jordan Kerner'],
 'Starring': ['Matthew Broderick',
  'Rupert Everett',
  'Joely Fisher',
  'Michelle Trachtenberg',
  'Mike Hagerty',
  'Andy Dick',
  'Cheri Oteri',
  'Dabney Coleman'],
 'Cinematography': 'Adam Greenberg',
 'Edited by': ['Alan Cody', 'Thom Noble'],
 'Music by': 'John Debney',
 'Production companies': ['Walt Disney Pictures',
  'Caravan Pictures',
  'DIC Entertainment',
  'Avnet/Kerner Productions',
  'Roger Birnbaum Productions'],
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release date': ['July 23, 1999'],
 'Running time': '78 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$75–90 million',
 'Box o

In [111]:
[movie.get('Running time', 'N/A') for movie in movie_info_list]

['83 minutes',
 '88 minutes',
 '126 minutes',
 '74 minutes',
 '64 minutes',
 '70 minutes',
 '42 minutes',
 '65 min',
 '71 minutes',
 '75 minutes',
 '94 minutes',
 '73 minutes',
 '75 minutes',
 '82 minutes',
 '68 minutes',
 '74 minutes',
 '96 minutes',
 '75 minutes',
 '84 minutes',
 '77 minutes',
 '92 minutes',
 '69 minutes',
 '81 minutes',
 ['60 minutes (VHS and Wild Discovery version)', '71 minutes (original)'],
 '127 minutes',
 '93 minutes',
 '76 minutes',
 '75 minutes',
 '73 minutes',
 '85 minutes',
 '81 minutes',
 '70 minutes',
 '90 minutes',
 '80 minutes',
 '75 minutes',
 '84 minutes',
 '83 minutes',
 '72 minutes',
 '97 minutes',
 '75 minutes',
 '104 minutes',
 '93 minutes',
 '105 minutes',
 '95 minutes',
 '97 minutes',
 '134 minutes',
 '69 minutes',
 '92 minutes',
 '126 minutes',
 '79 minutes',
 '97 minutes',
 '128 minutes',
 '73 minutes',
 '91 minutes',
 '105 minutes',
 '98 minutes',
 '130 minutes',
 '89 minutes',
 '93 minutes',
 '67 minutes',
 '98 minutes',
 '100 minutes',
 '11

In [113]:
def min_to_integer(running_time):
    if running_time == 'N/A':
        return None
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else:
        return int(running_time.split(" ")[0])
    
for movie in movie_info_list:
    movie['Running time (int)'] = min_to_integer(movie.get('Running time', 'N/A'))

In [105]:
movie_info_list[10]

{'title': 'Song of the South',
 'Directed by': ['Live action:',
  'Harve Foster',
  'Animation:',
  'Wilfred Jackson'],
 'Screenplay by': ['Live action:',
  'Morton Grant',
  'Maurice Rapf',
  'Dalton S. Reymond',
  'Animation:',
  'Bill Peet',
  'George Stallings',
  'Ralph Wright'],
 'Based on': ['"', 'Uncle Remus', '"', 'by', 'Joel Chandler Harris'],
 'Produced by': ['Walt Disney', 'Perce Pearce'],
 'Starring': ['Ruth Warrick',
  'Lucile Watson',
  'Hattie McDaniel',
  'James Baskett',
  'Bobby Driscoll',
  'Luana Patten'],
 'Cinematography': 'Gregg Toland',
 'Edited by': 'William M. Morgan',
 'Music by': ['Daniele Amfitheatrof', 'Paul J. Smith'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['November 12, 1946 (premiere)', 'November 20, 1946'],
 'Running time': '94 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$2.125 million',
 'Box office': '$65 million',
 'Running time (int)': 83}

All minutes converted to integers:

In [114]:
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

[83, 88, 126, 74, 64, 70, 42, 65, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 93, 76, 75, 73, 85, 81, 70, 90, 80, 75, 84, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 73, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 131, 92, 87, 116, 93, 114, 110, 131, 101, 110, 84, 78, 75, 164, 106, 110, 99, 113, 108, 102, 85, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, None, 91, 112, 115, 95, 91, 97, 104, 74, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 112, 84, 97, 97, 114, 96, 97, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 90, 74, 90, 89, 110, 74, 93, 84, 83, 69, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 109, 94, 106, 102, 69, 88, 102, 102, 97, 111, 92, 100, 96, 96, 78, 81, 108, 89, 100, 89, 81, 92, 100, 89, 79, 91, 81, 101, 104, 103, 86, 106, 74, 93, 92, 98, 76, 95, 72, 93, 87, 70, 93, 87, 1

#### 5. Convert dates to datetime objects

In [120]:
[movie.get('Release date', 'None') for movie in movie_info_list]

['None',
 'None',
 ['November 13, 1940'],
 ['June 27, 1941'],
 'None',
 'None',
 'None',
 ['July 17, 1943'],
 'None',
 'None',
 'None',
 ['September 27, 1947'],
 'May 27, 1948',
 'None',
 ['October 5, 1949'],
 'None',
 'None',
 'None',
 'None',
 ['February 5, 1953'],
 ['July 23, 1953 (United States)'],
 ['November 9, 1953'],
 'None',
 ['August 17, 1954'],
 ['December 23, 1954'],
 'May 25, 1955',
 ['June 22, 1955'],
 ['September 14, 1955'],
 'December 22, 1955',
 'June 8, 1956',
 ['July 18, 1956'],
 ['September 4, 1956'],
 ['December 20, 1956'],
 'June 19, 1957',
 'August 28, 1957',
 ['December 25, 1957'],
 ['July 8, 1958'],
 ['August 12, 1958'],
 ['December 25, 1958'],
 ['January 29, 1959'],
 ['March 19, 1959'],
 'None',
 ['November 10, 1959'],
 'January 21, 1960 ( Sarasota, FL )',
 ['February 24, 1960'],
 'May 19, 1960',
 'None',
 ['November 1, 1960'],
 ['December 21, 1960'],
 ['January 25, 1961'],
 'March 16, 1961',
 ['June 21, 1961'],
 ['July 12, 1961'],
 ['July 17, 1961'],
 ['Decem

In [145]:
from datetime import datetime

dates = [movie.get('Release date', 'None') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def clean_year(date):
    return date.split("–")[0].strip()

def date_to_datetime(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == 'None':
        return None
    
    date_str = clean_date(date)
    date_str = clean_year(date)
    date_formats = ["%B %d, %Y", "%Y"]
    for format_ in date_formats:
        try:
            return datetime.strptime(date_str, format_)
        except:
            pass
    return None
    
for date in dates:
    print(date_to_datetime(date))
    print()

None

None

1940-11-13 00:00:00

1941-06-27 00:00:00

None

None

None

1943-07-17 00:00:00

None

None

None

1947-09-27 00:00:00

1948-05-27 00:00:00

None

1949-10-05 00:00:00

None

None

None

None

1953-02-05 00:00:00

None

1953-11-09 00:00:00

None

1954-08-17 00:00:00

1954-12-23 00:00:00

1955-05-25 00:00:00

1955-06-22 00:00:00

1955-09-14 00:00:00

1955-12-22 00:00:00

1956-06-08 00:00:00

1956-07-18 00:00:00

1956-09-04 00:00:00

1956-12-20 00:00:00

1957-06-19 00:00:00

1957-08-28 00:00:00

1957-12-25 00:00:00

1958-07-08 00:00:00

1958-08-12 00:00:00

1958-12-25 00:00:00

1959-01-29 00:00:00

1959-03-19 00:00:00

None

1959-11-10 00:00:00

None

1960-02-24 00:00:00

1960-05-19 00:00:00

None

1960-11-01 00:00:00

1960-12-21 00:00:00

1961-01-25 00:00:00

1961-03-16 00:00:00

1961-06-21 00:00:00

1961-07-12 00:00:00

1961-07-17 00:00:00

1961-12-14 00:00:00

1962-04-05 00:00:00

1962-05-17 00:00:00

1962-06-06 00:00:00

1962-09-26 00:00:00

None

None

1963-01-16 00:00:00

In [154]:
for movie in movie_info_list:
    movie['Release date (datetime)'] = date_to_datetime(movie.get('Release date', 'None'))

In [155]:
movie_info_list[26]

{'title': 'Lady and the Tramp',
 'Directed by': ['Clyde Geronimi', 'Wilfred Jackson', 'Hamilton Luske'],
 'Story by': ['Erdman Penner',
  'Joe Rinaldi',
  'Ralph Wright',
  'Don DaGradi',
  'Joe Grant'],
 'Based on': ['"Happy Dan, the Cynical Dog"', 'by', 'Ward Greene'],
 'Produced by': 'Walt Disney',
 'Starring': ['Barbara Luddy',
  'Larry Roberts',
  'Bill Thompson',
  'Dallas McKennon',
  'Bill Baucom',
  'Verna Felton',
  'Peggy Lee'],
 'Edited by': 'Don Halliday',
 'Music by': 'Oliver Wallace',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Film Distribution',
 'Release date': ['June 22, 1955'],
 'Running time': '76 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$4 million',
 'Box office': '$187 million',
 'Running time (int)': 76,
 'Release date (datetime)': datetime.datetime(1955, 6, 22, 0, 0)}

In [174]:
# Using Pickle to load and save file with the datetime object

import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

In [175]:
import pickle

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

save_data_pickle("disney_movies_data_cleaned_more.pickle", movie_info_list)

In [176]:
a = load_data_pickle("disney_movies_data_cleaned_more.pickle")

In [178]:
a[30]

{'title': 'Davy Crockett and the River Pirates',
 'Directed by': 'Norman Foster',
 'Written by': ['Tom Blackburn', 'Norman Foster'],
 'Produced by': 'Bill Walsh',
 'Starring': ['Fess Parker', 'Buddy Ebsen', 'Jeff York'],
 'Cinematography': 'Bert Glennon',
 'Edited by': 'Stanley Johnson',
 'Music by': ['Thomas W. Blackburn (lyrics)',
  'George Bruns',
  'Edward H. Plumb (orchestration)'],
 'Color process': 'Technicolor',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Film Distribution Co., Inc.',
 'Release date': ['July 18, 1956'],
 'Running time': '81 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Running time (int)': 81,
 'Release date (datetime)': datetime.datetime(1956, 7, 18, 0, 0)}

In [179]:
a == movie_info_list

True