In [1]:
import json
import pandas as pd
import numpy as np

In [2]:
file_dir = "Data"

In [3]:
with open(f'{file_dir}/wikipedia-movies.json', mode='r') as file:
    wiki_movies_raw = json.load(file)

In [4]:
len(wiki_movies_raw)

7311

In [5]:
wiki_movies_raw[:5]

[{'url': 'https://en.wikipedia.org/wiki/The_Adventures_of_Ford_Fairlane',
  'year': 1990,
  'imdb_link': 'https://www.imdb.com/title/tt0098987/',
  'title': 'The Adventures of Ford Fairlane',
  'Directed by': 'Renny Harlin',
  'Produced by': ['Steve Perry', 'Joel Silver'],
  'Screenplay by': ['David Arnott', 'James Cappe', 'Daniel Waters'],
  'Story by': ['David Arnott', 'James Cappe'],
  'Based on': ['Characters', 'by Rex Weiner'],
  'Starring': ['Andrew Dice Clay',
   'Wayne Newton',
   'Priscilla Presley',
   'Lauren Holly',
   'Morris Day',
   'Robert Englund',
   "Ed O'Neill"],
  'Narrated by': 'Andrew "Dice" Clay',
  'Music by': ['Cliff Eidelman', 'Yello'],
  'Cinematography': 'Oliver Wood',
  'Edited by': 'Michael Tronick',
  'Productioncompany ': 'Silver Pictures',
  'Distributed by': '20th Century Fox',
  'Release date': ['July 11, 1990', '(', '1990-07-11', ')'],
  'Running time': '102 minutes',
  'Country': 'United States',
  'Language': 'English',
  'Budget': '$20 million',


In [6]:
kaggle_metadata = pd.read_csv(f'{file_dir}/movies_metadata.csv', low_memory=False)
ratings = pd.read_csv(f'{file_dir}/ratings.csv')

In [7]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [8]:
wiki_movies_df = pd.DataFrame(wiki_movies_raw)
wiki_movies_df.columns.tolist()

['url',
 'year',
 'imdb_link',
 'title',
 'Directed by',
 'Produced by',
 'Screenplay by',
 'Story by',
 'Based on',
 'Starring',
 'Narrated by',
 'Music by',
 'Cinematography',
 'Edited by',
 'Productioncompany ',
 'Distributed by',
 'Release date',
 'Running time',
 'Country',
 'Language',
 'Budget',
 'Box office',
 'Written by',
 'Genre',
 'Theme music composer',
 'Country of origin',
 'Original language(s)',
 'Producer(s)',
 'Editor(s)',
 'Production company(s)',
 'Original network',
 'Original release',
 'Productioncompanies ',
 'Executive producer(s)',
 'Production location(s)',
 'Distributor',
 'Picture format',
 'Audio format',
 'Voices of',
 'Followed by',
 'Composer(s)',
 'Created by',
 'Also known as',
 'Opening theme',
 'No. of episodes',
 'Preceded by',
 'Author',
 'Publisher',
 'Publication date',
 'Media type',
 'Pages',
 'ISBN',
 'OCLC',
 'LC Class',
 'Cover artist',
 'Series',
 'Set in',
 'Adaptation by',
 'Suggested by',
 'Biographical data',
 'Born',
 'Died',
 'Resti

In [9]:
wiki_movies = [movie for movie in wiki_movies_raw
               if ('Director' in movie or 'Directed by' in movie)
                   and 'imdb_link' in movie
                   and 'No. of episodes' not in movie]
len(wiki_movies)

7076

In [10]:
wiki_movies_df = pd.DataFrame(wiki_movies)
wiki_movies_df.sample(n=5)

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Hepburn,Literally,Cantonese,Chinese,Yiddish,Arabic,Romanized,Russian,Hebrew,Polish
5158,https://en.wikipedia.org/wiki/Mother_and_Child...,2010,https://www.imdb.com/title/tt1121977/,Mother and Child,Rodrigo García,"[Lisa Maria Falcone, Julie Lynn]",,,,"[Naomi Watts, Annette Bening, Kerry Washington...",...,,,,,,,,,,
3405,https://en.wikipedia.org/wiki/School_of_Rock,2003,https://www.imdb.com/title/tt0332379/,School of Rock,Richard Linklater,Scott Rudin,,,,"[Jack Black, Joan Cusack, Mike White, Sarah Si...",...,,,,,,,,,,
3493,https://en.wikipedia.org/wiki/Confessions_of_a...,2004,https://www.imdb.com/title/tt0361467/,Confessions of a TeenageDrama Queen,Sara Sugarman,"[Robert Shapiro, Jerry Leider]",Gail Parent,,"[Confessions of a Teenage Drama Queen, by, Dya...","[Lindsay Lohan, Adam Garcia, Glenne Headly, Al...",...,,,,,,,,,,
4434,https://en.wikipedia.org/wiki/I_Am_Legend_(film),2007,https://www.imdb.com/title/tt0480249/,I Am Legend,Francis Lawrence,"[Akiva Goldsman, James Lassiter, David Heyman,...","[Mark Protosevich, Akiva Goldsman]",,"[I Am Legend, by, Richard Matheson]","[Will Smith, Alice Braga, Dash Mihok]",...,,,,,,,,,,
1102,https://en.wikipedia.org/wiki/Speed_(1994_film),1994,https://www.imdb.com/title/tt0111257/,Speed,Jan de Bont,Mark Gordon,,,,"[Keanu Reeves, Dennis Hopper, Sandra Bullock, ...",...,,,,,,,,,,


In [11]:
def clean_movie(movie):
    movie = dict(movie) #create a non-destructive copy
    alt_titles = {}
    # combine alternate titles into one list
    for key in ['Also known as','Arabic','Cantonese','Chinese','French',
                'Hangul','Hebrew','Hepburn','Japanese','Literally',
                'Mandarin','McCune-Reischauer','Original title','Polish',
                'Revised Romanization','Romanized','Russian',
                'Simplified','Traditional','Yiddish']:
        if key in movie:
            alt_titles[key] = movie[key]
            movie.pop(key)
    if len(alt_titles) > 0:
        movie['alt_titles'] = alt_titles

    # merge column names
    def change_column_name(old_name, new_name):
        if old_name in movie:
            movie[new_name] = movie.pop(old_name)
    change_column_name('Adaptation by', 'Writer(s)')
    change_column_name('Country of origin', 'Country')
    change_column_name('Directed by', 'Director')
    change_column_name('Distributed by', 'Distributor')
    change_column_name('Edited by', 'Editor(s)')
    change_column_name('Length', 'Running time')
    change_column_name('Original release', 'Release date')
    change_column_name('Music by', 'Composer(s)')
    change_column_name('Produced by', 'Producer(s)')
    change_column_name('Producer', 'Producer(s)')
    change_column_name('Productioncompanies ', 'Production company(s)')
    change_column_name('Productioncompany ', 'Production company(s)')
    change_column_name('Released', 'Release Date')
    change_column_name('Release Date', 'Release date')
    change_column_name('Screen story by', 'Writer(s)')
    change_column_name('Screenplay by', 'Writer(s)')
    change_column_name('Story by', 'Writer(s)')
    change_column_name('Theme music composer', 'Composer(s)')
    change_column_name('Written by', 'Writer(s)')

    return movie

In [12]:
clean_movies = [clean_movie(movie) for movie in wiki_movies]
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

['Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language',
 'McCune–Reischauer',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Recorded',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Venue',
 'Voices of',
 'Writer(s)',
 'alt_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [13]:
clean_movies = [clean_movie(movie) for movie in wiki_movies]
wiki_movies_df = pd.DataFrame(clean_movies)
sorted(wiki_movies_df.columns.tolist())

['Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor',
 'Editor(s)',
 'Executive producer(s)',
 'Followed by',
 'Genre',
 'Label',
 'Language',
 'McCune–Reischauer',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production company(s)',
 'Production location(s)',
 'Recorded',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Venue',
 'Voices of',
 'Writer(s)',
 'alt_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [14]:
wiki_movies_df['imdb_id'] = wiki_movies_df['imdb_link'].str.extract(r'(tt\d{7})')
print(len(wiki_movies_df))
wiki_movies_df.drop_duplicates(subset='imdb_id', inplace=True)
print(len(wiki_movies_df))
wiki_movies_df.head()

7076
7033


Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Narrated by,Cinematography,Release date,Running time,...,Preceded by,Suggested by,alt_titles,Recorded,Venue,Label,Animation by,Color process,McCune–Reischauer,imdb_id
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...","Andrew ""Dice"" Clay",Oliver Wood,"[July 11, 1990, (, 1990-07-11, )]",102 minutes,...,,,,,,,,,,tt0098987
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",,Mark Plummer,"[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",114 minutes,...,,,,,,,,,,tt0098994
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",,Roger Deakins,"[August 10, 1990, (, 1990-08-10, )]",113 minutes,...,,,,,,,,,,tt0099005
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",,Carlo Di Palma,"[December 25, 1990, (, 1990-12-25, )]",106 minutes,...,,,,,,,,,,tt0099012
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",,Russell Boyd,"December 19, 1990",95 minutes,...,,,,,,,,,,tt0099018


In [15]:
[[column,wiki_movies_df[column].isnull().sum()] for column in wiki_movies_df.columns]

[['url', 0],
 ['year', 0],
 ['imdb_link', 0],
 ['title', 1],
 ['Based on', 4852],
 ['Starring', 184],
 ['Narrated by', 6752],
 ['Cinematography', 691],
 ['Release date', 32],
 ['Running time', 139],
 ['Country', 236],
 ['Language', 244],
 ['Budget', 2295],
 ['Box office', 1548],
 ['Director', 0],
 ['Distributor', 357],
 ['Editor(s)', 548],
 ['Composer(s)', 518],
 ['Producer(s)', 202],
 ['Production company(s)', 1678],
 ['Writer(s)', 199],
 ['Genre', 6923],
 ['Original language(s)', 6875],
 ['Original network', 6908],
 ['Executive producer(s)', 6936],
 ['Production location(s)', 6986],
 ['Picture format', 6969],
 ['Audio format', 6972],
 ['Voices of', 7031],
 ['Followed by', 7024],
 ['Created by', 7023],
 ['Preceded by', 7023],
 ['Suggested by', 7032],
 ['alt_titles', 7012],
 ['Recorded', 7031],
 ['Venue', 7032],
 ['Label', 7031],
 ['Animation by', 7031],
 ['Color process', 7032],
 ['McCune–Reischauer', 7031],
 ['imdb_id', 0]]

In [16]:
[column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]

['url',
 'year',
 'imdb_link',
 'title',
 'Based on',
 'Starring',
 'Cinematography',
 'Release date',
 'Running time',
 'Country',
 'Language',
 'Budget',
 'Box office',
 'Director',
 'Distributor',
 'Editor(s)',
 'Composer(s)',
 'Producer(s)',
 'Production company(s)',
 'Writer(s)',
 'imdb_id']

In [17]:
wiki_columns_to_keep = [column for column in wiki_movies_df.columns if wiki_movies_df[column].isnull().sum() < len(wiki_movies_df) * 0.9]
wiki_movies_df = wiki_movies_df[wiki_columns_to_keep]

In [18]:
wiki_movies_df.sample(n=5)

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Release date,Running time,Country,...,Budget,Box office,Director,Distributor,Editor(s),Composer(s),Producer(s),Production company(s),Writer(s),imdb_id
818,https://en.wikipedia.org/wiki/The_Saint_of_For...,1993,https://www.imdb.com/title/tt0108026/,The Saint of Fort Washington,,"[Matt Dillon, Danny Glover, Rick Aviles, Nina ...",Frederick Elmes,"November 17, 1993",99 mins.,United States,...,$10 million,$5.9 million,Tim Hunter,Warner Bros.,Howard E. Smith,James Newton Howard,"[Nessa Hyams, David V. Picker]",,Lyle Kessler,tt0108026
4979,https://en.wikipedia.org/wiki/Quantum_Quest:_A...,2009,https://www.imdb.com/title/tt0312305/,Quantum Quest: A Cassini Space Odyssey,,"[Chris Pine, Amanda Peet, Samuel L. Jackson, H...",Christopher Courtois,"[January 13, 2010, (, 2010-01-13, ), (United S...",45 minutes,United States,...,,,"[Harry Kloor, Daniel St. Pierre]",Jupiter 9 Productions,Dan Gutman,Shawn K. Clement,"[Harry Kloor, Rayna Napali, Helen Pao-Yun Huan...",Jupiter 9 Productions,Harry Kloor,tt0312305
1950,https://en.wikipedia.org/wiki/Keeping_the_Promise,1997,https://www.imdb.com/title/tt0119454/,Keeping the Promise,,"[Keith Carradine, Annette O'Toole, Brendan Fle...",Ron Stannett,"[1997, (, 1997, )]",93 minutes,,...,,,Sheldon Larry,Atlantis Films Limited Production,,,Martin Katz,,Gerald Di Pego,tt0119454
1990,https://en.wikipedia.org/wiki/Miss_Evers%27_Boys,1997,https://www.imdb.com/title/tt0119679/,Miss Evers' Boys,"[David Feldshuh, (play)]","[Alfre Woodard, Laurence Fishburne, Joe Morton]",Donald M. Morgan,"February 22, 1997",118 minutes,United States,...,,,Joseph Sargent,HBO,Michael Brown,Charles Bernstein,"[Derek Kavanagh, Kip Konwiser]","[Anasazi Productions, HBO NYC Productions]",Walter Bernstein,tt0119679
4483,https://en.wikipedia.org/wiki/Mr._Magorium%27s...,2007,https://www.imdb.com/title/tt0457419/,Mr. Magorium's Wonder Emporium,,"[Dustin Hoffman, Natalie Portman, Jason Batema...",Roman Osin,"[November 16, 2007, (, 2007-11-16, )]",95 minutes,"[United States, Canada]",...,$65 million,$69.5 million,Zach Helm,20th Century Fox,Sabrina Plisco,"[Alexandre Desplat, Aaron Zigman]","[Richard N. Gladstein, James Garavente]","[Mandate Pictures, Walden Media, FilmColony]",Zach Helm,tt0457419


In [19]:
box_office = wiki_movies_df['Box office'].dropna()

In [20]:
def is_not_a_string(x):
    return type(x) != str

In [21]:
box_office[box_office.map(is_not_a_string)]

34                           [US$, 4,212,828]
54      [$6,698,361 (, United States, ), [2]]
74                    [$6,488,144, (US), [1]]
126                [US$1,531,489, (domestic)]
130                          [US$, 4,803,039]
                        ...                  
6980               [$99.6, million, [4], [5]]
6994                   [$365.6, million, [1]]
6995                         [$53.8, million]
7015                     [$435, million, [7]]
7048                   [$529.3, million, [4]]
Name: Box office, Length: 135, dtype: object

In [22]:
box_office[box_office.map(lambda x: type(x) != str)]

34                           [US$, 4,212,828]
54      [$6,698,361 (, United States, ), [2]]
74                    [$6,488,144, (US), [1]]
126                [US$1,531,489, (domestic)]
130                          [US$, 4,803,039]
                        ...                  
6980               [$99.6, million, [4], [5]]
6994                   [$365.6, million, [1]]
6995                         [$53.8, million]
7015                     [$435, million, [7]]
7048                   [$529.3, million, [4]]
Name: Box office, Length: 135, dtype: object

In [23]:
import re

In [24]:
form_one = r'\$\s*\d+\.?\d*\s*[mb]illion'
box_office.str.contains(form_one, flags=re.IGNORECASE, na=False).sum()

3826

In [25]:
form_two = r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)'
box_office.str.contains(form_two, flags=re.IGNORECASE, na=False).sum()

1498

In [26]:
matches_form_one = box_office.str.contains(form_one, flags=re.IGNORECASE, na=False)
matches_form_two = box_office.str.contains(form_two, flags=re.IGNORECASE, na=False)

In [27]:
box_office[~matches_form_one & ~matches_form_two]

34                           [US$, 4,212,828]
54      [$6,698,361 (, United States, ), [2]]
74                    [$6,488,144, (US), [1]]
110                        $4.35-4.37 million
126                [US$1,531,489, (domestic)]
                        ...                  
6980               [$99.6, million, [4], [5]]
6994                   [$365.6, million, [1]]
6995                         [$53.8, million]
7015                     [$435, million, [7]]
7048                   [$529.3, million, [4]]
Name: Box office, Length: 161, dtype: object

In [28]:
box_office = box_office.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)

In [29]:
form_one = r'\$\s*\d+\.?\d*\s*[mb]illi?on'

In [32]:
box_office.str.extract(f'({form_one}|{form_two})')

def parse_dollars(s):
    # if s is not a string, return NaN
    if type(s) != str:
        return np.nan

    # if input is of the form $###.# million
    if re.match(r'\$\s*\d+\.?\d*\s*milli?on', s, flags=re.IGNORECASE):

        # remove dollar sign and " million"
        s = re.sub('\$|\s|[a-zA-Z]','', s)

        # convert to float and multiply by a million
        value = float(s) * 10**6

        # return value
        return value

    # if input is of the form $###.# billion
    elif re.match(r'\$\s*\d+\.?\d*\s*billi?on', s, flags=re.IGNORECASE):

        # remove dollar sign and " billion"
        s = re.sub('\$|\s|[a-zA-Z]','', s)

        # convert to float and multiply by a billion
        value = float(s) * 10**9

        # return value
        return value

    # if input is of the form $###,###,###
    elif re.match(r'\$\s*\d{1,3}(?:[,\.]\d{3})+(?!\s[mb]illion)', s, flags=re.IGNORECASE):

        # remove dollar sign and commas
        s = re.sub('\$|,','', s)

        # convert to float
        value = float(s)

        # return value
        return value

    # otherwise, return NaN
    else:
        return np.nan

In [33]:
wiki_movies_df['box_office'] = box_office.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)

In [36]:
wiki_movies_df["box_office"]

0       21400000.0
1        2700000.0
2       57718089.0
3        7331647.0
4        6939946.0
           ...    
7071    41900000.0
7072    76100000.0
7073    38400000.0
7074     5500000.0
7075           NaN
Name: box_office, Length: 7033, dtype: float64

In [37]:
wiki_movies_df.drop('Box office', axis=1, inplace=True)

In [38]:
wiki_movies_df.sample(n=10)

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Cinematography,Release date,Running time,Country,...,Budget,Director,Distributor,Editor(s),Composer(s),Producer(s),Production company(s),Writer(s),imdb_id,box_office
6455,https://en.wikipedia.org/wiki/The_Boy_(2016_film),2016,https://www.imdb.com/title/tt3882082/,The Boy,,"[Lauren Cohan, Rupert Evans]",Daniel Pearl,"[January 22, 2016, (, 2016-01-22, )]",97 minutes,"[United States, [2], China, [2]]",...,$10 million,William Brent Bell,STXfilms,Brian Berdan,Bear McCreary,"[Matt Berenson, Jodyne Herron, Gary Lucchesi, ...","[Lakeshore Entertainment, Huayi Brothers Pictu...",Stacey Menear,tt3882082,64200000.0
4369,https://en.wikipedia.org/wiki/Daddy_Day_Camp,2007,https://www.imdb.com/title/tt0462244/,Daddy Day Camp,"[Characters, by Geoff Rodkey]","[Cuba Gooding Jr., Lochlyn Munro, Richard Gant...",Geno Salvatori,"[August 8, 2007, (, 2007-08-08, )]",89 minutes,United States,...,$6 million,Fred Savage,TriStar Pictures,Michel Aller,Jim Dooley,"[William Sherak, Jason Shuman]","[Revolution Studios, Davis Entertainment Compa...","[Geoff Rodkey, Joel Cohen, Alec Sokolow]",tt0462244,18200000.0
4425,https://en.wikipedia.org/wiki/Highlander:_The_...,2007,https://www.imdb.com/title/tt0299981/,Highlander: The Source,,Adrian Paul,"[Steve Arnold, Dmitrij Gribanov]","[September 15, 2007, (, 2007-09-15, )]","[US domestic version:, 86 minutes, Original ve...",United States,...,$13 million,Brett Leonard,Lionsgate Films,Les Healey,George Kallis,"[Adrian Paul, Peter S. Davis, William N. Panzer]",,Mark Bradley,tt0299981,
3293,https://en.wikipedia.org/wiki/Elephant_(2003_f...,2003,https://www.imdb.com/title/tt0363589/,Elephant,,"[Alex Frost, Eric Deulen, John Robinson]",Harris Savides,"[May 2003, (, 2003-05, ), (, Cannes, ), Octobe...",81 minutes,United States,...,$3 million,Gus Van Sant,"[Fine Line Features, HBO Films]",Gus Van Sant,,"[Diane Keaton, Dany Wolf, JT LeRoy]",Meno Film Company,Gus Van Sant,tt0363589,10000000.0
1347,https://en.wikipedia.org/wiki/The_Nona_Tapes,1995,https://www.imdb.com/title/tt0317951/,The Nona Tapes,,,,"December 12, 1995","[25, :, 03]",,...,,Rocky Schenck,,,,"[Toby ""Flobee"" Wright, and Alice in Chains]",,,tt0317951,
604,https://en.wikipedia.org/wiki/Reservoir_Dogs,1992,https://www.imdb.com/title/tt0105236/,Reservoir Dogs,,"[Harvey Keitel, Tim Roth, Chris Penn, Steve Bu...",Andrzej Sekuła,"[January 21, 1992, (, 1992-01-21, ), (, Sundan...",99 minutes,United States,...,$1.2–1.5 million,Quentin Tarantino,Miramax Films,Sally Menke,,Lawrence Bender,"[PolyGram Filmed Entertainment, Live America I...",Quentin Tarantino,tt0105236,2800000.0
906,https://en.wikipedia.org/wiki/Cabin_Boy,1994,https://www.imdb.com/title/tt0109361/,Cabin Boy,,"[Chris Elliott, Ritch Brinkley, Brian Doyle-Mu...",Steve Yaconelli,"[January 7, 1994, (, 1994-01-07, )]",81 minutes,United States,...,$10 million,Adam Resnick,Buena Vista Pictures,Jon Poll,Steve Bartek,"[Denise Di Novi, Tim Burton]","[Touchstone Pictures, Tim Burton Productions]","[Chris Elliott, Adam Resnick]",tt0109361,3700000.0
3410,https://en.wikipedia.org/wiki/Shattered_Glass_...,2003,https://www.imdb.com/title/tt0323944/,Shattered Glass,"[Shattered Glass, by, H. G. Bissinger]","[Hayden Christensen, Peter Sarsgaard, Chloë Se...",Mandy Walker,"[August 10, 2003, (, 2003-08-10, ), (, TIFF, )...",94 minutes,"[United States, Canada]",...,$6 million,Billy Ray,Lions Gate Films,Jeffrey Ford,Mychael Danna,"[Craig Baumgarten, Adam Merims, Gaye Hirsch, T...","[Cruise/Wagner Productions, Baumgarten Merims ...",Billy Ray,tt0323944,2900000.0
6583,https://en.wikipedia.org/wiki/The_Founder_(film),2016,https://www.imdb.com/title/tt4276820/,The Founder,,"[Michael Keaton, Nick Offerman, John Carroll L...",John Schwartzman,"[December 7, 2016, (, 2016-12-07, ), (, Arclig...",115 minutes,United States,...,$7-25 million,John Lee Hancock,The Weinstein Company,Robert Frazen,Carter Burwell,"[Don Handfield, Karen Lunder, Jeremy Renner, A...","[FilmNation Entertainment, The Combine, Faliro...",Robert Siegel,tt4276820,24100000.0
1985,https://en.wikipedia.org/wiki/Men_in_Black_(19...,1997,https://www.imdb.com/title/tt0119654/,Men in Black,"[The Men in Black, by, Lowell Cunningham]","[Tommy Lee Jones, Will Smith, Linda Fiorentino...",Don Peterman,"[July 2, 1997, (, 1997-07-02, )]",98 minutes,United States,...,$90 million,Barry Sonnenfeld,Sony Pictures Releasing,Jim Miller,Danny Elfman,"[Walter F. Parkes, Laurie MacDonald]","[Columbia Pictures, Amblin Entertainment, Park...",Ed Solomon,tt0119654,589400000.0


In [40]:
budget = wiki_movies_df['Budget'].dropna()

In [41]:
budget = budget.map(lambda x: ' '.join(x) if type(x) == list else x)

In [42]:
budget = budget.str.replace(r'\$.*[-—–](?![a-z])', '$', regex=True)

In [43]:
matches_form_one = budget.str.contains(form_one, flags=re.IGNORECASE, na=False)
matches_form_two = budget.str.contains(form_two, flags=re.IGNORECASE, na=False)
budget[~matches_form_one & ~matches_form_two]

136                         Unknown
204     60 million Norwegian Kroner
478                         Unknown
973             $34 [3] [4] million
1126               $120 [4] million
1226                        Unknown
1278                            HBO
1374                     £6,000,000
1397                     13 million
1480                   £2.8 million
1734                   CAD2,000,000
1913     PHP 85 million (estimated)
1948                    102,888,900
1953                   3,500,000 DM
1973                     ₤2,300,874
2281                     $14 milion
2451                     ₤6,350,000
3144                   € 40 million
3360               $150 [6] million
3418                        $218.32
3802                   £4.2 million
3906                            N/A
3959                    760,000 USD
4470                       19 crore
4641                    £17 million
5034              $$200 [4] million
5055           $155 [2] [3] million
5419                $40 [4] 

In [44]:
budget = budget.str.replace(r'\[\d+\]\s*', '')
budget[~matches_form_one & ~matches_form_two]

  budget = budget.str.replace(r'\[\d+\]\s*', '')


136                         Unknown
204     60 million Norwegian Kroner
478                         Unknown
973                     $34 million
1126                   $120 million
1226                        Unknown
1278                            HBO
1374                     £6,000,000
1397                     13 million
1480                   £2.8 million
1734                   CAD2,000,000
1913     PHP 85 million (estimated)
1948                    102,888,900
1953                   3,500,000 DM
1973                     ₤2,300,874
2281                     $14 milion
2451                     ₤6,350,000
3144                   € 40 million
3360                   $150 million
3418                        $218.32
3802                   £4.2 million
3906                            N/A
3959                    760,000 USD
4470                       19 crore
4641                    £17 million
5034                  $$200 million
5055                   $155 million
5419                    $40 

In [45]:
wiki_movies_df['budget'] = budget.str.extract(f'({form_one}|{form_two})', flags=re.IGNORECASE)[0].apply(parse_dollars)

In [46]:
wiki_movies_df.drop('Budget', axis=1, inplace=True)

In [47]:
release_date = wiki_movies_df['Release date'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)

In [48]:
date_form_one = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s[123]?\d,\s\d{4}'
date_form_two = r'\d{4}.[01]\d.[0123]\d'
date_form_three = r'(?:January|February|March|April|May|June|July|August|September|October|November|December)\s\d{4}'
date_form_four = r'\d{4}'

In [49]:
release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})', flags=re.IGNORECASE)

Unnamed: 0,0
0,"July 11, 1990"
1,"May 17, 1990"
2,"August 10, 1990"
3,"December 25, 1990"
4,"December 19, 1990"
...,...
7071,"December 25, 2018"
7072,"December 11, 2018"
7073,"November 8, 2018"
7074,"August 31, 2018"


In [50]:
wiki_movies_df['release_date'] = pd.to_datetime(release_date.str.extract(f'({date_form_one}|{date_form_two}|{date_form_three}|{date_form_four})')[0], infer_datetime_format=True)

In [51]:
running_time = wiki_movies_df['Running time'].dropna().apply(lambda x: ' '.join(x) if type(x) == list else x)

In [52]:
running_time.str.contains(r'^\d*\s*minutes$', flags=re.IGNORECASE, na=False).sum()

6528

In [53]:
running_time[running_time.str.contains(r'^\d*\s*minutes$', flags=re.IGNORECASE, na=False) != True]

9                                                 102 min
26                                                 93 min
28                                                32 min.
34                                                101 min
35                                                 97 min
                              ...                        
6500       114 minutes [1] 120 minutes (extended edition)
6643                                             104 mins
6709    90 minutes (theatrical) [1] 91 minutes (unrate...
7057    108 minutes (Original cut) 98 minutes (UK cut)...
7075                Variable; 90 minutes for default path
Name: Running time, Length: 366, dtype: object

In [54]:
running_time.str.contains(r'^\d*\s*m', flags=re.IGNORECASE, na=False).sum()

6877

In [55]:
running_time[running_time.str.contains(r'^\d*\s*m', flags=re.IGNORECASE, na=False) != True]

668                     UK:84 min (DVD version) US:86 min
727                         78-102 min (depending on cut)
840                       Varies (79 [3] –84 [1] minutes)
1347                                              25 : 03
1443    United States: 77 minutes Argentina: 94 minute...
1499                                            1hr 35min
1551                                               varies
1774                    Netherlands:96 min, Canada:95 min
1777                                       approx. 14 min
2273                                           1 h 43 min
2993                                               1h 48m
3925                                              4 hours
4425    US domestic version: 86 minutes Original versi...
4967    Theatrical cut: 97 minutes Unrated cut: 107 mi...
5424                    115 [1] /123 [2] /128 [3] minutes
5447                                    1 hour 32 minutes
7075                Variable; 90 minutes for default path
Name: Running 

In [56]:
running_time_extract = running_time.str.extract(r'(\d+)\s*ho?u?r?s?\s*(\d*)|(\d+)\s*m')

In [57]:
running_time_extract = running_time_extract.apply(lambda col: pd.to_numeric(col, errors='coerce')).fillna(0)

In [58]:
wiki_movies_df['running_time'] = running_time_extract.apply(lambda row: row[0]*60 + row[1] if row[2] == 0 else row[2], axis=1)

In [59]:
wiki_movies_df.drop('Running time', axis=1, inplace=True)