In [1]:
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

df = pd.read_csv("imdb_animated_movies.csv")
df.head()

Unnamed: 0,name,rating,runtime,genre,votescore,metacritic,year,votes,gross,budget_est,director
0,9,PG-13,79 min,"Animation, Action, Adventure",7.0,60.0,(I) (2009),140989,$31.74M,"$30,000,000 (estimated)",Shane Acker
1,A Bug's Life,G,95 min,"Animation, Adventure, Comedy",7.2,77.0,(I) (1998),293639,$162.80M,"$120,000,000 (estimated)",John Lasseter
2,A Christmas Carol,PG,96 min,"Animation, Adventure, Comedy",6.8,55.0,(2009),115942,$137.86M,"$200,000,000 (estimated)",Robert Zemeckis
3,A Goofy Movie,G,78 min,"Animation, Adventure, Comedy",6.9,53.0,(1995),55556,$35.35M,"$18,000,000 (estimated)",Kevin Lima
4,A Scanner Darkly,R,100 min,"Animation, Comedy, Crime",7.0,73.0,(2006),112638,$5.50M,"$8,700,000 (estimated)",Richard Linklater


In [2]:
"""
Rating -> All MPAA-relevant ratings
Runtime (minutes) -> Into an integer
Genre -> Each movie has atleast two others besides Animation. Split into two columns.
votescore -> votescore * 10 (to align with metacritic score)
year -> Year number alone and into an integer
votes -> Into an integer
gross -> IMDb gross only shown in american currency. Into an integer
budget -> Note every movie budget is recorded in the same currency (yen, euro, pound, dollar). Into an Integer.
       -> Make two columns. One with original budget and another column with a budget converted to dollar currency
"""

'\nRating -> All MPAA-relevant ratings\nRuntime (minutes) -> Into an integer\nGenre -> Each movie has atleast two others besides Animation. Split into two columns.\nvotescore -> votescore * 10 (to align with metacritic score)\nyear -> Year number alone and into an integer\nvotes -> Into an integer\ngross -> IMDb gross only shown in american currency. Into an integer\nbudget -> Note every movie budget is recorded in the same currency (yen, euro, pound, dollar). Into an Integer.\n       -> Make two columns. One with original budget and another column with a budget converted to dollar currency\n'

In [3]:
df.isnull().any()

name          False
rating         True
runtime       False
genre         False
votescore     False
metacritic     True
year          False
votes         False
gross          True
budget_est    False
director      False
dtype: bool

In [4]:
df.dtypes

name           object
rating         object
runtime        object
genre          object
votescore     float64
metacritic    float64
year           object
votes          object
gross          object
budget_est     object
director       object
dtype: object

In [5]:
def is_curr(s, curr={'-','$','£','¥','€'}):
    return s[0] in curr

In [6]:
df.insert(len(df.columns)-1, 'budget_est_dollar',-1)

In [7]:
# df['budget_est'] = df['budget_est'].apply(lambda x: x[1:] if not is_curr(x))

for x in range(len(df)):
    if not is_curr(df['budget_est'][x]):
        df['budget_est'][x] = df['budget_est'][x][1:]


In [8]:
df['budget_est'] = df['budget_est'].apply(lambda x: x.split(" ")[0])
df.head()

Unnamed: 0,name,rating,runtime,genre,votescore,metacritic,year,votes,gross,budget_est,budget_est_dollar,director
0,9,PG-13,79 min,"Animation, Action, Adventure",7.0,60.0,(I) (2009),140989,$31.74M,"$30,000,000",-1,Shane Acker
1,A Bug's Life,G,95 min,"Animation, Adventure, Comedy",7.2,77.0,(I) (1998),293639,$162.80M,"$120,000,000",-1,John Lasseter
2,A Christmas Carol,PG,96 min,"Animation, Adventure, Comedy",6.8,55.0,(2009),115942,$137.86M,"$200,000,000",-1,Robert Zemeckis
3,A Goofy Movie,G,78 min,"Animation, Adventure, Comedy",6.9,53.0,(1995),55556,$35.35M,"$18,000,000",-1,Kevin Lima
4,A Scanner Darkly,R,100 min,"Animation, Comedy, Crime",7.0,73.0,(2006),112638,$5.50M,"$8,700,000",-1,Richard Linklater


In [9]:
df.insert(len(df.columns)-1, 'orig_bgt_currency',-1)

In [10]:
df['orig_bgt_currency'] = df['budget_est'].apply(lambda x: 'euro' if x[0] == '£' 
                                                 else ('yen' if x[0] == '¥' 
                                                       else ('pound' if x[0] == '€'
                                                             else 'dollar')))

df['budget_est'] = df['budget_est'].apply(lambda x: x[1:]).replace(',','', regex=True).astype(int)

for x in range(len(df)):
    if df['orig_bgt_currency'][x] == 'dollar':
        df['budget_est_dollar'][x] = df['budget_est'][x]
    
    if df['orig_bgt_currency'][x] == 'euro':
        df['budget_est_dollar'][x] = round(df['budget_est'][x] * 1.04)
    
    if df['orig_bgt_currency'][x] == 'yen':
        df['budget_est_dollar'][x] = round(df['budget_est'][x] * 0.0072)
        
    if df['orig_bgt_currency'][x] == 'pound':
        df['budget_est_dollar'][x] = round(df['budget_est'][x] * 1.19)
        
df.drop(['budget_est'], inplace=True, axis=1)

In [11]:
df[df['budget_est_dollar'] == 1].head(3)

Unnamed: 0,name,rating,runtime,genre,votescore,metacritic,year,votes,gross,budget_est_dollar,orig_bgt_currency,director
65,Final Fantasy VII: Advent Children,PG-13,101 min,"Animation, Action, Adventure",7.2,,(2005),56771,,1,dollar,Tetsuya Nomura
99,Hotel Transylvania 4: Transformania,PG,87 min,"Animation, Adventure, Comedy",6.0,46.0,(2022),30799,,1,dollar,Derek Drymon
267,The Many Adventures of Winnie the Pooh,G,74 min,"Animation, Adventure, Comedy",7.5,,(1977),36892,,1,dollar,John Lounsbery


In [12]:
median_budget_dollar = round(df['budget_est_dollar'].median())
df['budget_est_dollar'] = df['budget_est_dollar'].apply(lambda x: median_budget_dollar if x == 1 else x)

In [13]:
df['gross'].astype(str)

0       $31.74M
1      $162.80M
2      $137.86M
3       $35.35M
4        $5.50M
         ...   
316         nan
317    $189.42M
318      $0.99M
319    $341.27M
320         nan
Name: gross, Length: 321, dtype: object

In [14]:
df['gross'].fillna('$0M', inplace=True)
df['gross'] = df['gross'].apply(lambda x: x[1:-1]).astype(float)
df['gross'] = df['gross'].apply(lambda y: round(y * 1000000))

df = df.rename(columns={"gross": "gross_dollar"})
df['gross_dollar'].head()

0     31740000
1    162800000
2    137860000
3     35350000
4      5500000
Name: gross_dollar, dtype: int64

In [15]:
df.insert(10,'profit_dollar',0)
for x in range(len(df)):
    if df['gross_dollar'][x] == 0:
        df['profit_dollar'][x] = 0
    else:
        df['profit_dollar'][x] = df['gross_dollar'][x] - df['budget_est_dollar'][x]

In [16]:
df.head(100)

Unnamed: 0,name,rating,runtime,genre,votescore,metacritic,year,votes,gross_dollar,budget_est_dollar,profit_dollar,orig_bgt_currency,director
0,9,PG-13,79 min,"Animation, Action, Adventure",7.0,60.0,(I) (2009),140989,31740000,30000000,1740000,dollar,Shane Acker
1,A Bug's Life,G,95 min,"Animation, Adventure, Comedy",7.2,77.0,(I) (1998),293639,162800000,120000000,42800000,dollar,John Lasseter
2,A Christmas Carol,PG,96 min,"Animation, Adventure, Comedy",6.8,55.0,(2009),115942,137860000,200000000,-62140000,dollar,Robert Zemeckis
3,A Goofy Movie,G,78 min,"Animation, Adventure, Comedy",6.9,53.0,(1995),55556,35350000,18000000,17350000,dollar,Kevin Lima
4,A Scanner Darkly,R,100 min,"Animation, Comedy, Crime",7.0,73.0,(2006),112638,5500000,8700000,-3200000,dollar,Richard Linklater
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Grave of the Fireflies,Not Rated,89 min,"Animation, Drama, War",8.5,94.0,(1988),276361,0,3700000,0,dollar,Isao Takahata
96,Hotel Transylvania,PG,91 min,"Animation, Adventure, Comedy",7.0,47.0,(2012),256734,148310000,85000000,63310000,dollar,Genndy Tartakovsky
97,Hotel Transylvania 2,PG,89 min,"Animation, Adventure, Comedy",6.6,44.0,(2015),128626,169700000,80000000,89700000,dollar,Genndy Tartakovsky
98,Hotel Transylvania 3: Summer Vacation,PG,97 min,"Animation, Adventure, Comedy",6.3,54.0,(2018),74853,167510000,80000000,87510000,dollar,Genndy Tartakovsky


In [17]:
df['votes'] = df['votes'].replace(',','', regex=True).astype(int)

In [18]:
df['year'].astype(str)

0      (I) (2009)
1      (I) (1998)
2          (2009)
3          (1995)
4          (2006)
          ...    
316        (2020)
317        (2012)
318        (1968)
319        (2016)
320        (2012)
Name: year, Length: 321, dtype: object

In [19]:
df['year'] = df['year'].apply(lambda x: re.sub('\D', '', x).strip()).astype(int)

In [20]:
year_column = df.pop("year")
# insert column with insert(location, column_name, column_value)

df.insert(2, "year", year_column)

In [21]:
df['votescore'].astype(float)

0      7.0
1      7.2
2      6.8
3      6.9
4      7.0
      ... 
316    8.0
317    7.7
318    7.4
319    8.0
320    8.1
Name: votescore, Length: 321, dtype: float64

In [22]:
df['votescore'] = round(df['votescore']  * 10,1)

In [23]:
df.head(3)

Unnamed: 0,name,rating,year,runtime,genre,votescore,metacritic,votes,gross_dollar,budget_est_dollar,profit_dollar,orig_bgt_currency,director
0,9,PG-13,2009,79 min,"Animation, Action, Adventure",70.0,60.0,140989,31740000,30000000,1740000,dollar,Shane Acker
1,A Bug's Life,G,1998,95 min,"Animation, Adventure, Comedy",72.0,77.0,293639,162800000,120000000,42800000,dollar,John Lasseter
2,A Christmas Carol,PG,2009,96 min,"Animation, Adventure, Comedy",68.0,55.0,115942,137860000,200000000,-62140000,dollar,Robert Zemeckis


In [24]:
df['metacritic'].fillna(df['votescore'],inplace=True)
df.head()

Unnamed: 0,name,rating,year,runtime,genre,votescore,metacritic,votes,gross_dollar,budget_est_dollar,profit_dollar,orig_bgt_currency,director
0,9,PG-13,2009,79 min,"Animation, Action, Adventure",70.0,60.0,140989,31740000,30000000,1740000,dollar,Shane Acker
1,A Bug's Life,G,1998,95 min,"Animation, Adventure, Comedy",72.0,77.0,293639,162800000,120000000,42800000,dollar,John Lasseter
2,A Christmas Carol,PG,2009,96 min,"Animation, Adventure, Comedy",68.0,55.0,115942,137860000,200000000,-62140000,dollar,Robert Zemeckis
3,A Goofy Movie,G,1995,78 min,"Animation, Adventure, Comedy",69.0,53.0,55556,35350000,18000000,17350000,dollar,Kevin Lima
4,A Scanner Darkly,R,2006,100 min,"Animation, Comedy, Crime",70.0,73.0,112638,5500000,8700000,-3200000,dollar,Richard Linklater


In [25]:
df.insert(6,'avg_rating_score', (df.votescore+df.metacritic)/2)

In [26]:
df

Unnamed: 0,name,rating,year,runtime,genre,votescore,avg_rating_score,metacritic,votes,gross_dollar,budget_est_dollar,profit_dollar,orig_bgt_currency,director
0,9,PG-13,2009,79 min,"Animation, Action, Adventure",70.0,65.0,60.0,140989,31740000,30000000,1740000,dollar,Shane Acker
1,A Bug's Life,G,1998,95 min,"Animation, Adventure, Comedy",72.0,74.5,77.0,293639,162800000,120000000,42800000,dollar,John Lasseter
2,A Christmas Carol,PG,2009,96 min,"Animation, Adventure, Comedy",68.0,61.5,55.0,115942,137860000,200000000,-62140000,dollar,Robert Zemeckis
3,A Goofy Movie,G,1995,78 min,"Animation, Adventure, Comedy",69.0,61.0,53.0,55556,35350000,18000000,17350000,dollar,Kevin Lima
4,A Scanner Darkly,R,2006,100 min,"Animation, Comedy, Crime",70.0,71.5,73.0,112638,5500000,8700000,-3200000,dollar,Richard Linklater
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,Wolfwalkers,PG,2020,103 min,"Animation, Adventure, Family",80.0,83.5,87.0,32505,0,12000000,0,dollar,Tomm Moore
317,Wreck-It Ralph,PG,2012,101 min,"Animation, Adventure, Comedy",77.0,74.5,72.0,422751,189420000,165000000,24420000,dollar,Rich Moore
318,Yellow Submarine,G,1968,85 min,"Animation, Adventure, Comedy",74.0,76.5,79.0,26576,990000,260000,730000,euro,George Dunning
319,Zootopia,PG,2016,108 min,"Animation, Adventure, Comedy",80.0,79.0,78.0,497059,341270000,150000000,191270000,dollar,Byron Howard


In [27]:
df.rating.unique()

array(['PG-13', 'G', 'PG', 'R', 'TV-PG', 'TV-14', 'Not Rated', 'TV-MA',
       nan, 'Approved'], dtype=object)

In [28]:
# Upon further research, movies with the 'Approved' rating indicates that they have an MPAA rating of 'G'
df['rating'] = df['rating'].apply(lambda x: 'G' if x == "Approved" else x)
# df[df['name'] == 'Snow White and the Seven Dwarfs']

In [29]:
df['rating'].fillna('Not Rated',inplace=True)
df.rating.unique()

array(['PG-13', 'G', 'PG', 'R', 'TV-PG', 'TV-14', 'Not Rated', 'TV-MA'],
      dtype=object)

In [30]:
df['genre'] = df['genre'].replace("Animation, ","", regex=True)

In [31]:
df[['genre_a','genre_b']] = df['genre'].str.split(', ',expand=True)

In [32]:
column_to_move1 = df.pop("genre_a")
column_to_move2 = df.pop("genre_b")
# insert column with insert(location, column_name, column_value)

df.insert(3, "genre_a", column_to_move1)
df.insert(4, "genre_b", column_to_move2)
df.pop("genre")

0      Action, Adventure
1      Adventure, Comedy
2      Adventure, Comedy
3      Adventure, Comedy
4          Comedy, Crime
             ...        
316    Adventure, Family
317    Adventure, Comedy
318    Adventure, Comedy
319    Adventure, Comedy
320        Drama, Family
Name: genre, Length: 321, dtype: object

In [33]:
df = df.rename(columns={"runtime": "runtime_minutes"})
df.head()

Unnamed: 0,name,rating,year,genre_a,genre_b,runtime_minutes,votescore,avg_rating_score,metacritic,votes,gross_dollar,budget_est_dollar,profit_dollar,orig_bgt_currency,director
0,9,PG-13,2009,Action,Adventure,79 min,70.0,65.0,60.0,140989,31740000,30000000,1740000,dollar,Shane Acker
1,A Bug's Life,G,1998,Adventure,Comedy,95 min,72.0,74.5,77.0,293639,162800000,120000000,42800000,dollar,John Lasseter
2,A Christmas Carol,PG,2009,Adventure,Comedy,96 min,68.0,61.5,55.0,115942,137860000,200000000,-62140000,dollar,Robert Zemeckis
3,A Goofy Movie,G,1995,Adventure,Comedy,78 min,69.0,61.0,53.0,55556,35350000,18000000,17350000,dollar,Kevin Lima
4,A Scanner Darkly,R,2006,Comedy,Crime,100 min,70.0,71.5,73.0,112638,5500000,8700000,-3200000,dollar,Richard Linklater


In [34]:
df['runtime_minutes'] = df['runtime_minutes'].replace('min','', regex=True).astype(int)
df['runtime_minutes']

0       79
1       95
2       96
3       78
4      100
      ... 
316    103
317    101
318     85
319    108
320    117
Name: runtime_minutes, Length: 321, dtype: int64

In [35]:
df.insert(13,'avg_dollar_budget_per_minute', round(df.budget_est_dollar/df.runtime_minutes,2))

In [36]:
df.tail(5)

Unnamed: 0,name,rating,year,genre_a,genre_b,runtime_minutes,votescore,avg_rating_score,metacritic,votes,gross_dollar,budget_est_dollar,profit_dollar,avg_dollar_budget_per_minute,orig_bgt_currency,director
316,Wolfwalkers,PG,2020,Adventure,Family,103,80.0,83.5,87.0,32505,0,12000000,0,116504.85,dollar,Tomm Moore
317,Wreck-It Ralph,PG,2012,Adventure,Comedy,101,77.0,74.5,72.0,422751,189420000,165000000,24420000,1633663.37,dollar,Rich Moore
318,Yellow Submarine,G,1968,Adventure,Comedy,85,74.0,76.5,79.0,26576,990000,260000,730000,3058.82,euro,George Dunning
319,Zootopia,PG,2016,Adventure,Comedy,108,80.0,79.0,78.0,497059,341270000,150000000,191270000,1388888.89,dollar,Byron Howard
320,Wolf Children,PG,2012,Drama,Family,117,81.0,76.0,71.0,45231,0,54999779,0,470083.58,dollar,Mamoru Hosoda


In [37]:
df.isnull().any()

name                            False
rating                          False
year                            False
genre_a                         False
genre_b                          True
runtime_minutes                 False
votescore                       False
avg_rating_score                False
metacritic                      False
votes                           False
gross_dollar                    False
budget_est_dollar               False
profit_dollar                   False
avg_dollar_budget_per_minute    False
orig_bgt_currency               False
director                        False
dtype: bool

In [38]:
df.dtypes

name                             object
rating                           object
year                              int64
genre_a                          object
genre_b                          object
runtime_minutes                   int64
votescore                       float64
avg_rating_score                float64
metacritic                      float64
votes                             int64
gross_dollar                      int64
budget_est_dollar                 int64
profit_dollar                     int64
avg_dollar_budget_per_minute    float64
orig_bgt_currency                object
director                         object
dtype: object

In [39]:
df

Unnamed: 0,name,rating,year,genre_a,genre_b,runtime_minutes,votescore,avg_rating_score,metacritic,votes,gross_dollar,budget_est_dollar,profit_dollar,avg_dollar_budget_per_minute,orig_bgt_currency,director
0,9,PG-13,2009,Action,Adventure,79,70.0,65.0,60.0,140989,31740000,30000000,1740000,379746.84,dollar,Shane Acker
1,A Bug's Life,G,1998,Adventure,Comedy,95,72.0,74.5,77.0,293639,162800000,120000000,42800000,1263157.89,dollar,John Lasseter
2,A Christmas Carol,PG,2009,Adventure,Comedy,96,68.0,61.5,55.0,115942,137860000,200000000,-62140000,2083333.33,dollar,Robert Zemeckis
3,A Goofy Movie,G,1995,Adventure,Comedy,78,69.0,61.0,53.0,55556,35350000,18000000,17350000,230769.23,dollar,Kevin Lima
4,A Scanner Darkly,R,2006,Comedy,Crime,100,70.0,71.5,73.0,112638,5500000,8700000,-3200000,87000.00,dollar,Richard Linklater
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
316,Wolfwalkers,PG,2020,Adventure,Family,103,80.0,83.5,87.0,32505,0,12000000,0,116504.85,dollar,Tomm Moore
317,Wreck-It Ralph,PG,2012,Adventure,Comedy,101,77.0,74.5,72.0,422751,189420000,165000000,24420000,1633663.37,dollar,Rich Moore
318,Yellow Submarine,G,1968,Adventure,Comedy,85,74.0,76.5,79.0,26576,990000,260000,730000,3058.82,euro,George Dunning
319,Zootopia,PG,2016,Adventure,Comedy,108,80.0,79.0,78.0,497059,341270000,150000000,191270000,1388888.89,dollar,Byron Howard
