In [1]:
import pandas as pd
import re
import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)

df = pd.read_csv("imdb_animated_movies.csv")
df.head()

Unnamed: 0,name,rating,runtime,genres,story_desc,votescore,metacritic,production_companies,year,aspect_ratio,country,languages,votes,worldwide_gross,na_gross,opening_weekend,budget_est,director,writers
0,9,PG-13,79 min,"Animation, Action, Adventure, Drama, Fantasy, ...",A rag doll that awakens in a postapocalyptic f...,7.0,60.0,"Focus Features, Relativity Media, Arc Productions",(I) (2009),1.85 : 1,"United States, Canada, Luxembourg",English,141553,"$48,428,063","$31,749,894","$10,740,446","$30,000,000 (estimated)","Shane Acker,","Pamela Pettler(screenplay by), Shane Acker(sto..."
1,A Bug's Life,G,95 min,"Animation, Adventure, Comedy, Family, Fantasy","A misfit ant, looking for ""warriors"" to save h...",7.2,77.0,"Pixar Animation Studios, Walt Disney Pictures",(I) (1998),2.39 : 1,United States,English,295606,"$363,258,859","$162,798,565","$291,121","$120,000,000 (estimated)","John Lasseter, Andrew Stanton(co-director),","John Lasseter(original story by), Andrew Stant..."
2,A Christmas Carol,PG,96 min,"Animation, Adventure, Comedy, Drama, Family, F...",An animated retelling of Charles Dickens' clas...,6.8,55.0,"Walt Disney Pictures, ImageMovers Digital, Ima...",(2009),2.39 : 1,United States,English,120272,"$325,286,646","$137,855,863","$30,051,075","$200,000,000 (estimated)","Robert Zemeckis,",Charles Dickens(based on the classic story by)...
3,A Goofy Movie,G,78 min,"Animation, Adventure, Comedy, Family, Musical,...",When Max makes a preposterous promise to a gir...,6.9,53.0,"Walt Disney Pictures, Disney Television Animat...",(1995),1.85 : 1,"United States, Australia, France, Canada",English,56379,"$35,348,597","$35,348,597","$6,129,557","$18,000,000 (estimated)","Kevin Lima,","Jymn Magon(story by), Chris Matheson(screenpla..."
4,A Scanner Darkly,R,100 min,"Animation, Comedy, Crime, Drama, Mystery, Sci-...",An undercover cop in a not-too-distant future ...,7.0,73.0,"Warner Independent Pictures (WIP), Thousand Wo...",(2006),1.85 : 1,United States,English,113012,"$7,659,918","$5,501,616","$391,672","$8,700,000 (estimated)","Richard Linklater,","Philip K. Dick(novel ""A Scanner Darkly""), Rich..."


In [2]:
df.columns

Index(['name', 'rating', 'runtime', 'genres', 'story_desc', 'votescore',
       'metacritic', 'production_companies', 'year', 'aspect_ratio', 'country',
       'languages', 'votes', 'worldwide_gross', 'na_gross', 'opening_weekend',
       'budget_est', 'director', 'writers'],
      dtype='object')

In [3]:
len(df.columns)

19

## Primary columns to be modified/used
* Rating -- All MPAA-relevant ratings
* Runtime (minutes) -- Length of movie screening
* Story description -- IMDb's description of the movie
* Genres -- Any genre the movie is associated with besides Animation. May split into multiple columns.


* votescore -- Score of movie based on user votes -> (Will be multiplied by 10 to align with metacritic score)
* *metacritic -- Movie's Metacritic score 


* production companies -- Companies involved in making of the movie. Will be counted
* year -> Year of movie's official release. Will be set to integer.
* votes -> Amount of user votes affecting votescore of a movie. Will be set to an integer



* *opening weekend -- Gross in the first week
* *na gross -- North American gross (aka domestic gross)
* *worldwide gross -- IMDb gross coverted to American currency. Into an integer



* *budget -- Note every movie budget is recorded in the same currency (yen, euro, pound, dollar). Into an Integer.
* May make two columns. One with original budget and another column with a budget converted to dollar currency



* *aspect ratio -- Ratio of width and height of the movie shown.
* countries -- Any countries involved with the making of the movie. Will be counted.
* languages -- main/official languages translated for screenings/official releases. Will be counted.

* director, writers -> Directors and writers involved in the making of the movie. Each will be counted.



*Not every movie has a value for each column (may leave as -1 or n/a)

In [4]:
df.isnull().any()

name                    False
rating                   True
runtime                 False
genres                  False
story_desc               True
votescore               False
metacritic               True
production_companies     True
year                    False
aspect_ratio             True
country                 False
languages               False
votes                   False
worldwide_gross         False
na_gross                False
opening_weekend         False
budget_est              False
director                False
writers                  True
dtype: bool

### Handling null values for each column
* Rating -> "Not Rated"
* Metacritic -> -1
* Any column involving currency -> -1
* Production Companies -> "Unnamed"
* Aspect Ratio -> N/A
* Writers -> "Unspecified"

In [5]:
df.dtypes

name                     object
rating                   object
runtime                  object
genres                   object
story_desc               object
votescore               float64
metacritic              float64
production_companies     object
year                     object
aspect_ratio             object
country                  object
languages                object
votes                    object
worldwide_gross          object
na_gross                 object
opening_weekend          object
budget_est               object
director                 object
writers                  object
dtype: object

### Marking currencies
IMDb typically uses the country a movie originates from as a reference for the currency used when specifying the budget's estimate. The goal here is to detect as many currencies found as possible so they can be used as another feature for the final dataframe.

In [6]:
df.insert(len(df.columns)-1, 'budget_est_usd',-1)
df.insert(len(df.columns)-1, 'orig_bgt_currency',-1)

In [7]:
df.iloc[250]

name                                                                Mulan
rating                                                                  G
runtime                                                            88 min
genres                  Animation, Adventure, Comedy, Family, Fantasy,...
story_desc              To save her father from death in the army, a y...
votescore                                                             7.6
metacritic                                                           71.0
production_companies    Walt Disney Animation Studios, Walt Disney Fea...
year                                                               (1998)
aspect_ratio                                                          NaN
country                                                     United States
languages                                               English, Mandarin
votes                                                             290,901
worldwide_gross                       

In [8]:
print(df['budget_est'][250][:3])
print(df['budget_est'][226])
print(df['budget_est'][292])

$90
$145,000,000 (estimated)
$145,000,000 (estimated)


In [9]:
df['budget_est'] = df['budget_est'].apply(lambda x: x.split(" (")[0]).apply(lambda y: y.replace(" ",""))
# df['budget_est'] = df['budget_est'].apply(lambda x: x.replace(" ",""))

In [10]:
df['budget_est'][292]

'$145,000,000'

In [11]:
df['budget_est'][365]

'$125,000,000'

In [12]:
df['orig_bgt_currency'] = df['budget_est'].apply(lambda x: 'euro' if x[0] == '£' 
                                                 else ('yen' if x[0] == '¥' 
                                                       else ('pound' if x[0] == '€'
                                                             else ('austrailian dollar' if x[:2] == 'A$'
                                                                 else ('chinese yuan' if x[:3] == "CN¥"
                                                                       else ('danish krone' if x[:3] == "DKK"
                                                                             else ('poland zloty' if x[:3] == "PLN"
                                                                                   else 'usd')))))))

### Removing currency mark from budget_est. Will be relocated to new column

In [13]:
df['budget_est'] = df['budget_est'].apply(lambda x: x[3:] if x[:3] == "CN¥"
                                          else (x[3:] if x[:3] == 'DKK'
                                                else (x[3:] if x[:3] == 'PLN'
                                                    else (x[2:] if x[:2] == 'A$'
                                                          else (x[1:] if x != '-1'
                                                                else -1))))).replace(',','', regex=True).astype(int)

### Budget conversion to USD 

In [14]:
for x in range(len(df)):
    if df['orig_bgt_currency'][x] == 'dollar':
        df['budget_est_usd'][x] = df['budget_est'][x]
    
    if df['orig_bgt_currency'][x] == 'pound':
        df['budget_est_usd'][x] = round(df['budget_est'][x] * 1.19)
    
    if df['orig_bgt_currency'][x] == 'euro':
        df['budget_est_usd'][x] = round(df['budget_est'][x] * 1.04)

    if df['orig_bgt_currency'][x] == 'austrailian dollar':
        df['budget_est_usd'][x] = round(df['budget_est'][x] * 0.67)
      
    if df['orig_bgt_currency'][x] == 'poland zloty':
        df['budget_est_usd'][x] = round(df['budget_est'][x] * 0.22)
    
    if df['orig_bgt_currency'][x] == 'chinese yuan' or 'danish krone':
        df['budget_est_usd'][x] = round(df['budget_est'][x] * 0.14)
        
    if df['orig_bgt_currency'][x] == 'yen':
        df['budget_est_usd'][x] = round(df['budget_est'][x] * 0.0072)
        
df.drop(['budget_est'], inplace=True, axis=1)

In [15]:
df['budget_est_usd'] = df['budget_est_usd'].apply(lambda x: -1 if x == 0 else x)

In [16]:
df.head(3)

Unnamed: 0,name,rating,runtime,genres,story_desc,votescore,metacritic,production_companies,year,aspect_ratio,country,languages,votes,worldwide_gross,na_gross,opening_weekend,director,budget_est_usd,orig_bgt_currency,writers
0,9,PG-13,79 min,"Animation, Action, Adventure, Drama, Fantasy, ...",A rag doll that awakens in a postapocalyptic f...,7.0,60.0,"Focus Features, Relativity Media, Arc Productions",(I) (2009),1.85 : 1,"United States, Canada, Luxembourg",English,141553,"$48,428,063","$31,749,894","$10,740,446","Shane Acker,",4200000,usd,"Pamela Pettler(screenplay by), Shane Acker(sto..."
1,A Bug's Life,G,95 min,"Animation, Adventure, Comedy, Family, Fantasy","A misfit ant, looking for ""warriors"" to save h...",7.2,77.0,"Pixar Animation Studios, Walt Disney Pictures",(I) (1998),2.39 : 1,United States,English,295606,"$363,258,859","$162,798,565","$291,121","John Lasseter, Andrew Stanton(co-director),",16800000,usd,"John Lasseter(original story by), Andrew Stant..."
2,A Christmas Carol,PG,96 min,"Animation, Adventure, Comedy, Drama, Family, F...",An animated retelling of Charles Dickens' clas...,6.8,55.0,"Walt Disney Pictures, ImageMovers Digital, Ima...",(2009),2.39 : 1,United States,English,120272,"$325,286,646","$137,855,863","$30,051,075","Robert Zemeckis,",28000000,usd,Charles Dickens(based on the classic story by)...


### Converting all monetary values to integer

In [17]:
def convert_int(s):
    return s.apply(lambda x: re.sub(',','',x[1:-1]) if not(x == -1) else -1).replace('','-1').astype(int)

In [18]:
df['worldwide_gross'] = convert_int(df['worldwide_gross'])

In [19]:
df = df.rename(columns={"worldwide_gross": "worldwide_gross_usd"})
df['worldwide_gross_usd'].head()

0     4842806
1    36325885
2    32528664
3     3534859
4      765991
Name: worldwide_gross_usd, dtype: int64

In [20]:
df['na_gross'] = convert_int(df['na_gross'])

In [21]:
df = df.rename(columns={"na_gross": "na_gross_usd"})

In [22]:
df['opening_weekend'] = convert_int(df['opening_weekend'])

In [23]:
df = df.rename(columns={"opening_weekend": "opening_weekend_usd"})

In [24]:
df.na_gross_usd

0       3174989
1      16279856
2      13785586
3       3534859
4        550161
         ...   
469    18942288
470       99230
471    10024601
472    34126824
473          -1
Name: na_gross_usd, Length: 474, dtype: int64

### Getting profit data

In [25]:
df.insert(len(df.columns)-1, 'profit_usd', -9)
for x in range(len(df)):
    if (df['budget_est_usd'][x] > -1) and (df['worldwide_gross_usd'][x] > -1):
        df['profit_usd'][x] = df['worldwide_gross_usd'][x] - df['budget_est_usd'][x]
    else:
        df['profit_usd'][x] = -1

In [26]:
type(df.profit_usd)

pandas.core.series.Series

In [27]:
df['profit_usd'].tail(3)

471     9150937
472    83253368
473          -1
Name: profit_usd, dtype: int64

### Converting columns to appropriate data types

In [28]:
df['votes'] = df['votes'].replace(',','', regex=True).astype(int)

In [29]:
df['year'].astype(str)

0      (I) (2009)
1      (I) (1998)
2          (2009)
3          (1995)
4          (2006)
          ...    
469        (2012)
470        (1968)
471        (2010)
472        (2016)
473        (2012)
Name: year, Length: 474, dtype: object

In [30]:
df['year'] = df['year'].apply(lambda x: re.sub('\D', '', x).strip()).astype(int)

In [31]:
year_column = df.pop("year")
# insert column with insert(location, column_name, column_value)

df.insert(2, "year", year_column)

In [32]:
df['year'].head(10)

0    2009
1    1998
2    2009
3    1995
4    2006
5    2019
6    2019
7    1988
8    1992
9    1951
Name: year, dtype: int64

In [33]:
df['votescore'].astype(float)

0      7.0
1      7.2
2      6.8
3      6.9
4      7.0
      ... 
469    7.7
470    7.4
471    4.6
472    8.0
473    8.1
Name: votescore, Length: 474, dtype: float64

In [34]:
df['votescore'] = round(df['votescore']  * 10,1)

In [35]:
df['votescore'].head(3)

0    70.0
1    72.0
2    68.0
Name: votescore, dtype: float64

In [36]:
df['metacritic'].fillna(-1,inplace=True)
df['metacritic'].head(3)

0    60.0
1    77.0
2    55.0
Name: metacritic, dtype: float64

In [37]:
df.insert(6,'avg_rating_score', 0)

In [38]:
for x in range(len(df)):
    if df['metacritic'][x] != -1:
        df['avg_rating_score'][x] = (df['votescore'][x] + df['metacritic'][x]) / 2
    else:
        df['avg_rating_score'][x] = df['votescore'][x]

In [39]:
df['avg_rating_score']

0      65.0
1      74.5
2      61.5
3      61.0
4      71.5
       ... 
469    74.5
470    76.5
471    40.5
472    79.0
473    76.0
Name: avg_rating_score, Length: 474, dtype: float64

### Production companies

In [40]:
df['production_companies'].fillna('Unnamed',inplace=True)

df.insert(9, "prod_company_count", 0)
df['prod_company_count'] =  df['production_companies'].apply(lambda x: x.count(",") + 1)

In [41]:
df[{'production_companies','prod_company_count'}].head(5)

Unnamed: 0,production_companies,prod_company_count
0,"Focus Features, Relativity Media, Arc Productions",3
1,"Pixar Animation Studios, Walt Disney Pictures",2
2,"Walt Disney Pictures, ImageMovers Digital, Ima...",3
3,"Walt Disney Pictures, Disney Television Animat...",3
4,"Warner Independent Pictures (WIP), Thousand Wo...",3


### Countries and Languages

In [42]:
df.insert(13,'country_count', 0)
df.insert(15,'language_count', 0)

for x in range(len(df)):
    df['country_count'][x] = df['country'][x].count(",")+1
    df['language_count'][x] = df['languages'][x].count(",")+1

In [43]:
df = df.rename(columns={"country": "production_countries"})

In [44]:
df[{'production_countries','country_count','languages','language_count','name'}]

Unnamed: 0,production_countries,languages,name,country_count,language_count
0,"United States, Canada, Luxembourg",English,9,3,1
1,United States,English,A Bug's Life,1,1
2,United States,English,A Christmas Carol,1,1
3,"United States, Australia, France, Canada",English,A Goofy Movie,4,1
4,United States,English,A Scanner Darkly,1,1
...,...,...,...,...,...
469,United States,English,Wreck-It Ralph,1,1
470,"United Kingdom, United States",English,Yellow Submarine,2,1
471,"United States, New Zealand",English,Yogi Bear,2,1
472,United States,English,Zootopia,1,1


### Aspect Ratio

In [45]:
df['aspect_ratio'].fillna("Unspecified",inplace=True)

### MPAA Ratings

In [46]:
df.rating.unique()

array(['PG-13', 'G', 'PG', 'R', 'Approved', 'Not Rated', 'TV-Y7-FV', nan,
       'TV-14', 'TV-MA', 'TV-PG', '18+'], dtype=object)

In [47]:
df[df['rating'] == "Approved"]

Unnamed: 0,name,rating,year,runtime,genres,story_desc,avg_rating_score,votescore,metacritic,prod_company_count,production_companies,aspect_ratio,production_countries,country_count,languages,language_count,votes,worldwide_gross_usd,na_gross_usd,opening_weekend_usd,director,budget_est_usd,orig_bgt_currency,profit_usd,writers
20,Animal Farm,Approved,1954,72 min,"Animation, Drama",A successful farmyard revolution by the reside...,72.0,72.0,-1.0,2,"Halas & Batchelor, Central Intelligence Agency...",Unspecified,"United Kingdom, United States",2,English,1,17679,-1,-1,-1,"Joy Batchelor, John Halas,",-1,usd,-1,"George Orwell(based on a story by), Lothar Wol..."
323,Snow White and the Seven Dwarfs,Approved,1937,83 min,"Animation, Adventure, Family, Fantasy, Musical...",Exiled into the dangerous forest by her wicked...,86.0,76.0,96.0,1,Walt Disney Animation Studios,4:3,United States,1,English,1,201683,18492548,18492548,601791,"William Cottrell(sequence director), David Han...",209860,usd,18282688,"Jacob Grimm(fairy tales), Wilhelm Grimm(fairy ..."
355,The Adventures of Ichabod and Mr. Toad,Approved,1949,68 min,"Animation, Comedy, Family, Fantasy, Horror","An animated adaptation of ""The Wind in the Wil...",71.5,69.0,74.0,1,Walt Disney Animation Studios,1.37 : 1,United States,1,English,1,16307,-1,-1,-1,"James Algar, Clyde Geronimi, Jack Kinney,",-1,usd,-1,"Washington Irving(story ""The Legend of Sleepy ..."
428,The Three Caballeros,Approved,1944,71 min,"Animation, Comedy, Family, Fantasy, Musical","Donald receives his birthday gifts, which incl...",74.0,63.0,85.0,1,Walt Disney Animation Studios,1.37 : 1,United States,1,"English, Spanish, Portuguese",3,14456,-1,-1,-1,"Norman Ferguson, Clyde Geronimi(sequence direc...",-1,usd,-1,"Homer Brightman(story), Ernest Terrazas(story)..."


In [48]:
df['rating'].fillna('Not Rated',inplace=True)
df.rating.unique()

array(['PG-13', 'G', 'PG', 'R', 'Approved', 'Not Rated', 'TV-Y7-FV',
       'TV-14', 'TV-MA', 'TV-PG', '18+'], dtype=object)

### Genres

In [49]:
df['genres'] = df['genres'].replace("Animation, ","", regex=True)

In [50]:
df['genres']

0      Action, Adventure, Drama, Fantasy, Sci-Fi, Thr...
1                     Adventure, Comedy, Family, Fantasy
2              Adventure, Comedy, Drama, Family, Fantasy
3            Adventure, Comedy, Family, Musical, Romance
4        Comedy, Crime, Drama, Mystery, Sci-Fi, Thriller
                             ...                        
469           Adventure, Comedy, Family, Fantasy, Sci-Fi
470                  Adventure, Comedy, Fantasy, Musical
471                            Adventure, Comedy, Family
472            Adventure, Comedy, Crime, Family, Mystery
473                               Drama, Family, Fantasy
Name: genres, Length: 474, dtype: object

In [51]:
df.insert(5, "genre_count", 0)

for x in range(len(df)):
    # Genres not including Animation
    df['genre_count'][x] = df['genres'][x].count(",")+1
    
df['genre_count']

0      6
1      4
2      5
3      5
4      6
      ..
469    5
470    4
471    3
472    5
473    3
Name: genre_count, Length: 474, dtype: int64

### Story description

In [52]:
df['story_desc'].fillna("-9",inplace=True)

In [53]:
df.iloc[224]

name                                                             Cat City
rating                                                          Not Rated
year                                                                 1986
runtime                                                            96 min
genres                           Adventure, Comedy, Crime, Family, Sci-Fi
genre_count                                                             5
story_desc                                                             -9
avg_rating_score                                                     83.0
votescore                                                            83.0
metacritic                                                           -1.0
prod_company_count                                                      3
production_companies    Pannónia Filmstúdió, Sefel Pictures Internatio...
aspect_ratio                                                     1.37 : 1
production_countries                  

In [54]:
for x in range(len(df)):
    print(f"{x}{type(df['story_desc'][x])}")

0<class 'str'>
1<class 'str'>
2<class 'str'>
3<class 'str'>
4<class 'str'>
5<class 'str'>
6<class 'str'>
7<class 'str'>
8<class 'str'>
9<class 'str'>
10<class 'str'>
11<class 'str'>
12<class 'str'>
13<class 'str'>
14<class 'str'>
15<class 'str'>
16<class 'str'>
17<class 'str'>
18<class 'str'>
19<class 'str'>
20<class 'str'>
21<class 'str'>
22<class 'str'>
23<class 'str'>
24<class 'str'>
25<class 'str'>
26<class 'str'>
27<class 'str'>
28<class 'str'>
29<class 'str'>
30<class 'str'>
31<class 'str'>
32<class 'str'>
33<class 'str'>
34<class 'str'>
35<class 'str'>
36<class 'str'>
37<class 'str'>
38<class 'str'>
39<class 'str'>
40<class 'str'>
41<class 'str'>
42<class 'str'>
43<class 'str'>
44<class 'str'>
45<class 'str'>
46<class 'str'>
47<class 'str'>
48<class 'str'>
49<class 'str'>
50<class 'str'>
51<class 'str'>
52<class 'str'>
53<class 'str'>
54<class 'str'>
55<class 'str'>
56<class 'str'>
57<class 'str'>
58<class 'str'>
59<class 'str'>
60<class 'str'>
61<class 'str'>
62<class 'str'>
63

In [55]:
df.insert(7, "story_word_count", 0)

for x in range(len(df)):
    if df['story_desc'][x] == "-9":
        continue
    df['story_word_count'][x] = df['story_desc'][x].count(" ")+1

### Runtime

In [56]:
df = df.rename(columns={"runtime": "runtime_minutes"})
df.head()

Unnamed: 0,name,rating,year,runtime_minutes,genres,genre_count,story_desc,story_word_count,avg_rating_score,votescore,metacritic,prod_company_count,production_companies,aspect_ratio,production_countries,country_count,languages,language_count,votes,worldwide_gross_usd,na_gross_usd,opening_weekend_usd,director,budget_est_usd,orig_bgt_currency,profit_usd,writers
0,9,PG-13,2009,79 min,"Action, Adventure, Drama, Fantasy, Sci-Fi, Thr...",6,A rag doll that awakens in a postapocalyptic f...,15,65.0,70.0,60.0,3,"Focus Features, Relativity Media, Arc Productions",1.85 : 1,"United States, Canada, Luxembourg",3,English,1,141553,4842806,3174989,1074044,"Shane Acker,",4200000,usd,642806,"Pamela Pettler(screenplay by), Shane Acker(sto..."
1,A Bug's Life,G,1998,95 min,"Adventure, Comedy, Family, Fantasy",4,"A misfit ant, looking for ""warriors"" to save h...",27,74.5,72.0,77.0,2,"Pixar Animation Studios, Walt Disney Pictures",2.39 : 1,United States,1,English,1,295606,36325885,16279856,29112,"John Lasseter, Andrew Stanton(co-director),",16800000,usd,19525885,"John Lasseter(original story by), Andrew Stant..."
2,A Christmas Carol,PG,2009,96 min,"Adventure, Comedy, Drama, Family, Fantasy",5,An animated retelling of Charles Dickens' clas...,24,61.5,68.0,55.0,3,"Walt Disney Pictures, ImageMovers Digital, Ima...",2.39 : 1,United States,1,English,1,120272,32528664,13785586,3005107,"Robert Zemeckis,",28000000,usd,4528664,Charles Dickens(based on the classic story by)...
3,A Goofy Movie,G,1995,78 min,"Adventure, Comedy, Family, Musical, Romance",5,When Max makes a preposterous promise to a gir...,34,61.0,69.0,53.0,3,"Walt Disney Pictures, Disney Television Animat...",1.85 : 1,"United States, Australia, France, Canada",4,English,1,56379,3534859,3534859,612955,"Kevin Lima,",2520000,usd,1014859,"Jymn Magon(story by), Chris Matheson(screenpla..."
4,A Scanner Darkly,R,2006,100 min,"Comedy, Crime, Drama, Mystery, Sci-Fi, Thriller",6,An undercover cop in a not-too-distant future ...,24,71.5,70.0,73.0,3,"Warner Independent Pictures (WIP), Thousand Wo...",1.85 : 1,United States,1,English,1,113012,765991,550161,39167,"Richard Linklater,",1218000,usd,-452009,"Philip K. Dick(novel ""A Scanner Darkly""), Rich..."


In [57]:
df['runtime_minutes'] = df['runtime_minutes'].replace('min','', regex=True).astype(int)
df['runtime_minutes']

0       79
1       95
2       96
3       78
4      100
      ... 
469    101
470     85
471     80
472    108
473    117
Name: runtime_minutes, Length: 474, dtype: int64

### Some more feature engineering

In [58]:
df.insert(19,'avg_usd_budget_per_minute', -1)

In [59]:
for x in range(len(df)):
    if df['budget_est_usd'][x] != -1:
        df['avg_usd_budget_per_minute'][x] = round(df['budget_est_usd'][x]/df['runtime_minutes'][x],2)
    else:
        df['budget_est_usd'][x] = -1

In [60]:
df[{'budget_est_usd','runtime_minutes','avg_usd_budget_per_minute'}].tail(10)

Unnamed: 0,budget_est_usd,avg_usd_budget_per_minute,runtime_minutes
464,4200000,66666.67,63
465,-1,-1.0,98
466,168000,2100.0,80
467,1680000,16310.68,103
468,12600000,148235.29,85
469,23100000,228712.87,101
470,35000,411.76,85
471,11200000,140000.0,80
472,21000000,194444.44,108
473,-1,-1.0,117


### Directors

In [61]:
dir_column = df.pop("director")
# insert column with insert(location, column_name, column_value)

df.insert(len(df.columns)-1, "directors", dir_column)

In [62]:
df['directors'] = df['directors'].apply(lambda x: re.sub("[\(\[].*?[\)\]]", "", str(x)))
df['writers'] = df['writers'].apply(lambda x: re.sub("[\(\[].*?[\)\]]", "", str(x)) if x is not None else "Unspecified")

In [63]:
df[df['writers'] == "Unspecified"]

Unnamed: 0,name,rating,year,runtime_minutes,genres,genre_count,story_desc,story_word_count,avg_rating_score,votescore,metacritic,prod_company_count,production_companies,aspect_ratio,production_countries,country_count,languages,language_count,votes,avg_usd_budget_per_minute,worldwide_gross_usd,na_gross_usd,opening_weekend_usd,budget_est_usd,orig_bgt_currency,profit_usd,directors,writers


In [64]:
df['directors'] = df['directors'].str.rstrip(", ")
df['writers'] = df['writers'].str.rstrip(", ")

In [65]:
df.insert(len(df.columns)-1, "director_count", 0)
df.insert(len(df.columns), "writer_count", 0)

for x in range(len(df)):
    df['director_count'][x] = df['directors'][x].count(",")+1
    df['writer_count'][x] = df['writers'][x].count(",")+1

In [66]:
df.isnull().any()

name                         False
rating                       False
year                         False
runtime_minutes              False
genres                       False
genre_count                  False
story_desc                   False
story_word_count             False
avg_rating_score             False
votescore                    False
metacritic                   False
prod_company_count           False
production_companies         False
aspect_ratio                 False
production_countries         False
country_count                False
languages                    False
language_count               False
votes                        False
avg_usd_budget_per_minute    False
worldwide_gross_usd          False
na_gross_usd                 False
opening_weekend_usd          False
budget_est_usd               False
orig_bgt_currency            False
profit_usd                   False
directors                    False
director_count               False
writers             

In [67]:
df.dtypes

name                          object
rating                        object
year                           int64
runtime_minutes                int64
genres                        object
genre_count                    int64
story_desc                    object
story_word_count               int64
avg_rating_score             float64
votescore                    float64
metacritic                   float64
prod_company_count             int64
production_companies          object
aspect_ratio                  object
production_countries          object
country_count                  int64
languages                     object
language_count                 int64
votes                          int64
avg_usd_budget_per_minute    float64
worldwide_gross_usd            int64
na_gross_usd                   int64
opening_weekend_usd            int64
budget_est_usd                 int64
orig_bgt_currency             object
profit_usd                     int64
directors                     object
d

In [68]:
df

Unnamed: 0,name,rating,year,runtime_minutes,genres,genre_count,story_desc,story_word_count,avg_rating_score,votescore,metacritic,prod_company_count,production_companies,aspect_ratio,production_countries,country_count,languages,language_count,votes,avg_usd_budget_per_minute,worldwide_gross_usd,na_gross_usd,opening_weekend_usd,budget_est_usd,orig_bgt_currency,profit_usd,directors,director_count,writers,writer_count
0,9,PG-13,2009,79,"Action, Adventure, Drama, Fantasy, Sci-Fi, Thr...",6,A rag doll that awakens in a postapocalyptic f...,15,65.0,70.0,60.0,3,"Focus Features, Relativity Media, Arc Productions",1.85 : 1,"United States, Canada, Luxembourg",3,English,1,141553,53164.56,4842806,3174989,1074044,4200000,usd,642806,Shane Acker,1,"Pamela Pettler, Shane Acker, Ben Gluck",3
1,A Bug's Life,G,1998,95,"Adventure, Comedy, Family, Fantasy",4,"A misfit ant, looking for ""warriors"" to save h...",27,74.5,72.0,77.0,2,"Pixar Animation Studios, Walt Disney Pictures",2.39 : 1,United States,1,English,1,295606,176842.11,36325885,16279856,29112,16800000,usd,19525885,"John Lasseter, Andrew Stanton",2,"John Lasseter, Andrew Stanton, Joe Ranft",3
2,A Christmas Carol,PG,2009,96,"Adventure, Comedy, Drama, Family, Fantasy",5,An animated retelling of Charles Dickens' clas...,24,61.5,68.0,55.0,3,"Walt Disney Pictures, ImageMovers Digital, Ima...",2.39 : 1,United States,1,English,1,120272,291666.67,32528664,13785586,3005107,28000000,usd,4528664,Robert Zemeckis,1,"Charles Dickens, Robert Zemeckis",2
3,A Goofy Movie,G,1995,78,"Adventure, Comedy, Family, Musical, Romance",5,When Max makes a preposterous promise to a gir...,34,61.0,69.0,53.0,3,"Walt Disney Pictures, Disney Television Animat...",1.85 : 1,"United States, Australia, France, Canada",4,English,1,56379,32307.69,3534859,3534859,612955,2520000,usd,1014859,Kevin Lima,1,"Jymn Magon, Chris Matheson, Brian Pimental",3
4,A Scanner Darkly,R,2006,100,"Comedy, Crime, Drama, Mystery, Sci-Fi, Thriller",6,An undercover cop in a not-too-distant future ...,24,71.5,70.0,73.0,3,"Warner Independent Pictures (WIP), Thousand Wo...",1.85 : 1,United States,1,English,1,113012,12180.00,765991,550161,39167,1218000,usd,-452009,Richard Linklater,1,"Philip K. Dick, Richard Linklater",2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
469,Wreck-It Ralph,PG,2012,101,"Adventure, Comedy, Family, Fantasy, Sci-Fi",5,A video game villain wants to be a hero and se...,28,74.5,77.0,72.0,2,"Walt Disney Animation Studios, Walt Disney Pic...",2.39 : 1,United States,1,English,1,425919,228712.87,47122288,18942288,4903871,23100000,usd,24022288,Rich Moore,1,"Rich Moore, Phil Johnston, Jim Reardon",3
470,Yellow Submarine,G,1968,85,"Adventure, Comedy, Fantasy, Musical",4,The Beatles agree to accompany Captain Fred in...,23,76.5,74.0,79.0,3,"Apple Corps, King Features Syndicate, TVC London",1.66 : 1,"United Kingdom, United States",2,English,1,26705,411.76,127326,99230,10710,35000,euro,92326,George Dunning,1,"Lee Minoff, John Lennon, Paul McCartney",3
471,Yogi Bear,PG,2010,80,"Adventure, Comedy, Family",3,A documentary filmmaker travels to Jellystone ...,24,40.5,46.0,35.0,3,"Warner Bros., Sunswept Entertainment, De Line ...",1.85 : 1,"United States, New Zealand",2,English,1,22349,140000.00,20350937,10024601,1641132,11200000,usd,9150937,Eric Brevig,1,"Jennifer Ventimilia, Joshua Sternin, Brad Cope...",3
472,Zootopia,PG,2016,108,"Adventure, Comedy, Crime, Family, Mystery",5,"In a city of anthropomorphic animals, a rookie...",23,79.0,80.0,78.0,2,"Walt Disney Pictures, Walt Disney Animation St...",2.39 : 1,United States,1,English,1,501967,194444.44,104253368,34126824,7506340,21000000,usd,83253368,"Byron Howard, Rich Moore, Jared Bush",3,"Byron Howard, Rich Moore, Jared Bush",3


In [69]:
df.to_csv("imdb_animated_movies_clean.csv", index = False)