# <h1 align="center" style="font-size:200%; color:blue">Microsoft’s entrance to Movie business</h1><br>
<h1 align="center" style="font-size:200%; color:blue">Data loading, cleaning and exploration</h1> <br>

<p style="color:black"> First, I explore and clean the data files provided for the project. The following table lists the 11 files, grouped by data source. </p> <br>

|      __IMDB__                |    __Box Office Mojo__   |   __Rotten Tomatoes__  | __TheMovieDB.org__  | __The Numbers__          |
|------------------------------|--------------------------|------------------------|---------------------|--------------------------|
| 1. imdb.name.basics.csv      | 7. bom.movie_gross.csv   | 8. rt.movie_info.tsv   | 10. tmdb.movies.csv | 11. tn.movie_budgets.csv |
| 2. imdb.title.akas.csv       |                          | 9. rt.reviews.tsv      |                     |                          |
| 3. imdb.title.basics.csv     |                          |                        |                     |                          |
| 4. imdb.title.crew.csv       |                          |                        |                     |                          |
| 5. imdb.title.principals.csv |                          |                        |                     |                          |
| 6. imdb.title.ratings.csv    |                          |                        |                     |                          |

<p style="font-size:100%; color:green"> Methodology:  <b>R</b>OSEMED</p>
<!-- <p style="color:black"> <b>R</b>OSEMED method </p> -->

In [1]:
# importing libraries
import pandas as pd
import numpy as np

# Data loading and cleaning

## cleaning the-numbers.com data file

In [2]:
"""
original source: https://www.the-numbers.com/movie/budgets
"""
# Load the budget/gross table, then preview it: the first four rows followed
# by the total number of rows.
tn_budgets = pd.read_csv('Data/tn.movie_budgets.csv')
display(tn_budgets.head(4))
display(len(tn_budgets))

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
0,1,"Dec 18, 2009",Avatar,"$425,000,000","$760,507,625","$2,776,345,279"
1,2,"May 20, 2011",Pirates of the Caribbean: On Stranger Tides,"$410,600,000","$241,063,875","$1,045,663,875"
2,3,"Jun 7, 2019",Dark Phoenix,"$350,000,000","$42,762,350","$149,762,350"
3,4,"May 1, 2015",Avengers: Age of Ultron,"$330,600,000","$459,005,868","$1,403,013,963"


5782

In [3]:
# Clean the-numbers.com budget table:
#   * drop the redundant `id` column,
#   * parse `release_date` into datetimes,
#   * trim whitespace around movie titles,
#   * turn money strings such as "$425,000,000" into floats.
# errors='ignore' keeps the cell re-runnable once `id` is already gone.
tn_budgets = tn_budgets.drop(columns='id', errors='ignore')
tn_budgets['release_date'] = pd.to_datetime(tn_budgets['release_date'])
tn_budgets['movie'] = tn_budgets['movie'].str.strip()

for money_col in ('domestic_gross', 'production_budget', 'worldwide_gross'):
    tn_budgets[money_col] = (
        tn_budgets[money_col]
        .astype(str)                        # lets the cell run again on already-numeric data
        .str.replace('$', '', regex=False)  # literal '$', never the regex end-of-string anchor
        .str.replace(',', '', regex=False)
        .astype(float)
    )
# tn_budgets.isna().sum() was checked by the author: no nulls remain.

## cleaning Box Office Mojo data file

In [4]:
# Read the Box Office Mojo grosses; show a two-row sample and the row count.
bom = pd.read_csv('Data/bom.movie_gross.csv')
display(bom.head(2))
display(len(bom))

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
0,Toy Story 3,BV,415000000.0,652000000,2010
1,Alice in Wonderland (2010),BV,334200000.0,691300000,2010


3387

In [5]:
"""
The null values in domestic_gross and foreign_gross are tagged with -1.0 (float64)
and the null values in studio with 'unknown'
"""
# Tag missing values with sentinels, then keep only rows that have a usable
# foreign gross.
# Results are assigned back instead of using chained `inplace=True`, which
# does not update the parent frame under pandas Copy-on-Write.
bom['studio'] = bom['studio'].fillna('unknown')
bom['domestic_gross'] = pd.to_numeric(
    bom['domestic_gross'], errors='coerce').fillna(-1.0)
# The sentinel must be the float -1.0 (not a string): that keeps the column
# numeric and lets the filter below actually match the tagged rows.
bom['foreign_gross'] = pd.to_numeric(
    bom['foreign_gross'], errors='coerce').fillna(-1.0)
# Values that are missing or not parseable as numbers are dropped here.
bom = bom[bom['foreign_gross'] != -1.0].copy()
# bom.isna().sum() was checked by the author: no nulls remain.

## cleaning Rotten Tomatoes data files

In [6]:
# Rotten Tomatoes movie info (tab-separated): preview two rows and the size.
rt = pd.read_csv('Data/rt.movie_info.tsv', delimiter='\t')
display(rt.head(2))
display(len(rt))

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
0,1,"This gritty, fast-paced, and innovative police...",R,Action and Adventure|Classics|Drama,William Friedkin,Ernest Tidyman,"Oct 9, 1971","Sep 25, 2001",,,104 minutes,
1,3,"New York City, not-too-distant-future: Eric Pa...",R,Drama|Science Fiction and Fantasy,David Cronenberg,David Cronenberg|Don DeLillo,"Aug 17, 2012","Jan 1, 2013",$,600000.0,108 minutes,Entertainment One


1560

In [7]:
# Keep only id, rating, theater_date and runtime. The author notes that the
# dropped columns are covered more comprehensively by the IMDB files.
# errors='ignore' keeps the cell re-runnable.
rt = rt.drop(
    columns=['synopsis', 'genre', 'box_office', 'director', 'writer',
             'dvd_date', 'currency', 'studio'],
    errors='ignore',
)

# `rating` holds text codes (the preview shows 'R'), so it must not go through
# pd.to_numeric: that coerces every value to NaN and the filter would then
# remove all rows. Keep it as text and drop only rows with no rating.
rt['rating'] = rt['rating'].str.strip()
rt = rt[rt['rating'].notna()].copy()

# "104 minutes" -> 104.0; missing or unparseable runtimes are tagged -1.0.
rt['runtime'] = pd.to_numeric(
    rt['runtime'].astype(str).str.replace('minutes', '', regex=False).str.strip(),
    errors='coerce',
).fillna(-1.0)
rt['theater_date'] = rt['theater_date'].fillna('unknown')

Series([], Name: theater_date, dtype: object)

In [8]:
"""
review's file maybe useful for reviews and ratings. 
encoding worked!!! with correct delimiter
"""
# Reviews file: tab-separated and read with latin-1 encoding.
rt_reviews = pd.read_csv('Data/rt.reviews.tsv', sep='\t', encoding='latin-1')
display(rt_reviews.head(2))
display(len(rt_reviews))

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
0,3,A distinctly gallows take on contemporary fina...,3/5,fresh,PJ Nabarro,0,Patrick Nabarro,"November 10, 2018"
1,3,It's an allegory in search of a meaning that n...,,rotten,Annalee Newitz,0,io9.com,"May 23, 2018"


54432

## cleaning IMDB data files

In [9]:
# IMDB information about movie personnel: three-row sample, then row count.
name = pd.read_csv('Data/imdb.name.basics.csv')
display(name.head(3))
display(len(name))

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
0,nm0061671,Mary Ellen Bauder,,,"miscellaneous,production_manager,producer","tt0837562,tt2398241,tt0844471,tt0118553"
1,nm0061865,Joseph Bauer,,,"composer,music_department,sound_department","tt0896534,tt6791238,tt0287072,tt1682940"
2,nm0062070,Bruce Baum,,,"miscellaneous,actor,writer","tt1470654,tt0363631,tt0104030,tt0102898"


606648

In [10]:
"""
print(name.isna().sum()) # birth_year (523912) and death_year (599865) are mostly null, so both are dropped;
primary_profession (51340) and known_for_titles (30204) are kept because they may be important,
 and the missing values can always be scraped later
"""
# Remove the two year columns from the personnel table.
name = name.drop(columns=['birth_year', 'death_year'])

In [11]:
# IMDB title "akas" file; its key column is renamed to imdb_id on load.
takas = pd.read_csv('Data/imdb.title.akas.csv').rename(
    columns={'title_id': 'imdb_id'})
display(takas.head(3))
display(len(takas))

Unnamed: 0,imdb_id,ordering,title,region,language,types,attributes,is_original_title
0,tt0369610,10,Джурасик свят,BG,bg,,,0.0
1,tt0369610,11,Jurashikku warudo,JP,,imdbDisplay,,0.0
2,tt0369610,12,Jurassic World: O Mundo dos Dinossauros,BR,,imdbDisplay,,0.0


331703

In [12]:
# Redundancy spot-check: pick a random row, read its imdb_id and list every
# row that shares that id.
rand_id = np.random.randint(0, len(takas))
title = takas.loc[rand_id, 'imdb_id']
display(takas.loc[takas.imdb_id == title])
# Author's observation after several random draws: the titles listed under one
# imdb_id are completely different in most cases, so this file was judged
# unsuitable and is not used in the further analysis.
# NOTE(review): in the sample shown, the differing titles each carry a
# different `region`, and one row is flagged by is_original_title. These look
# like per-region alternative titles rather than corrupted data - confirm
# before writing the file off.
# (Kept as comments: a bare string as the last statement is echoed as output.)

Unnamed: 0,imdb_id,ordering,title,region,language,types,attributes,is_original_title
125696,tt2246376,1,Sleeper's Wake,,,original,,1.0
125697,tt2246376,2,Sleeper's Wake,ZA,,,,0.0
125698,tt2246376,3,Trezirea,RO,,imdbDisplay,,0.0
125699,tt2246376,4,Az alvó ébredése,HU,,imdbDisplay,,0.0
125700,tt2246376,5,Wyrwani ze snu,PL,,imdbDisplay,,0.0


"checked with several random imdb_id's but all had totally convoluted data. \nThe titles for the same imdb_id is completely different in most cases. \nThis imdb.title.akas.csv data file is unsalvageable. \nfile should not be used for further analysis.\n"

In [13]:
# IMDB title basics; both column renames are applied in a single call.
tbasics = pd.read_csv('Data/imdb.title.basics.csv').rename(
    columns={'tconst': 'imdb_id', 'start_year': 'year'})
display(tbasics.head(2))
display(len(tbasics))

Unnamed: 0,imdb_id,primary_title,original_title,year,runtime_minutes,genres
0,tt0063540,Sunghursh,Sunghursh,2013,175.0,"Action,Crime,Drama"
1,tt0066787,One Day Before the Rainy Season,Ashad Ka Ek Din,2019,114.0,"Biography,Drama"


146144

In [14]:
# Consistency spot-check: draw a random row label, read its imdb_id and show
# every row carrying that id. (tbasics.sample(3) is the standard shortcut for
# random sampling.)
rand_id = np.random.randint(len(tbasics))
title = tbasics.at[rand_id, 'imdb_id']
tbasics[tbasics['imdb_id'] == title]
# Author's finding over many draws: ids are consistent; some cleaning is needed.

Unnamed: 0,imdb_id,primary_title,original_title,year,runtime_minutes,genres
145551,tt9838346,Future of the Past,Katsute no mirai,2019,,


In [15]:
"""
runtime_minutes (31739) and genres (5408) contain nulls. No rows are dropped,
because primary_title is never null;
the nulls are filled with placeholder values instead
"""

# Tidy the title table without dropping rows: trim text columns, coerce the
# numeric columns, and tag missing values with placeholders
# ('unknown' for text, 3001 for year, -1.0 for runtime).
# Every result is assigned back to the frame; chained `inplace=True` on a
# column selection does not update the parent under pandas Copy-on-Write.
for text_col in ('primary_title', 'original_title', 'genres'):
    tbasics[text_col] = tbasics[text_col].str.strip()

tbasics['original_title'] = tbasics['original_title'].fillna('unknown')
tbasics['genres'] = tbasics['genres'].fillna('unknown')
tbasics['year'] = pd.to_numeric(
    tbasics['year'], errors='coerce').fillna(3001)
tbasics['runtime_minutes'] = pd.to_numeric(
    tbasics['runtime_minutes'], errors='coerce').fillna(-1.0)
# tbasics.isna().sum() was checked by the author: no nulls remain.

In [16]:
# IMDB crew per title. The director/writer values are person ids, so names
# have to come from imdb.name.basics.csv.
crew = pd.read_csv('Data/imdb.title.crew.csv').rename(
    columns={'tconst': 'imdb_id'})
display(crew.head(2))
display(len(crew))

Unnamed: 0,imdb_id,directors,writers
0,tt0285252,nm0899854,nm0899854
1,tt0438973,,"nm0175726,nm1802864"


146144

In [17]:
# Consistency spot-check on crew: show all rows for one randomly drawn imdb_id.
rand_id = np.random.randint(len(crew))
title = crew.at[rand_id, 'imdb_id']
crew[crew['imdb_id'] == title]

Unnamed: 0,imdb_id,directors,writers
22032,tt1866094,nm0294274,nm0294274


In [18]:
# Trim the id lists and keep only titles that have both a director and a
# writer. dropna states that directly, and avoids a chained `inplace` fill on
# a filtered slice, which does not update the frame under Copy-on-Write.
for person_col in ('directors', 'writers'):
    crew[person_col] = crew[person_col].str.strip()
crew = crew.dropna(subset=['writers', 'directors'])
# crew.isna().sum() can be inspected before and after to verify the result.

In [19]:
# IMDB ratings keyed by imdb_id: five-row sample, then the row count.
ratings = pd.read_csv('Data/imdb.title.ratings.csv').rename(
    columns={'tconst': 'imdb_id'})
display(ratings.head(5))
display(len(ratings))

Unnamed: 0,imdb_id,averagerating,numvotes
0,tt10356526,8.3,31
1,tt10384606,8.9,559
2,tt1042974,6.4,20
3,tt1043726,4.2,50352
4,tt1060240,6.5,21


73856

In [20]:
# Look up five hand-picked imdb_ids in tbasics, one table per id.
title = ['tt10356526', 'tt10384606', 'tt1042974', 'tt1043726', 'tt1060240']
for t in title:
    display(tbasics.loc[tbasics['imdb_id'] == t])

Unnamed: 0,imdb_id,primary_title,original_title,year,runtime_minutes,genres
3667,tt10356526,Laiye Je Yaarian,Laiye Je Yaarian,2019,117.0,Romance


Unnamed: 0,imdb_id,primary_title,original_title,year,runtime_minutes,genres
4103,tt10384606,Borderless,Borderless,2019,87.0,Documentary


Unnamed: 0,imdb_id,primary_title,original_title,year,runtime_minutes,genres
4721,tt1042974,Just Inès,Just Inès,2010,90.0,Drama


Unnamed: 0,imdb_id,primary_title,original_title,year,runtime_minutes,genres
4825,tt1043726,The Legend of Hercules,The Legend of Hercules,2014,99.0,"Action,Adventure,Fantasy"


Unnamed: 0,imdb_id,primary_title,original_title,year,runtime_minutes,genres
5471,tt1060240,Até Onde?,Até Onde?,2011,73.0,"Mystery,Thriller"


In [21]:
# Consistency spot-check on ratings: show all rows for one random imdb_id.
rand_id = np.random.randint(len(ratings))
title = ratings.at[rand_id, 'imdb_id']
ratings[ratings['imdb_id'] == title]

Unnamed: 0,imdb_id,averagerating,numvotes
23062,tt7263974,3.9,81


In [22]:
# Force both rating columns to numeric (unparseable entries become NaN), then
# write the cleaned table to disk.
for score_col in ('averagerating', 'numvotes'):
    ratings[score_col] = pd.to_numeric(ratings[score_col], errors='coerce')
# ratings.isna().sum() before/after looked clean to the author.
ratings.to_csv('Data/imdb.title.ratings_clean.csv', index=False)

In [23]:
# Principal cast/crew per title (the author suspects it is redundant).
principals = pd.read_csv('Data/imdb.title.principals.csv').rename(
    columns={'tconst': 'imdb_id'})
display(principals.head(2))
display(len(principals))

Unnamed: 0,imdb_id,ordering,nconst,category,job,characters
0,tt0111414,1,nm0246005,actor,,"[""The Man""]"
1,tt0111414,2,nm0398271,director,,


1028186

In [24]:
# Remove every row that has a null in any column. The author judged rows with
# nulls to be useless.
# NOTE(review): both preview rows have a null `job`, so they are removed too -
# check how many rows survive this step.
principals.dropna(axis=0, how='any', inplace=True)
# principals.isna().sum() / principals.head(2) can be used to inspect the result.

## TheMovieDB data file

In [25]:
# TheMovieDB.org export. The saved index column ('Unnamed: 0') plus genre_ids
# and popularity are dropped right after loading.
tmdb = pd.read_csv('Data/tmdb.movies.csv')
tmdb = tmdb.drop(columns=['Unnamed: 0', 'genre_ids', 'popularity'])
display(tmdb.head(3))
display(len(tmdb))

Unnamed: 0,id,original_language,original_title,release_date,title,vote_average,vote_count
0,12444,en,Harry Potter and the Deathly Hallows: Part 1,2010-11-19,Harry Potter and the Deathly Hallows: Part 1,7.7,10788
1,10191,en,How to Train Your Dragon,2010-03-26,How to Train Your Dragon,7.7,7610
2,10138,en,Iron Man 2,2010-05-07,Iron Man 2,6.8,12368


26517

In [26]:
# TMDB is comprehensive but has no IMDB id. Trim whitespace in each text
# column (nulls are skipped), then drop any row that still contains a null.
for text_col in ('original_language', 'original_title', 'release_date', 'title'):
    tmdb[text_col] = tmdb[text_col].map(
        lambda value: value.strip(), na_action='ignore')

# tmdb.isna().sum() can be checked before and after this step.
tmdb.dropna(axis=0, how='any', inplace=True)

# Alternative data from IMDBpro
credit: Jesse Newman for web scraping

In [27]:
# Scraped IMDBpro data (credit: Jesse Newman for the web scraping).
revenue = pd.read_csv('Data/revenue.csv')
# date.csv is a renamed copy of data supplied with the project.
date = pd.read_csv('Data/date.csv')
for frame in (revenue, date):
    display(frame.head(2), len(frame))

Unnamed: 0,imdb_id,title,year,director,region_code,rank,budget_usd,us_gross,genres
0,tt8228288,The Platform,2019,Galder Gaztelu-Urrutia,[ES],1,-1.0,-1.0,"Horror, Sci-Fi, Thriller"
1,tt1598778,Contagion,2011,Steven Soderbergh,[US],2,60000000.0,76000000.0,"Action, Drama, Thriller"


10001

Unnamed: 0,imdb_id,title,date
0,tt0848228,The Avengers,2012-04-25
1,tt4154796,Avengers: Endgame,2019-04-24


14466

In [28]:
# Clean the revenue table:
#   * strip the square brackets around region_code ("[US]" -> "US"),
#   * drop the scrape `rank` column,
#   * discard rows without an imdb_id,
#   * tag remaining missing text values with 'unknown'.
# The brackets are removed as literal characters (regex=False). The previous
# '\[' pattern only works when str.replace defaults to regex matching, and it
# is an invalid escape sequence in a normal string.
revenue['region_code'] = (
    revenue['region_code']
    .str.replace('[', '', regex=False)
    .str.replace(']', '', regex=False)
)
# errors='ignore' keeps the cell re-runnable once `rank` is gone.
revenue = revenue.drop(columns='rank', errors='ignore')
# .copy() gives an independent frame so the assignments below are not made on
# a slice of the original.
revenue = revenue[revenue['imdb_id'].fillna('unknown') != 'unknown'].copy()
for text_col in ('genres', 'director', 'region_code'):
    revenue[text_col] = revenue[text_col].fillna('unknown')

In [29]:
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only adds a stray "None" to the output.
revenue.info()
date.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 0 to 9999
Data columns (total 8 columns):
imdb_id        10000 non-null object
title          10000 non-null object
year           10000 non-null int64
director       10000 non-null object
region_code    10000 non-null object
budget_usd     10000 non-null float64
us_gross       10000 non-null float64
genres         10000 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 703.1+ KB


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14466 entries, 0 to 14465
Data columns (total 3 columns):
imdb_id    14466 non-null object
title      14466 non-null object
date       14466 non-null object
dtypes: object(3)
memory usage: 339.2+ KB


None

## Movie budget and gross revenue 

In [30]:
# Overview of budget_usd and us_gross.
# Films whose budget is -1 (presumably the missing-value placeholder) that
# still grossed more than $100M in the US:
display(revenue[(revenue['budget_usd'] == -1)
                & (revenue['us_gross'] > 100000000)])
display(revenue['budget_usd'].describe())
# Number of rows with a negative us_gross:
display(revenue.loc[revenue['us_gross'] < 0, 'us_gross'].count())
display(revenue['us_gross'].describe())

Unnamed: 0,imdb_id,title,year,director,region_code,budget_usd,us_gross,genres
1107,tt0091326,The Karate Kid Part II,1986,John G. Avildsen,US,-1.0,115000000.0,"Action, Family, Romance"
3733,tt0034492,Bambi,1942,James Algar,US,-1.0,102000000.0,"Animation, Drama, Family"
4356,tt0082846,On Golden Pond,1981,Mark Rydell,unknown,-1.0,119000000.0,Drama
4682,tt0486946,Wild Hogs,2007,Walt Becker,US,-1.0,168000000.0,"Action, Adventure, Comedy"
5995,tt0081562,Stir Crazy,1980,Sidney Poitier,US,-1.0,101000000.0,"Comedy, Crime"


count    1.000000e+04
mean     2.119376e+07
std      1.539497e+08
min     -1.000000e+00
25%     -1.000000e+00
50%      4.500000e+06
75%      2.500000e+07
max      1.500000e+10
Name: budget_usd, dtype: float64

3233

count    1.000000e+04
mean     2.767767e+07
std      5.780555e+07
min     -1.000000e+00
25%     -1.000000e+00
50%      2.800000e+06
75%      3.200000e+07
max      9.370000e+08
Name: us_gross, dtype: float64

To understand the profitability of a movie, both gross revenue and budget are needed. In this dataset with 14,431 entries, 6,534 entries do not have budget information. However, movies like <b>Wild Hogs</b> (\$ 168 million) and <b>Bambi</b> (\$ 102 million) clearly had box-office success. Thus, eliminating those data at this stage might not be prudent. 

## Join date, runtime and rating to  revenue DataFrame

In [31]:
# Attach the release date to every revenue row (left join on imdb_id).
# `date` keeps everything except its duplicate `title` column.
# errors='ignore' makes the cell safe to re-run.
date = date.drop(columns=['title'], errors='ignore')
revenue = (
    revenue
    .drop(columns=['date'], errors='ignore')   # avoid date_x/date_y on re-run
    # sort=True orders the result by imdb_id and yields a fresh 0..n-1 index,
    # so no manual sort or set_index/reset_index round-trip is needed.
    .merge(date, on='imdb_id', how='left', sort=True)
)
# `date` values are left as read from the CSV; they are not converted to
# datetimes here.
display(revenue.head(3))

Unnamed: 0,imdb_id,title,year,director,region_code,budget_usd,us_gross,genres,date
0,tt0004972,The Birth of a Nation,1915,D.W. Griffith,US,100000.0,-1.0,"Drama, History, War",
1,tt0010323,The Cabinet of Dr. Caligari,1920,Robert Wiene,DE,18000.0,9000.0,"Fantasy, Horror, Mystery",
2,tt0012349,The Kid,1921,Charles Chaplin,US,250000.0,-1.0,"Comedy, Drama, Family",


In [32]:
# Attach runtime_minutes from tbasics to every revenue row (left join on
# imdb_id). The other descriptive columns of tbasics are dropped first;
# errors='ignore' makes the cell safe to re-run.
tbasics = tbasics.drop(
    columns=['primary_title', 'original_title', 'year', 'genres'],
    errors='ignore',
)
revenue = (
    revenue
    .drop(columns=['runtime_minutes'], errors='ignore')   # re-run safety
    # sort=True orders the result by imdb_id and yields a fresh 0..n-1 index.
    .merge(tbasics, on='imdb_id', how='left', sort=True)
)
display(revenue.head(3))

Unnamed: 0,imdb_id,title,year,director,region_code,budget_usd,us_gross,genres,date,runtime_minutes
0,tt0004972,The Birth of a Nation,1915,D.W. Griffith,US,100000.0,-1.0,"Drama, History, War",,
1,tt0010323,The Cabinet of Dr. Caligari,1920,Robert Wiene,DE,18000.0,9000.0,"Fantasy, Horror, Mystery",,
2,tt0012349,The Kid,1921,Charles Chaplin,US,250000.0,-1.0,"Comedy, Drama, Family",,


In [33]:
# Attach averagerating from the ratings table to every revenue row (left join
# on imdb_id). numvotes is not needed; errors='ignore' makes the cell safe to
# re-run.
ratings = ratings.drop(columns=['numvotes'], errors='ignore')
revenue = (
    revenue
    .drop(columns=['averagerating'], errors='ignore')   # re-run safety
    # sort=True orders the result by imdb_id and yields a fresh 0..n-1 index.
    .merge(ratings, on='imdb_id', how='left', sort=True)
)
display(revenue.head(3))

Unnamed: 0,imdb_id,title,year,director,region_code,budget_usd,us_gross,genres,date,runtime_minutes,averagerating
0,tt0004972,The Birth of a Nation,1915,D.W. Griffith,US,100000.0,-1.0,"Drama, History, War",,,
1,tt0010323,The Cabinet of Dr. Caligari,1920,Robert Wiene,DE,18000.0,9000.0,"Fantasy, Horror, Mystery",,,
2,tt0012349,The Kid,1921,Charles Chaplin,US,250000.0,-1.0,"Comedy, Drama, Family",,,


In [34]:
# Persist the combined revenue table, without the index column.
revenue.to_csv(path_or_buf='Data/movie_main.csv', index=False)

In [35]:
# Preview the first ten rows of the combined table.
revenue.iloc[:10]

Unnamed: 0,imdb_id,title,year,director,region_code,budget_usd,us_gross,genres,date,runtime_minutes,averagerating
0,tt0004972,The Birth of a Nation,1915,D.W. Griffith,US,100000.0,-1.0,"Drama, History, War",,,
1,tt0010323,The Cabinet of Dr. Caligari,1920,Robert Wiene,DE,18000.0,9000.0,"Fantasy, Horror, Mystery",,,
2,tt0012349,The Kid,1921,Charles Chaplin,US,250000.0,-1.0,"Comedy, Drama, Family",,,
3,tt0013257,Häxan,1922,Benjamin Christensen,unknown,2000000.0,-1.0,"Fantasy, History, Horror",,,
4,tt0013442,Nosferatu,1922,F.W. Murnau,DE,-1.0,-1.0,"Fantasy, Horror",,,
5,tt0015324,Sherlock Jr.,1924,Buster Keaton,US,-1.0,-1.0,"Action, Comedy, Romance",,,
6,tt0015648,Battleship Potemkin,1925,Sergei M. Eisenstein,SUHH,-1.0,51000.0,"Drama, History, Thriller",1925-12-24,,
7,tt0015864,The Gold Rush,1925,Charles Chaplin,US,923000.0,-1.0,"Adventure, Comedy, Drama",,,
8,tt0016220,The Phantom of the Opera,1925,Rupert Julian,US,-1.0,-1.0,Horror,,,
9,tt0016630,Battling Butler,1926,Buster Keaton,US,-1.0,-1.0,"Action, Comedy, Romance",,,
