In [1]:
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Notebook display limits: render at most 25 rows and 15 columns of a frame;
# anything beyond that is elided in the output.
pd.options.display.max_rows = 25
pd.options.display.max_columns = 15

In [2]:
# Relative path of the movies dataset (resolved against the working directory).
MOVIES_CSV = 'movies_complete.csv'
df = pd.read_csv(MOVIES_CSV)

In [3]:
# Peek at five random rows. The fixed seed makes the sample (and therefore the
# displayed output) identical on every "Restart Kernel -> Run All".
df.sample(n=5, random_state=42)

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,...,overview,spoken_languages,poster_path,cast,cast_size,crew_size,director
41847,259129,Monster,,2014-03-13,Horror,,ko,...,Bok-Soon (Kim Go-eun-I) runs a street stall wh...,한국어/조선말,<img src='http://image.tmdb.org/t/p/w185//9Y1z...,Lee Min-ki|Kim Go-eun|Kim Roe-Ha|Kim Bu-seon|A...,9,5,Hwang In-Ho
17566,79775,The Old Man and the Sea,,1990-03-25,Action|Drama|Foreign|TV Movie,,en,...,Based on the novel by Ernest Hemingway. Santia...,English,<img src='http://image.tmdb.org/t/p/w185//tlX7...,Anthony Quinn|Gary Cole|Patricia Clarkson|Joe ...,8,3,Jud Taylor
8497,39222,The Kiss of the Vampire,Shocking! - Horrifying! - Macabre!,1963-09-11,Horror,,en,...,"Honeymooning in Bavaria, the young couple beco...",English|Latin,<img src='http://image.tmdb.org/t/p/w185//sCPt...,Clifford Evans|Noel Willman|Edward de Souza|Je...,12,14,Don Sharp
20439,119001,The Big Circus,No one seated during the last 10 minutes so yo...,1959-07-05,Drama,,en,...,With tough and savvy boss Victor Mature in cha...,English,<img src='http://image.tmdb.org/t/p/w185//zcOn...,Victor Mature|Red Buttons|Rhonda Fleming|Kathr...,12,10,Joseph M. Newman
34679,195867,Visitors of the Night,They're watching...They're waiting...They're b...,1995-11-26,Science Fiction,,en,...,Judith notices some very creepy things are hap...,English,<img src='http://image.tmdb.org/t/p/w185//3JBS...,Markie Post|Stephen McHattie|Candace Cameron B...,6,2,Jorge Montesi


In [4]:
# (number of rows, number of columns) of the loaded frame.
df.shape

(44691, 22)

In [5]:
# List the column labels available in the frame.
df.columns

Index(['id', 'title', 'tagline', 'release_date', 'genres',
       'belongs_to_collection', 'original_language', 'budget_musd',
       'revenue_musd', 'production_companies', 'production_countries',
       'vote_count', 'vote_average', 'popularity', 'runtime', 'overview',
       'spoken_languages', 'poster_path', 'cast', 'cast_size', 'crew_size',
       'director'],
      dtype='object')

Columns description:
* `id` - the unique movie identifier
* `title` - the movie's official title
* `tagline` - the movie's tagline
* `release_date` - the theatrical release date of the movie
* `genres` - genres associated with the movie
* `belongs_to_collection` - information about series/franchise the particular movie belongs to
* `original_language` - the language in which the movie was originally shot
* `budget_musd` - the budget of the movie in million dollars
* `revenue_musd` - the total revenue of the movie in million dollars
* `production_companies` - companies involved with the making of the movie
* `production_countries` - countries where the movie was shot/produced
* `vote_count` - the number of user votes (TMDB)
* `vote_average` - the average rating
* `popularity` - the popularity score (TMDB)
* `runtime` - the runtime in minutes
* `overview` - a brief summary of the movie
* `spoken_languages` - languages spoken in the movie
* `poster_path` - the poster image url
* `cast` - main actors
* `cast_size` - number of actors
* `director` - movie's director
* `crew_size` - size of the movie crew

In [6]:
# Inspect the dtype pandas inferred for each column when reading the CSV.
df.dtypes

id                         int64
title                     object
tagline                   object
release_date              object
genres                    object
belongs_to_collection     object
original_language         object
budget_musd              float64
revenue_musd             float64
production_companies      object
production_countries      object
vote_count               float64
vote_average             float64
popularity               float64
runtime                  float64
overview                  object
spoken_languages          object
poster_path               object
cast                      object
cast_size                  int64
crew_size                  int64
director                  object
dtype: object

Column `release_date` has `object` type. Convert it to `datetime`:

In [7]:
# Parse the release-date strings into pandas datetimes. `assign` replaces the
# existing column under the same label, so the column order is unchanged.
df = df.assign(release_date=pd.to_datetime(df['release_date']))

In [8]:
# Re-check the dtypes to confirm that `release_date` is now a datetime column.
df.dtypes

id                                int64
title                            object
tagline                          object
release_date             datetime64[ns]
genres                           object
belongs_to_collection            object
original_language                object
budget_musd                     float64
revenue_musd                    float64
production_companies             object
production_countries             object
vote_count                      float64
vote_average                    float64
popularity                      float64
runtime                         float64
overview                         object
spoken_languages                 object
poster_path                      object
cast                             object
cast_size                         int64
crew_size                         int64
director                         object
dtype: object

Let's check which columns have missing values:

In [9]:
# Count the missing entries in every column.
missing_mask = df.isna()
missing_mask.sum()

id                           0
title                        0
tagline                  24407
release_date                34
genres                    2105
belongs_to_collection    40228
original_language           10
budget_musd              35837
revenue_musd             37306
production_companies     11335
production_countries      5856
vote_count                   0
vote_average              2614
popularity                   0
runtime                   1512
overview                   951
spoken_languages          3597
poster_path                224
cast                      2189
cast_size                    0
crew_size                    0
director                   731
dtype: int64

In [10]:
# Fraction of missing values per column, largest share first.
(
    df.isna()
    .mean()
    .sort_values(ascending=False)
)

belongs_to_collection    0.900136
revenue_musd             0.834754
budget_musd              0.801884
tagline                  0.546128
production_companies     0.253630
production_countries     0.131033
spoken_languages         0.080486
vote_average             0.058491
cast                     0.048981
genres                   0.047101
runtime                  0.033832
overview                 0.021279
director                 0.016357
poster_path              0.005012
release_date             0.000761
original_language        0.000224
cast_size                0.000000
crew_size                0.000000
id                       0.000000
popularity               0.000000
title                    0.000000
vote_count               0.000000
dtype: float64

The share of missing values per column is more informative than the raw counts.
Should we fill the missing values in or drop them? Most values are missing in the columns `belongs_to_collection`, `revenue_musd` and `budget_musd` (about `80–90 %`), and a smaller but still considerable share is missing in some others (`tagline`, `production_companies`, `production_countries`).
The right choice depends on the analysis.
In our case we will neither remove nor fill the missing values.

****
Let's get some statistical information on the numerical columns:

In [11]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,id,budget_musd,revenue_musd,vote_count,vote_average,popularity,runtime,cast_size,crew_size
count,44691.0,8854.0,7385.0,44691.0,42077.0,44691.0,43179.0,44691.0,44691.0
mean,107186.242845,21.669886,68.968649,111.653778,6.003341,2.95746,97.56685,12.47909,10.313643
std,111806.362236,34.359837,146.608966,495.322313,1.28106,6.040008,34.653409,12.124663,15.892154
min,2.0,1e-06,1e-06,0.0,0.0,0.0,1.0,0.0,0.0
25%,26033.5,2.0,2.40542,3.0,5.3,0.402038,86.0,6.0,2.0
50%,59110.0,8.2,16.872671,10.0,6.1,1.150055,95.0,10.0,6.0
75%,154251.0,25.0,67.642693,35.0,6.8,3.768882,107.0,15.0,12.0
max,469172.0,380.0,2787.965087,14075.0,10.0,547.488298,1256.0,313.0,435.0


And also some summary statistics on the non-numerical columns containing string values and dates:

In [12]:
# Summary of the string and datetime columns: count, unique, top, freq, and
# first/last for the datetime column.
# NOTE(review): the echoed source line in this cell's output looks like the tail
# of a pandas warning, presumably about how `describe` treats datetime columns;
# confirm against the installed pandas version, since the result layout for
# datetimes may differ between versions.
df.describe(include=['object', 'datetime'])

  df.describe(include=['object', 'datetime'])


Unnamed: 0,title,tagline,release_date,genres,belongs_to_collection,original_language,production_companies,production_countries,overview,spoken_languages,poster_path,cast,director
count,44691,20284,44657,42586,4463,44681,33356,38835,43740,41094,44467,42502,43960
unique,41605,20171,17225,4044,1691,89,22537,2377,43715,1828,44456,42166,17349
top,Cinderella,Based on a true story.,2008-01-01 00:00:00,Drama,The Bowery Boys,en,Metro-Goldwyn-Mayer (MGM),United States of America,Adaptation of the Jane Austen novel.,English,<img src='http://image.tmdb.org/t/p/w185//qW1o...,Georges Méliès,John Ford
freq,11,7,133,4935,29,31755,741,17723,3,22189,3,24,66
first,,,1874-12-09 00:00:00,,,,,,,,,,
last,,,2017-12-27 00:00:00,,,,,,,,,,


We could also have described the datetime column as a numerical one, but here we only want some basic descriptive statistics for it (we don't need percentiles for our dates).

It seems that there may be duplicates of the movie titled `Cinderella` (the title occurs as many as `11` times). We have to check it.

In [13]:
# Keep only the rows whose title is exactly "Cinderella".
is_cinderella = df['title'] == 'Cinderella'
df_Cinderella = df[is_cinderella]

In [14]:
# Show the rows titled "Cinderella" to compare them side by side.
df_Cinderella

Unnamed: 0,id,title,tagline,release_date,genres,belongs_to_collection,original_language,...,overview,spoken_languages,poster_path,cast,cast_size,crew_size,director
984,11224,Cinderella,The greatest love story ever told.,1950-03-04,Family|Fantasy|Animation|Romance,Cinderella Collection,en,...,Cinderella has faith her dreams of a better li...,English,<img src='http://image.tmdb.org/t/p/w185//avz6...,Ilene Woods|Eleanor Audley|Lucille Bliss|Verna...,11,50,Clyde Geronimi
12988,42884,Cinderella,,1997-11-02,TV Movie|Family|Fantasy|Music|Romance,,en,...,Updated version of the classic Rodgers and Ham...,English,<img src='http://image.tmdb.org/t/p/w185//54aw...,Brandy Norwood|Whitney Houston|Whoopi Goldberg...,5,1,Robert Iscove
23254,92349,Cinderella,,1914-12-28,Fantasy|Drama,,en,...,Based on Charles Perrault's fairy tale: Cinder...,English,<img src='http://image.tmdb.org/t/p/w185//bL1K...,Mary Pickford|Owen Moore|Isabel Vernon|Georgia...,7,2,James Kirkwood
23265,105875,Cinderella,The version children love!,2002-08-06,Animation|Family|Fantasy,,en,...,"Cinderella, the beautiful and kind-hearted rag...",Afrikaans,<img src='http://image.tmdb.org/t/p/w185//rbzG...,Tony Ail|Nathan Aswell|Chera Bailey|Kathleen B...,7,3,Toshiyuki Hiruma
28073,261985,Cinderella,,2011-10-30,Family|TV Movie,,en,...,Once upon a time in post-war Rome: 13-year old...,English,<img src='http://image.tmdb.org/t/p/w185//wjZK...,Vanessa Hessler|Flavio Parenti|Natalia Wörner|...,14,11,Christian Duguay
28340,150689,Cinderella,Midnight is just the beginning.,2015-03-12,Romance|Fantasy|Family|Drama,,en,...,"When her father unexpectedly passes away, youn...",English,<img src='http://image.tmdb.org/t/p/w185//iH4C...,Lily James|Cate Blanchett|Richard Madden|Helen...,78,80,Kenneth Branagh
33805,42651,Cinderella,,1947-11-28,Comedy|Family|Fantasy,,ru,...,"Based on a classic fairytale ""Cinderella"" bril...",Pусский,<img src='http://image.tmdb.org/t/p/w185//p41r...,Yanina Zhejmo|Aleksei Konsovsky|Faina Ranevska...,6,4,Nadezhda Kosheverova
35114,44459,Cinderella,,1957-03-31,Drama|Romance,,en,...,The first of three TV-versions of the classic ...,English,<img src='http://image.tmdb.org/t/p/w185//cB6I...,Julie Andrews|Howard Lindsay|Howard Lindsay|Il...,9,10,Ralph Nelson
35116,289673,Cinderella,,2000-01-01,,,en,...,Cinderella (named Zezolla) and her family live...,English,<img src='http://image.tmdb.org/t/p/w185//gMYk...,Kathleen Turner|Katrin Cartlidge|David Warner|...,10,6,Beeban Kidron
40439,114108,Cinderella,,1899-10-01,Fantasy|Horror|Science Fiction|Family,,fr,...,A fairy godmother magically turns Cinderella's...,No Language,<img src='http://image.tmdb.org/t/p/w185//cf0g...,Georges Méliès|Barral|Bleuette Bernon|Carmely|...,6,2,Georges Méliès


That's ok. These are all different movies/productions (values of `release_date` differ).

In [15]:
# Number of "Cinderella" rows per release date; a count above 1 would point to
# a repeated (title, release_date) pair.
df_Cinderella.value_counts(subset='release_date')

release_date
1899-10-01    1
1914-12-28    1
1947-11-28    1
1950-03-04    1
1957-03-31    1
1997-11-02    1
2000-01-01    1
2002-08-06    1
2011-10-30    1
2012-02-14    1
2015-03-12    1
dtype: int64