# Raw data analysis

## Loading the Data into DataFrames

Importing required libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline


Using a helper function to pull in all the DataFrames together

In [73]:
from os.path import isfile, join

def df_from_zipped(path, fn):
    '''
    Read one gzip-compressed delimited file into a DataFrame.

    Parameters
    ----------
    path : str
        Directory that contains the file.
    fn : str
        File name inside `path`. Names containing 'tsv' are parsed as
        tab-separated with the 'unicode_escape' codec; everything else is
        parsed with the default separator as UTF-8.

    Returns
    -------
    pandas.DataFrame
        The parsed contents of the file.
    '''
    # Pick the parsing options from the file name
    if 'tsv' in fn:
        delimiter = '\t'
        encoding = 'unicode_escape'
    else:
        delimiter = None  # None lets read_csv fall back to its default separator
        encoding = 'utf-8'

    # Build the location from the `path` argument rather than a notebook-level
    # global, so the helper works for any directory, with or without a
    # trailing separator.
    joined_path = join(path, fn)
    return pd.read_csv(joined_path, compression='gzip', delimiter=delimiter, encoding=encoding)

Pull in all the files from zippedData and put them all together into a dictionary

In [74]:
from os import listdir

zipped_path = 'zippedData/'
dfs = {}

# Load every gzip archive found in zipped_path into the dfs dictionary.
for fn in listdir(zipped_path):
    # Skip entries that are not gzip archives (e.g. .DS_Store or
    # .ipynb_checkpoints); the gzip reader would fail on them.
    if not fn.endswith('.gz'):
        continue
    # Key = file name minus its last 7 characters
    # (assumes a '.csv.gz' / '.tsv.gz' style suffix).
    dfs[fn[:-7]] = df_from_zipped(zipped_path, fn)

Now we have a dictionary filled with DataFrame objects

In [66]:
# Collect the dictionary keys (one per loaded file) and display them
df_names = list(dfs)
df_names

['imdb.title.crew',
 'tmdb.movies',
 'imdb.title.akas',
 'imdb.title.ratings',
 'imdb.name.basics',
 'rt.reviews',
 'imdb.title.basics',
 'rt.movie_info',
 'tn.movie_budgets',
 'bom.movie_gross',
 'imdb.title.principals']

## Exploring Each DataFrame

### DataFrame 1: 'imdb.title.crew'

Let's start looking at each DF and finding out what each one contains

In [67]:
# Look the frame up by its key rather than by position in df_names:
# the key order follows os.listdir, which returns entries in arbitrary order.
imdb_title_crew = dfs['imdb.title.crew']
imdb_title_crew.sample(5)

Unnamed: 0,tconst,directors,writers
30732,tt8435268,"nm4661972,nm0960495","nm9591496,nm9849133"
72678,tt1980180,nm1540670,
105427,tt5933608,nm8334381,"nm8334384,nm1826037,nm8334385"
93693,tt6734990,"nm3451937,nm3543091,nm8896923,nm1088738","nm3543091,nm3451937,nm1088738"
29467,tt3657200,nm0186610,"nm0186610,nm0560943"


In [98]:
imdb_title_crew.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 3 columns):
tconst       146144 non-null object
directors    140417 non-null object
writers      110261 non-null object
dtypes: object(3)
memory usage: 3.3+ MB


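What are these values? `tconst` looks like a title ID, and the `nm…` values in `directors` and `writers` look like person IDs, so they are probably keys into other DataFrames.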

### DataFrame 2: 'tmdb.movies'

In [68]:
# NOTE: despite the variable name, this frame comes from the 'tmdb.movies' file.
# Drop the leftover 'Unnamed: 0' column without mutating the frame stored in
# dfs, so re-running this cell gives the same result instead of a KeyError.
imdb_movies = dfs['tmdb.movies'].drop(columns='Unnamed: 0', errors='ignore')

In [69]:
imdb_movies.sample(5)

Unnamed: 0,genre_ids,id,original_language,original_title,popularity,release_date,title,vote_average,vote_count
22160,"[80, 53]",521646,en,The Things We've Seen,1.4,2017-01-28,The Things We've Seen,10.0,2
22180,[],490299,en,The Legend of 420,1.4,2017-12-01,The Legend of 420,7.7,12
15129,"[27, 53]",370234,en,Dementia,2.43,2015-12-04,Dementia,4.7,36
9046,"[18, 36]",127856,fr,Augustine,1.76,2013-05-17,Augustine,5.5,23
25159,[53],533274,en,Stalked by My Doctor: Patient's Revenge,2.655,2018-06-10,Stalked by My Doctor: Patient's Revenge,10.0,1


In [100]:
imdb_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26517 entries, 0 to 26516
Data columns (total 9 columns):
genre_ids            26517 non-null object
id                   26517 non-null int64
original_language    26517 non-null object
original_title       26517 non-null object
popularity           26517 non-null float64
release_date         26517 non-null object
title                26517 non-null object
vote_average         26517 non-null float64
vote_count           26517 non-null int64
dtypes: float64(2), int64(2), object(5)
memory usage: 1.8+ MB


### DataFrame 3: 'imdb.title.akas'

In [71]:
# Bind the 'imdb.title.akas' frame to its own name, then preview five random rows
imdb_titles_akas = dfs['imdb.title.akas']
imdb_titles_akas.sample(n=5)

Unnamed: 0,title_id,ordering,title,region,language,types,attributes,is_original_title
147703,tt2960218,1,LuTo,MX,,,,0.0
272872,tt5336634,1,Approach to Love II,CN,,,,0.0
220289,tt5132086,1,There Where Atilla Passes...,CA,en,imdbDisplay,,0.0
51708,tt1322312,11,Going the Distance,,,original,,1.0
126505,tt2991224,29,Mandarinas,ES,,imdbDisplay,,0.0


In [101]:
imdb_titles_akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 331703 entries, 0 to 331702
Data columns (total 8 columns):
title_id             331703 non-null object
ordering             331703 non-null int64
title                331703 non-null object
region               278410 non-null object
language             41715 non-null object
types                168447 non-null object
attributes           14925 non-null object
is_original_title    331678 non-null float64
dtypes: float64(1), int64(1), object(6)
memory usage: 20.2+ MB


### DataFrame 4: 'imdb.title.ratings'

In [72]:
# Bind the 'imdb.title.ratings' frame to its own name, then preview five random rows
imdb_title_ratings = dfs['imdb.title.ratings']
imdb_title_ratings.sample(n=5)

Unnamed: 0,tconst,averagerating,numvotes
1447,tt1772230,5.9,5534
70283,tt2106460,6.8,326
27033,tt5450570,6.9,172
26417,tt3421186,5.3,355
54431,tt5172372,6.9,8


In [102]:
imdb_title_ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 73856 entries, 0 to 73855
Data columns (total 3 columns):
tconst           73856 non-null object
averagerating    73856 non-null float64
numvotes         73856 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 1.7+ MB


### DataFrame 5: 'imdb.name.basics'

In [70]:
# Bind the 'imdb.name.basics' frame to its own name, then preview five random rows
imdb_name_basics = dfs['imdb.name.basics']
imdb_name_basics.sample(n=5)

Unnamed: 0,nconst,primary_name,birth_year,death_year,primary_profession,known_for_titles
23213,nm0690703,Henrik Pontoppidan,1857.0,1943.0,writer,"tt0092103,tt6748178,tt0011765,tt8436026"
581176,nm7454430,Jay Finlayson,1994.0,,"actor,cinematographer,manager","tt2321596,tt5240540,tt2319282"
274009,nm3809260,Julieann Choi,,,actress,tt1600885
568715,nm9224802,Serge Reboul,,,actor,tt7290756
82672,nm0426577,Claire Johnston,,,actress,"tt1068649,tt0335119,tt0340096,tt5420870"


In [103]:
imdb_name_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 606648 entries, 0 to 606647
Data columns (total 6 columns):
nconst                606648 non-null object
primary_name          606648 non-null object
birth_year            82736 non-null float64
death_year            6783 non-null float64
primary_profession    555308 non-null object
known_for_titles      576444 non-null object
dtypes: float64(2), object(4)
memory usage: 27.8+ MB


This is the table of people; it can be joined to the other tables on `nconst`.

### DataFrame 6: 'rt.reviews'

In [76]:
# Name the dataset explicitly; a positional lookup depends on the
# (arbitrary) order in which the files were listed.
df_name_6 = 'rt.reviews'
df_name_6

'rt.reviews'

In [82]:
# Index by the literal key so this cell does not depend on the listing
# order of the data directory.
rt_reviews = dfs['rt.reviews']
rt_reviews.sample(5)

Unnamed: 0,id,review,rating,fresh,critic,top_critic,publisher,date
6511,251,The film can be seen as another element that q...,8/10,fresh,Uriel Barco,0,Garuyo,"February 12, 2016"
9589,364,"Like the first of the Addams chronicles, this ...",,rotten,Richard Schickel,1,TIME Magazine,"November 21, 2008"
46893,1762,After setting the scene with vivid characters ...,3/5,fresh,Rich Cline,0,Contactmusic.com,"October 9, 2015"
48745,1819,If Spielberg had found a way to inject even a ...,,rotten,Adam Lubitow,0,Rochester City Newspaper,"August 28, 2018"
48011,1793,"It's a bit somnolent, but the interactions bet...",,fresh,Stephen Farber,0,Movieline,"January 9, 2002"


In [104]:
rt_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 54432 entries, 0 to 54431
Data columns (total 8 columns):
id            54432 non-null int64
review        48869 non-null object
rating        40915 non-null object
fresh         54432 non-null object
critic        51710 non-null object
top_critic    54432 non-null int64
publisher     54123 non-null object
date          54432 non-null object
dtypes: int64(2), object(6)
memory usage: 3.3+ MB


### DataFrame 7: 'imdb.title.basics'

In [84]:
# Name the dataset explicitly; a positional lookup depends on the
# (arbitrary) order in which the files were listed.
df_name_7 = 'imdb.title.basics'
df_name_7

'imdb.title.basics'

In [88]:
# Index by the literal key so this cell does not depend on the listing
# order of the data directory.
imdb_title_basics = dfs['imdb.title.basics']
imdb_title_basics.sample(5)

Unnamed: 0,tconst,primary_title,original_title,start_year,runtime_minutes,genres
94081,tt5479882,Cadalso,Cadalso,2012,73.0,Drama
126647,tt7741250,The Adventures of Link,The Adventures of Link,2018,,"Adventure,Fantasy"
70878,tt4054936,Kaul,Kaul,2016,118.0,"Drama,Mystery,Thriller"
118934,tt7133400,Walden,Walden,2017,,
109911,tt6447632,Midnight Record Search,Midnight Record Search,2016,,


In [105]:
imdb_title_basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 146144 entries, 0 to 146143
Data columns (total 6 columns):
tconst             146144 non-null object
primary_title      146144 non-null object
original_title     146123 non-null object
start_year         146144 non-null int64
runtime_minutes    114405 non-null float64
genres             140736 non-null object
dtypes: float64(1), int64(1), object(4)
memory usage: 6.7+ MB


### DataFrame 8: 'rt.movie_info'

In [89]:
# Name the dataset explicitly; a positional lookup depends on the
# (arbitrary) order in which the files were listed.
df_name_8 = 'rt.movie_info'
df_name_8

'rt.movie_info'

In [90]:
# Index by the literal key so this cell does not depend on the listing
# order of the data directory.
rt_movie_info = dfs['rt.movie_info']
rt_movie_info.sample(5)

Unnamed: 0,id,synopsis,rating,genre,director,writer,theater_date,dvd_date,currency,box_office,runtime,studio
297,392,Featuring Jennifer Lopez in her first major bi...,R,Drama,Gregory Nava,Anna Thomas|Gregory Nava,"May 3, 1995","Apr 6, 2004",,,125 minutes,
331,435,Nella campagna toscana il venticinquenne Mario...,NR,Art House and International|Comedy,Giuseppe Bertolucci,Roberto Benigni|Giuseppe Bertolucci,"Jan 1, 1977","Dec 6, 2005",,,91 minutes,
411,537,"In director Richard Lester's Cuba, Sean Conner...",R,Action and Adventure|Drama|Mystery and Suspens...,Richard Lester,Charles Wood|Charles Wood,"Dec 21, 1979","Apr 16, 2002",,,121 minutes,
1542,1981,"Money, Fame and the Knowledge of English. In I...",NR,Comedy|Drama,Gauri Shinde,Gauri Shinde,"Oct 5, 2012","Nov 20, 2012",$,1416189.0,129 minutes,Eros Entertainment
512,672,A couple's strength and faith are tested after...,PG-13,Drama|Horror,Bill Duke,Brian Bird,"Jan 9, 2009","Apr 7, 2009",$,10572742.0,100 minutes,Screen Gems/SONY PICTURES


In [106]:
rt_movie_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1560 entries, 0 to 1559
Data columns (total 12 columns):
id              1560 non-null int64
synopsis        1498 non-null object
rating          1557 non-null object
genre           1552 non-null object
director        1361 non-null object
writer          1111 non-null object
theater_date    1201 non-null object
dvd_date        1201 non-null object
currency        340 non-null object
box_office      340 non-null object
runtime         1530 non-null object
studio          494 non-null object
dtypes: int64(1), object(11)
memory usage: 146.4+ KB


### DataFrame 9: 'tn.movie_budgets'

In [91]:
df_names[8]

'tn.movie_budgets'

In [92]:
# Index by the literal key so this cell does not depend on the listing
# order of the data directory.
tn_movie_budgets = dfs['tn.movie_budgets']
tn_movie_budgets.sample(5)

Unnamed: 0,id,release_date,movie,production_budget,domestic_gross,worldwide_gross
3337,38,"Mar 1, 2013",Ernest et Celestine,"$12,500,000","$292,562","$9,376,444"
4084,85,"Feb 1, 2008",Hannah Montana/Miley Cyrus: Best of Both World...,"$6,500,000","$65,281,781","$70,712,099"
2127,28,"Aug 10, 1990",Flatliners,"$26,000,000","$61,308,153","$61,308,153"
5328,29,"Sep 27, 2002",Charly,"$950,000","$814,666","$814,666"
217,18,"Feb 26, 2016",Gods of Egypt,"$140,000,000","$31,153,464","$138,836,756"


In [107]:
tn_movie_budgets.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5782 entries, 0 to 5781
Data columns (total 6 columns):
id                   5782 non-null int64
release_date         5782 non-null object
movie                5782 non-null object
production_budget    5782 non-null object
domestic_gross       5782 non-null object
worldwide_gross      5782 non-null object
dtypes: int64(1), object(5)
memory usage: 271.2+ KB


### DataFrame 10: 'bom.movie_gross'

In [93]:
df_names[9]

'bom.movie_gross'

In [94]:
# Index by the literal key so this cell does not depend on the listing
# order of the data directory.
bom_movie_gross = dfs['bom.movie_gross']
bom_movie_gross.sample(5)

Unnamed: 0,title,studio,domestic_gross,foreign_gross,year
1822,Walk of Shame,FCW,59200.0,,2014
438,The Debt,Focus,31200000.0,14500000.0,2011
2902,Wonder Wheel,Amazon,1400000.0,14500000.0,2017
2949,Kedi,Osci.,2800000.0,,2017
135,Splice,WB,17000000.0,9800000.0,2010


In [108]:
bom_movie_gross.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3387 entries, 0 to 3386
Data columns (total 5 columns):
title             3387 non-null object
studio            3382 non-null object
domestic_gross    3359 non-null float64
foreign_gross     2037 non-null object
year              3387 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 132.4+ KB


### DataFrame 11: 'imdb.title.principals'

In [95]:
df_names[10]

'imdb.title.principals'

In [96]:
# Index by the literal key so this cell does not depend on the listing
# order of the data directory.
imdb_title_principals = dfs['imdb.title.principals']
imdb_title_principals.sample(5)


Unnamed: 0,tconst,ordering,nconst,category,job,characters
898056,tt9015306,4,nm4987883,writer,story consultant,
436328,tt3790342,1,nm6445182,self,,"[""Himself""]"
936667,tt7063210,9,nm1304832,composer,,
991040,tt9531110,1,nm0009595,director,,
9457,tt10097752,8,nm10580542,actor,,


In [109]:
imdb_title_principals.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1028186 entries, 0 to 1028185
Data columns (total 6 columns):
tconst        1028186 non-null object
ordering      1028186 non-null int64
nconst        1028186 non-null object
category      1028186 non-null object
job           177684 non-null object
characters    393360 non-null object
dtypes: int64(1), object(5)
memory usage: 47.1+ MB
