Student: Satish Byrow

# **1. Goal**
- For this project, you have been hired to produce a MySQL database on Movies from a subset of IMDB's publicly available dataset. Ultimately, you will use this database to analyze what makes a movie successful and will provide recommendations to the stakeholder on how to make a successful movie.

## Data Dictionary
TBC

# **2. Import and Loading**

## Load Libraries

In [1]:
#Load Libraries
import pandas as pd
import numpy as np

## Load Data

In [35]:
# Paths of the raw IMDB dumps (tab-separated, gzip-compressed) in the local repository.
basics_url = "Data/title.basics.tsv.gz"
rating_url = "Data/title.ratings.tsv.gz"
akas_url = "Data/title.akas.tsv.gz"

# Shared reader options: tab delimiter, and low_memory=False so each file is
# parsed in one pass rather than in chunks.
tsv_options = dict(sep='\t', low_memory=False)

basics = pd.read_csv(basics_url, **tsv_options)
ratings = pd.read_csv(rating_url, **tsv_options)
akas = pd.read_csv(akas_url, **tsv_options)


In [32]:
# Convert the literal '\N' placeholder to NaN in all three tables so that
# pandas' null-aware methods (isnull, dropna, count, ...) recognise it.
# Nothing is removed here; rows are dropped in the cleaning section.
basics = basics.replace('\\N', np.nan)
ratings = ratings.replace('\\N', np.nan)
akas = akas.replace('\\N', np.nan)

# **3. Explore Data Set**

In [4]:
# Preview the first five rows of the basics table
basics.head(n=5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,,1,"Comedy,Short"


In [5]:
# Summarise the basics table: row count, column names, dtypes and memory usage
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10243653 entries, 0 to 10243652
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 703.4+ MB


In [6]:
# Preview the first five rows of the ratings table
ratings.head(n=5)

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
2,tt0000003,6.5,1892
3,tt0000004,5.5,178
4,tt0000005,6.2,2678


In [7]:
# Preview the first five rows of the akas (alternative titles) table
akas.head(n=5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [8]:
# Summarise the akas table: row count, column names, dtypes and memory usage
akas.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 37510211 entries, 0 to 37510210
Data columns (total 8 columns):
 #   Column           Dtype 
---  ------           ----- 
 0   titleId          object
 1   ordering         int64 
 2   title            object
 3   region           object
 4   language         object
 5   types            object
 6   attributes       object
 7   isOriginalTitle  object
dtypes: int64(1), object(7)
memory usage: 2.2+ GB


# **4. Clean Data Set**

In [9]:
# Number of titles with no runtime recorded
basics['runtimeMinutes'].isna().sum()

7154281

In [10]:
# Drop every title whose runtime is missing, then confirm that none remain.
# dropna(subset=...) removes the same rows as collecting their index labels and
# calling drop(inplace=True), without the scratch variable or in-place mutation.
basics = basics.dropna(subset=['runtimeMinutes'])
basics['runtimeMinutes'].isna().sum()

0

In [11]:
# Number of titles with no genre recorded
basics['genres'].isna().sum()

80733

In [12]:
# Drop every title whose genres value is missing, then confirm that none remain.
# dropna(subset=...) removes the same rows as collecting their index labels and
# calling drop(inplace=True), without the scratch variable or in-place mutation.
basics = basics.dropna(subset=['genres'])
basics['genres'].isna().sum()

0

In [13]:
# Per-column non-null counts for the rows whose titleType does NOT contain
# "movie" (case-insensitive), i.e. the rows about to be filtered out.
not_movie = ~basics['titleType'].str.contains("Movie", case=False)
basics.loc[not_movie].count()


tconst            2525073
titleType         2525073
primaryTitle      2525072
originalTitle     2525072
isAdult           2525073
startYear         2341222
endYear             52699
runtimeMinutes    2525073
genres            2525073
dtype: int64

In [14]:
# Keep only the titles whose titleType contains "movie" (case-insensitive).
# NOTE(review): a substring match would also keep a type such as "tvMovie" if
# present; use basics['titleType'] == 'movie' if only that exact type is wanted.
# A boolean mask replaces the index-label drop(inplace=True); na=False makes a
# missing titleType count as "not a movie" instead of breaking the mask.
is_movie = basics['titleType'].str.contains("Movie", case=False, na=False)
basics = basics[is_movie]

In [15]:
# Verify the filter: number of remaining rows whose titleType contains "movie".
# .sum() counts the True values of the mask; .count() would only count non-null
# entries (every row), so it could never reveal a row that failed the match.
basics['titleType'].str.contains("Movie", case=False).sum()


483566

In [16]:
# Number of remaining titles that have a start year recorded
basics['startYear'].notna().sum()

476578

In [37]:
# Keep only the titles released in 2000, 2001 or 2002.
# startYear is stored as text and can be missing. A str.contains mask carries
# NaN for missing years, and boolean indexing with such a mask raises.
# Parsing the year as a number (unparseable or missing -> NaN) and testing the
# inclusive range gives a strictly boolean mask.
start_year = pd.to_numeric(basics['startYear'], errors='coerce')
basics = basics[start_year.between(2000, 2002)]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
15176,tt0015414,movie,La tierra de los toros,La tierra de los toros,0,2000,\N,60,\N
33800,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,\N,20,Short
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
92617,tt0094718,short,Beavers,Beavers,0,2002,\N,31,"Documentary,Short"
93927,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,\N,126,Drama
...,...,...,...,...,...,...,...,...,...
10242765,tt9914878,tvEpisode,Episode #1.969,Episode #1.969,0,2001,\N,22,"Drama,Fantasy,Romance"
10242853,tt9915084,tvEpisode,Making-of,Making-of,0,2000,\N,\N,Comedy
10243215,tt9915932,videoGame,Aero Dancing F,Aero Dancing F,0,2000,\N,\N,\N
10243277,tt9916064,videoGame,AeroWings,Aero Dancing i,0,2001,\N,\N,\N


In [23]:
# Number of titles that have a start year recorded
basics['startYear'].notna().sum()

483566

In [24]:
# Exclude titles whose genres include Documentary (case-insensitive match).
# na=False treats a missing genres value as "not a documentary", so the mask is
# strictly boolean and the `~` negation cannot fail on NaN.
is_documentary = basics['genres'].str.contains('documentary', case=False, na=False)
basics = basics[~is_documentary]


In [25]:
# Preview the first five rows of akas before filtering it by region
akas.head(n=5)

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0


In [38]:
# Reduce akas to its US-region rows, then flag the basics titles whose tconst
# appears among the remaining akas titleIds.
is_us = akas['region'].eq('US')
akas = akas.loc[is_us]
keepers = basics['tconst'].isin(akas['titleId'])
keepers

15176       False
33800        True
34800        True
92617        True
93927        True
            ...  
10242765    False
10242853    False
10243215    False
10243277    False
10243310    False
Name: tconst, Length: 319434, dtype: bool

In [39]:
# Keep only the basics rows flagged by the keepers mask
basics = basics.loc[keepers]
basics

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
33800,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,\N,20,Short
34800,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
92617,tt0094718,short,Beavers,Beavers,0,2002,\N,31,"Documentary,Short"
93927,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,\N,126,Drama
104782,tt0107194,short,Imagine,Imagine,0,2002,\N,22,"Documentary,Short"
...,...,...,...,...,...,...,...,...,...
10239444,tt9907718,short,Homeless Love,Homeless Love,0,2002,\N,\N,"Drama,Short"
10239773,tt9908414,tvSeries,Block 13,Block 13,0,2000,2003,10,Animation
10240402,tt9909874,video,Halloween: Resurrection - On the Set with Jami...,Halloween: Resurrection - On the Set with Jami...,0,2002,\N,4,"Documentary,Horror,Short"
10240422,tt9909916,video,Halloween: Resurrection - Tour of Set with Pro...,Halloween: Resurrection - Tour of Set with Pro...,0,2002,\N,7,"Documentary,Horror,Short"


In [40]:
# Display akas after the region == 'US' filter (pandas truncates the rendered output)
akas


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
5,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
14,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
33,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
36,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
41,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
...,...,...,...,...,...,...,...,...
37509737,tt9916560,1,March of Dimes Presents: Once Upon a Dime,US,\N,imdbDisplay,\N,0
37509807,tt9916620,1,The Copeland Case,US,\N,imdbDisplay,\N,0
37509896,tt9916702,1,Loving London: The Playground,US,\N,\N,\N,0
37509939,tt9916756,1,Pretty Pretty Black Girl,US,\N,imdbDisplay,\N,0


In [41]:
# Flag the ratings rows whose tconst appears among the akas titleIds
# (akas was reduced to region == 'US' earlier in the notebook).
us_title_ids = akas['titleId'].unique()
keepers1 = ratings['tconst'].isin(us_title_ids)
keepers1

0           True
1           True
2          False
3          False
4           True
           ...  
1359499    False
1359500    False
1359501    False
1359502    False
1359503    False
Name: tconst, Length: 1359504, dtype: bool

In [42]:
# Keep only the ratings rows flagged by the keepers1 mask
ratings = ratings.loc[keepers1]
ratings


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
4,tt0000005,6.2,2678
5,tt0000006,5.0,182
6,tt0000007,5.4,838
...,...,...,...
1359466,tt9916200,8.1,238
1359467,tt9916204,8.2,273
1359474,tt9916348,8.3,18
1359475,tt9916362,6.4,5570


In [None]:
# TODO (disabled): remove orphaned akas rows, i.e. rows whose titleId does not
# appear in basics['tconst'].
# NOTE(review): the .sum() calls look like they were meant as before/after row
# counts; on a string column .sum() concatenates the values. Use len() or .count().
#akas['titleId'].sum()
#indexNames = akas[~(akas['titleId'].isin(basics['tconst']))].index 
# Delete the rows with these index labels from the DataFrame
#akas.drop(indexNames , inplace=True)
#akas['titleId'].sum()

In [None]:
# TODO (disabled): remove orphaned ratings rows, i.e. rows whose tconst does not
# appear in basics['tconst'].
# NOTE(review): the .sum() calls look like they were meant as before/after row
# counts; on a string column .sum() concatenates the values. Use len() or .count().
#ratings['tconst'].sum()
#indexNames = ratings[~(ratings['tconst'].isin(basics['tconst']))].index 
# Delete the rows with these index labels from the DataFrame
#ratings.drop(indexNames , inplace=True)
#ratings['tconst'].sum()


# **5. Write the Data**

In [43]:
# Save each filtered table as a gzip-compressed CSV (file names use underscores
# instead of dots); index=False leaves the row index out of the files.
for frame, path in (
    (basics, "Data/title_basics.csv.gz"),
    (ratings, "Data/title_ratings.csv.gz"),
    (akas, "Data/title_akas.csv.gz"),
):
    frame.to_csv(path, compression='gzip', index=False)

In [44]:
# Reload the saved basics file and preview its first five rows
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory=False)
basics.head(n=5)


Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0034413,short,Youth Gets a Break,Youth Gets a Break,0,2001,\N,20,Short
1,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,\N,118,"Comedy,Fantasy,Romance"
2,tt0094718,short,Beavers,Beavers,0,2002,\N,31,"Documentary,Short"
3,tt0096056,movie,Crime and Punishment,Crime and Punishment,0,2002,\N,126,Drama
4,tt0107194,short,Imagine,Imagine,0,2002,\N,22,"Documentary,Short"


In [45]:
# Reload the saved ratings file and preview its first five rows
ratings = pd.read_csv("Data/title_ratings.csv.gz", low_memory=False)
ratings.head(n=5)


Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,2002
1,tt0000002,5.8,269
2,tt0000005,6.2,2678
3,tt0000006,5.0,182
4,tt0000007,5.4,838


In [46]:
# Reload the saved akas file and preview its first five rows
akas = pd.read_csv("Data/title_akas.csv.gz", low_memory=False)
akas.head(n=5)


Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,6,Carmencita,US,\N,imdbDisplay,\N,0
1,tt0000002,7,The Clown and His Dogs,US,\N,\N,literal English title,0
2,tt0000005,10,Blacksmith Scene,US,\N,imdbDisplay,\N,0
3,tt0000005,1,Blacksmithing Scene,US,\N,alternative,\N,0
4,tt0000005,6,Blacksmith Scene #1,US,\N,alternative,\N,0
