# Imported Libraries for Data Analysis

In [1]:
import pandas as pd
import numpy as np

# The Basics Database from the IMDB webside.
A section of this database is shown below to illustrate the names of the columns and rows within the database.

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Information
This information shows the data type that is located in the database.
It shows the names of the columns.
The shape of the database is 9,906,183 rows and 9 columns.
There are no duplicated columns.
There are 37 values that are N/A.  Those were taken care of and replaced so that the data would be used correctly in the analysis.

In [3]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9906183 entries, 0 to 9906182
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 680.2+ MB


In [4]:
basics.shape

(9906183, 9)

In [5]:
basics.duplicated().sum()

0

In [6]:
basics.isna().sum().sum()

37

In [7]:
basics.replace({'\\N':np.nan}, inplace = True)

# Value Counts
This was used to show how many movies are in each time frame of the movie.

In [8]:
basics['runtimeMinutes'].value_counts()

30      219641
60      167503
22      161731
15       78463
44       75985
         ...  
736          1
529          1
673          1
830          1
2088         1
Name: runtimeMinutes, Length: 890, dtype: int64

# Dropped Columns
The runtumeMinutes column was dropped.
The genres column was dropped.

In [9]:
basics.dropna(subset= ['runtimeMinutes'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear          171828
endYear           2876979
runtimeMinutes          0
genres              77023
dtype: int64

In [10]:
basics.dropna(subset= ['genres'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          166893
endYear           2801554
runtimeMinutes          0
genres                  0
dtype: int64

# Value Counts
The value counts of the title type is listed below.

In [11]:
basics['titleType'].value_counts()

tvEpisode       1456595
short            603479
movie            383454
video            181018
tvMovie           91897
tvSeries          90793
tvSpecial         18285
tvMiniSeries      17287
tvShort            8729
videoGame           322
Name: titleType, dtype: int64

# Formation
The data is formed into variables that are needed to complete the analysis.

In [12]:
basics['titleType']=='movie'
FLMovie = basics['titleType']=='movie'

In [13]:
basics.loc[FLMovie,:]
basics = basics.loc[FLMovie,:]

In [14]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383454 entries, 8 to 9906133
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          383454 non-null  object
 1   titleType       383454 non-null  object
 2   primaryTitle    383454 non-null  object
 3   originalTitle   383454 non-null  object
 4   isAdult         383454 non-null  object
 5   startYear       376986 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  383454 non-null  object
 8   genres          383454 non-null  object
dtypes: object(9)
memory usage: 29.3+ MB


In [15]:
#year  change to a float- .astype(float)
basics['startYear'].value_counts().astype(float)

2017    14386.0
2018    14348.0
2019    14096.0
2016    13965.0
2015    13476.0
         ...   
1899        1.0
1904        1.0
1897        1.0
1896        1.0
1894        1.0
Name: startYear, Length: 130, dtype: float64

# Focus
The focus of this dataset are movies created beginning in year 2000 to 2022.

In [16]:
basics = basics[(basics['startYear']>='2000') & (basics['startYear']<'2022')]

In [17]:
# Exclude movies that are included in the documentary category.
#Check order in LP
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [31]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81778 entries, 0 to 81777
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81778 non-null  object 
 1   titleType       81778 non-null  object 
 2   primaryTitle    81778 non-null  object 
 3   originalTitle   81778 non-null  object 
 4   isAdult         81778 non-null  int64  
 5   startYear       81778 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81778 non-null  int64  
 8   genres          81778 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.6+ MB


# The Akas Database from the IMDB webside.
A section of this database is shown below to illustrate the names of the columns and rows within the database.

In [18]:
akas="https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas, sep='\t', low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


# Value Counts
The value counts of the region is listed below.

In [19]:
akas['region'].value_counts()

DE    4325238
FR    4321049
JP    4319343
IN    4260982
ES    4240774
       ...   
FM          2
TV          1
PW          1
NR          1
NU          1
Name: region, Length: 248, dtype: int64

#  Replace
Those were taken care of and replaced so that the data would be used correctly in the analysis.

In [20]:
akas.replace({'\\N':np.nan})

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0
...,...,...,...,...,...,...,...,...
36106098,tt9916852,5,Episódio #3.20,PT,pt,,,0
36106099,tt9916852,6,Episodio #3.20,IT,it,,,0
36106100,tt9916852,7,एपिसोड #3.20,IN,hi,,,0
36106101,tt9916856,1,The Wind,DE,,imdbDisplay,,0


# Focus
The focus of this dataset is the region of the database.

In [21]:
akas = akas[(akas['region'] == 'US')]

In [22]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = basics['tconst'].isin(akas['titleId'])
basics =  basics[keepers]

In [23]:
akas['region'].value_counts()

US    1441687
Name: region, dtype: int64

In [34]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1441687 entries, 5 to 36105847
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1441687 non-null  object
 1   ordering         1441687 non-null  int64 
 2   title            1441687 non-null  object
 3   region           1441687 non-null  object
 4   language         1441687 non-null  object
 5   types            1441687 non-null  object
 6   attributes       1441687 non-null  object
 7   isOriginalTitle  1441687 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.0+ MB


# The Ratings Database from the IMDB webside.
A section of this database is shown below to illustrate the names of the columns and rows within the database.

In [24]:
ratings="https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings, sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1978
1,tt0000002,5.8,264
2,tt0000003,6.5,1831
3,tt0000004,5.6,179
4,tt0000005,6.2,2621


#  Replace
Those were taken care of and replaced so that the data would be used correctly in the analysis.

In [25]:
ratings.replace({'\\N':np.nan})

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1978
1,tt0000002,5.8,264
2,tt0000003,6.5,1831
3,tt0000004,5.6,179
4,tt0000005,6.2,2621
...,...,...,...
1319458,tt9916730,8.3,10
1319459,tt9916766,7.0,21
1319460,tt9916778,7.2,36
1319461,tt9916840,7.5,7


# Focus
The focus of this dataset is the title ID of the database.

In [26]:
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings[keepers]

# New Table
The following boxes takes all the individual tables and puts them into one table that is used for the analysis.  Because the missing values have already been taken care of above, we know they are not an consideration now.  This dataset shows the movies that are full length movies, fictional movies, movies released from 2000 to 2022, and were released only in the United States.

In [27]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['title_basics.csv.gz']

In [28]:
['title_basics.csv.gz']

['title_basics.csv.gz']

In [29]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [30]:
# Open saved file and preview again
basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001,,118,"Comedy,Fantasy,Romance"
1,tt0043139,movie,Life of a Beijing Policeman,Wo zhe yi bei zi,0,2013,,120,"Drama,History"
2,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El tango del viudo y su espejo deformante,0,2020,,70,Drama
3,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018,,122,Drama
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005,,100,"Comedy,Horror,Sci-Fi"


In [32]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500861 entries, 0 to 1319438
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         500861 non-null  object 
 1   averageRating  500861 non-null  float64
 2   numVotes       500861 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB
