# Imported Libraries for Data Analysis

In [1]:
import pandas as pd
import numpy as np

# The Basics Database from the IMDB webside.
A section of this database is shown below to illustrate the names of the columns and rows within the database.

In [2]:
basics_url="https://datasets.imdbws.com/title.basics.tsv.gz"
basics = pd.read_csv(basics_url, sep='\t', low_memory=False)
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


# Information
This information shows the data type that is located in the database.
It shows the names of the columns.
The shape of the database is 9,906,183 rows and 9 columns.
There are no duplicated columns.
There are 37 values that are N/A.  Those were taken care of and replaced so that the data would be used correctly in the analysis.

In [3]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9925060 entries, 0 to 9925059
Data columns (total 9 columns):
 #   Column          Dtype 
---  ------          ----- 
 0   tconst          object
 1   titleType       object
 2   primaryTitle    object
 3   originalTitle   object
 4   isAdult         object
 5   startYear       object
 6   endYear         object
 7   runtimeMinutes  object
 8   genres          object
dtypes: object(9)
memory usage: 681.5+ MB


In [4]:
basics.shape

(9925060, 9)

In [5]:
basics.duplicated().sum()

0

In [6]:
basics.isna().sum().sum()

37

In [7]:
basics.replace({'\\N':np.nan}, inplace = True)

# Value Counts
This was used to show how many movies are in each time frame of the movie.

In [8]:
basics['runtimeMinutes'].value_counts()

30      220006
60      167810
22      164968
15       81921
44       76053
         ...  
456          1
736          1
529          1
673          1
2088         1
Name: runtimeMinutes, Length: 890, dtype: int64

# Dropped Columns
The runtumeMinutes column was dropped.
The genres column was dropped.

In [9]:
basics.dropna(subset= ['runtimeMinutes'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 1
startYear          173814
endYear           2886950
runtimeMinutes          0
genres              76737
dtype: int64

In [10]:
basics.dropna(subset= ['genres'], inplace=True)
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            1
originalTitle           1
isAdult                 0
startYear          168882
endYear           2811810
runtimeMinutes          0
genres                  0
dtype: int64

# Value Counts
The value counts of the title type is listed below.

In [11]:
basics['titleType'].value_counts()

tvEpisode       1465640
short            604156
movie            383743
video            181238
tvMovie           91996
tvSeries          90897
tvSpecial         18319
tvMiniSeries      17325
tvShort            8689
videoGame           322
Name: titleType, dtype: int64

# Formation
The data is formed into variables that are needed to complete the analysis.

In [12]:
basics['titleType']=='movie'
FLMovie = basics['titleType']=='movie'

In [13]:
basics.loc[FLMovie,:]
basics = basics.loc[FLMovie,:]

In [14]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 383743 entries, 8 to 9925010
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          383743 non-null  object
 1   titleType       383743 non-null  object
 2   primaryTitle    383743 non-null  object
 3   originalTitle   383743 non-null  object
 4   isAdult         383743 non-null  object
 5   startYear       377268 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  383743 non-null  object
 8   genres          383743 non-null  object
dtypes: object(9)
memory usage: 29.3+ MB


In [15]:
#year  change to a float- .astype(float)
basics['startYear'].value_counts().astype(float)

2017    14384.0
2018    14346.0
2019    14100.0
2016    13965.0
2015    13479.0
         ...   
1899        1.0
1904        1.0
1897        1.0
1896        1.0
1894        1.0
Name: startYear, Length: 130, dtype: float64

# Focus
The focus of this dataset are movies created beginning in year 2000 to 2022.

In [16]:
basics = basics[(basics['startYear']>='2000') & (basics['startYear']<'2022')]

In [17]:
# Exclude movies that are included in the documentary category.
#Check order in LP
is_documentary = basics['genres'].str.contains('documentary',case=False)
basics = basics[~is_documentary]

In [18]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 138608 entries, 34803 to 9924910
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype 
---  ------          --------------   ----- 
 0   tconst          138608 non-null  object
 1   titleType       138608 non-null  object
 2   primaryTitle    138608 non-null  object
 3   originalTitle   138608 non-null  object
 4   isAdult         138608 non-null  object
 5   startYear       138608 non-null  object
 6   endYear         0 non-null       object
 7   runtimeMinutes  138608 non-null  object
 8   genres          138608 non-null  object
dtypes: object(9)
memory usage: 10.6+ MB


# The Akas Database from the IMDB webside.
A section of this database is shown below to illustrate the names of the columns and rows within the database.

# Value Counts
The value counts of the region is listed below.

In [19]:
akas="https://datasets.imdbws.com/title.akas.tsv.gz"
akas = pd.read_csv(akas, sep='\t', low_memory=False)
akas.head()

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,\N,imdbDisplay,\N,0
1,tt0000001,2,Carmencita,DE,\N,\N,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,\N,imdbDisplay,\N,0
3,tt0000001,4,Καρμενσίτα,GR,\N,imdbDisplay,\N,0
4,tt0000001,5,Карменсита,RU,\N,imdbDisplay,\N,0


In [20]:
akas['region'].value_counts()

DE    4338777
FR    4334263
JP    4332470
IN    4274356
ES    4253972
       ...   
FM          2
TV          1
PW          1
NR          1
NU          1
Name: region, Length: 248, dtype: int64

#  Replace
Those were taken care of and replaced so that the data would be used correctly in the analysis.

In [21]:
akas.replace({'\\N':np.nan})

Unnamed: 0,titleId,ordering,title,region,language,types,attributes,isOriginalTitle
0,tt0000001,1,Карменсіта,UA,,imdbDisplay,,0
1,tt0000001,2,Carmencita,DE,,,literal title,0
2,tt0000001,3,Carmencita - spanyol tánc,HU,,imdbDisplay,,0
3,tt0000001,4,Καρμενσίτα,GR,,imdbDisplay,,0
4,tt0000001,5,Карменсита,RU,,imdbDisplay,,0
...,...,...,...,...,...,...,...,...
36208232,tt9916852,5,Episódio #3.20,PT,pt,,,0
36208233,tt9916852,6,Episodio #3.20,IT,it,,,0
36208234,tt9916852,7,एपिसोड #3.20,IN,hi,,,0
36208235,tt9916856,1,The Wind,DE,,imdbDisplay,,0


# Focus
The focus of this dataset is the region of the database.

In [22]:
akas = akas[(akas['region'] == 'US')]

In [23]:
# Filter the basics table down to only include the US by using the filter akas dataframe
keepers = basics['tconst'].isin(akas['titleId'])
basics =  basics[keepers]

In [24]:
akas['region'].value_counts()

US    1444607
Name: region, dtype: int64

In [25]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1444607 entries, 5 to 36207981
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1444607 non-null  object
 1   ordering         1444607 non-null  int64 
 2   title            1444607 non-null  object
 3   region           1444607 non-null  object
 4   language         1444607 non-null  object
 5   types            1444607 non-null  object
 6   attributes       1444607 non-null  object
 7   isOriginalTitle  1444607 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.2+ MB


# The Ratings Database from the IMDB webside.
A section of this database is shown below to illustrate the names of the columns and rows within the database.

In [26]:
ratings="https://datasets.imdbws.com/title.ratings.tsv.gz"
ratings = pd.read_csv(ratings, sep='\t', low_memory=False)
ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1979
1,tt0000002,5.8,265
2,tt0000003,6.5,1832
3,tt0000004,5.6,179
4,tt0000005,6.2,2621


#  Replace
Those were taken care of and replaced so that the data would be used correctly in the analysis.

In [27]:
ratings.replace({'\\N':np.nan})

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1979
1,tt0000002,5.8,265
2,tt0000003,6.5,1832
3,tt0000004,5.6,179
4,tt0000005,6.2,2621
...,...,...,...
1318731,tt9916730,8.3,10
1318732,tt9916766,7.0,21
1318733,tt9916778,7.2,36
1318734,tt9916840,7.5,7


# Focus
The focus of this dataset is the title ID of the database.

In [28]:
keepers = ratings['tconst'].isin(akas['titleId'])
ratings = ratings[keepers]

# New Table
The following boxes takes all the individual tables and puts them into one table that is used for the analysis.  Because the missing values have already been taken care of above, we know they are not an consideration now.  This dataset shows the movies that are full length movies, fictional movies, movies released from 2000 to 2022, and were released only in the United States.

In [29]:
# example making new folder with os
import os
os.makedirs('Data/',exist_ok=True) 
# Confirm folder created
os.listdir("Data/")

['.ipynb_checkpoints']

In [30]:
['title_basics.csv.gz']

['title_basics.csv.gz']

In [34]:
# Open saved file and preview again
#basics = pd.read_csv("Data/title_basics.csv.gz", low_memory = False)
#basics.head()

In [35]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 501008 entries, 0 to 1318711
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         501008 non-null  object 
 1   averageRating  501008 non-null  float64
 2   numVotes       501008 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB


In [36]:
## Save current dataframe to file.
basics.to_csv("Data/title_basics.csv.gz",compression='gzip',index=False)

In [37]:
basics.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 81762 entries, 0 to 81761
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   tconst          81762 non-null  object 
 1   titleType       81762 non-null  object 
 2   primaryTitle    81762 non-null  object 
 3   originalTitle   81762 non-null  object 
 4   isAdult         81762 non-null  int64  
 5   startYear       81762 non-null  int64  
 6   endYear         0 non-null      float64
 7   runtimeMinutes  81762 non-null  int64  
 8   genres          81762 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 5.6+ MB


In [38]:
## Save current dataframe to file.
akas.to_csv("Data/title_akas.csv.gz",compression='gzip',index=False)

In [39]:
## Save current dataframe to file.
ratings.to_csv("Data/title_ratings.csv.gz",compression='gzip',index=False)

In [40]:
akas.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1444607 entries, 5 to 36207981
Data columns (total 8 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   titleId          1444607 non-null  object
 1   ordering         1444607 non-null  int64 
 2   title            1444607 non-null  object
 3   region           1444607 non-null  object
 4   language         1444607 non-null  object
 5   types            1444607 non-null  object
 6   attributes       1444607 non-null  object
 7   isOriginalTitle  1444607 non-null  object
dtypes: int64(1), object(7)
memory usage: 99.2+ MB


In [41]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 501008 entries, 0 to 1318711
Data columns (total 3 columns):
 #   Column         Non-Null Count   Dtype  
---  ------         --------------   -----  
 0   tconst         501008 non-null  object 
 1   averageRating  501008 non-null  float64
 2   numVotes       501008 non-null  int64  
dtypes: float64(1), int64(1), object(1)
memory usage: 15.3+ MB
