# Movies_Project.

In [28]:
import pandas as pd
import numpy as np

In [29]:
# Location of the IMDb title.basics dump (gzipped, tab-separated).
# This assignment must be live: the read_csv call that loads `basics` reads
# `t_basics_url`, and with the line commented out a fresh kernel raises NameError.
t_basics_url = "https://datasets.imdbws.com/title.basics.tsv.gz"

In [30]:
# Read the tab-separated title.basics table into a DataFrame.
# low_memory=False turns off chunked parsing, which avoids mixed-type inference.
basics = pd.read_csv(t_basics_url, delimiter="\t", low_memory=False)
basics.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0000001,short,Carmencita,Carmencita,0,1894,\N,1,"Documentary,Short"
1,tt0000002,short,Le clown et ses chiens,Le clown et ses chiens,0,1892,\N,5,"Animation,Short"
2,tt0000003,short,Pauvre Pierrot,Pauvre Pierrot,0,1892,\N,4,"Animation,Comedy,Romance"
3,tt0000004,short,Un bon bock,Un bon bock,0,1892,\N,12,"Animation,Short"
4,tt0000005,short,Blacksmith Scene,Blacksmith Scene,0,1893,\N,1,"Comedy,Short"


Filtering out movies with null values in genre or runtime



In [31]:
# The raw data marks missing values with the literal string "\N";
# swap it for np.nan so pandas' null-handling methods recognise it.
basics = basics.replace(to_replace="\\N", value=np.nan)

Checking the null values

In [32]:
# Count the missing values in each column.
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle           11
originalTitle          11
isAdult                 1
startYear         1182517
endYear           8783997
runtimeMinutes    6482276
genres             404554
dtype: int64

In [33]:
# Discard every row that lacks a genre or a runtime.
basics = basics.dropna(
    axis="index",
    subset=["genres", "runtimeMinutes"],
)

In [34]:
# Re-count missing values: genres and runtimeMinutes should now be 0.
basics.isna().sum()

tconst                  0
titleType               0
primaryTitle            0
originalTitle           0
isAdult                 0
startYear           34155
endYear           2282294
runtimeMinutes          0
genres                  0
dtype: int64

Keeping only full-length movies (titleType = "movie")

In [35]:
# Keep only the rows whose titleType is exactly "movie".
basics = basics.loc[basics["titleType"].eq("movie")]

In [36]:
# Preview the first rows to confirm that only movies remain.
basics.head()

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
570,tt0000574,movie,The Story of the Kelly Gang,The Story of the Kelly Gang,0,1906,,70,"Action,Adventure,Biography"
587,tt0000591,movie,The Prodigal Son,L'enfant prodigue,0,1907,,90,Drama
672,tt0000679,movie,The Fairylogue and Radio-Plays,The Fairylogue and Radio-Plays,0,1908,,120,"Adventure,Fantasy"
1172,tt0001184,movie,Don Juan de Serrallonga,Don Juan de Serrallonga,0,1910,,58,"Adventure,Drama"
1273,tt0001285,movie,The Life of Moses,The Life of Moses,0,1909,,50,"Biography,Drama,Family"


Keeping only movies released between 2000 and 2021 (both years inclusive)

In [50]:
# startYear can still contain NaN here (the former "\N" markers), and NaN cannot
# be cast to int, so a plain astype(int) fails when the notebook is run top to bottom.
# Rows without a release year could never pass the 2000-2021 filter applied afterwards,
# so dropping them first does not change the final result.
# .copy() detaches the frame from the earlier boolean slice so that the column
# assignment below does not raise SettingWithCopyWarning.
basics = basics.dropna(subset=['startYear']).copy()
basics['startYear'] = basics['startYear'].astype(int)

In [38]:
# Preview the boolean mask: True where the release year is 2000 or later.
basics["startYear"].ge(2000)

570        False
587        False
672        False
1172       False
1273       False
           ...  
8873593     True
8873677     True
8873718     True
8873745     True
8873778     True
Name: startYear, Length: 359011, dtype: bool

In [39]:
# Preview the boolean mask: True where the release year is 2021 or earlier.
basics["startYear"].le(2021)

570        True
587        True
672        True
1172       True
1273       True
           ... 
8873593    True
8873677    True
8873718    True
8873745    True
8873778    True
Name: startYear, Length: 359011, dtype: bool

In [40]:
# Keep movies released from 2000 through 2021; between() includes both endpoints.
released_in_range = basics["startYear"].between(2000, 2021)
basics = basics[released_in_range]

In [51]:
# Show the last ten rows to confirm the year filter.
basics.iloc[-10:]

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8872616,tt9914192,movie,No Gogó do Paulinho,No Gogó do Paulinho,0,2020,,98,Comedy
8872657,tt9914286,movie,Sokagin Çocuklari,Sokagin Çocuklari,0,2019,,98,"Drama,Family"
8872913,tt9914828,movie,The War of Godzilla,The War of Godzilla,0,2015,,102,"Action,Comedy,Family"
8872964,tt9914942,movie,Life Without Sara Amat,La vida sense la Sara Amat,0,2019,,74,Drama
8873360,tt9915872,movie,The Last White Witch,My Girlfriend is a Wizard,0,2019,,97,"Comedy,Drama,Fantasy"
8873500,tt9916170,movie,The Rehearsal,O Ensaio,0,2019,,51,Drama
8873509,tt9916190,movie,Safeguard,Safeguard,0,2020,,90,"Action,Adventure,Thriller"
8873548,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020,,84,Thriller
8873593,tt9916362,movie,Coven,Akelarre,0,2020,,92,"Drama,History"
8873677,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019,,123,Drama


**Keeping only fictional movies (excluding the Documentary genre)**

In [42]:
# NOTE: despite its name, this mask is True for rows whose genres string
# contains "Documentary" (case-insensitive); it has to be negated to keep
# the non-documentary rows.
non_documentary = basics["genres"].str.contains(pat="Documentary", case=False)

In [43]:
# The mask flags documentary rows, so invert it to keep everything else.
basics = basics.loc[~non_documentary]

In [44]:
# Show the last rows to confirm the documentary filter.
basics.tail(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
8873500,tt9916170,movie,The Rehearsal,O Ensaio,0,2019.0,,51,Drama
8873509,tt9916190,movie,Safeguard,Safeguard,0,2020.0,,90,"Action,Adventure,Thriller"
8873548,tt9916270,movie,Il talento del calabrone,Il talento del calabrone,0,2020.0,,84,Thriller
8873593,tt9916362,movie,Coven,Akelarre,0,2020.0,,92,"Drama,History"
8873677,tt9916538,movie,Kuambil Lagi Hatiku,Kuambil Lagi Hatiku,0,2019.0,,123,Drama


**Saving the filtered basics dataset temporarily as a .csv file**

In [45]:
# Write the filtered table to disk; index=False leaves the row index out of the file.
basics.to_csv(path_or_buf="Title_Basics.csv", index=False)

In [48]:
# Read the saved file back and preview it to confirm the write succeeded.
df = pd.read_csv(filepath_or_buffer="Title_Basics.csv")
df.head(5)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0035423,movie,Kate & Leopold,Kate & Leopold,0,2001.0,,118,"Comedy,Fantasy,Romance"
1,tt0062336,movie,The Tango of the Widower and Its Distorting Mi...,El Tango del Viudo y Su Espejo Deformante,0,2020.0,,70,Drama
2,tt0069049,movie,The Other Side of the Wind,The Other Side of the Wind,0,2018.0,,122,Drama
3,tt0079644,movie,November 1828,November 1828,0,2001.0,,140,"Drama,War"
4,tt0088751,movie,The Naked Monster,The Naked Monster,0,2005.0,,100,"Comedy,Horror,Sci-Fi"


**The Title_Basics.csv file was successfully saved**