## TMDB - Systematische Evaluierung

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime

In [4]:
# Column names of interest for the evaluation. The raw export holds more
# columns than these (e.g. backdrop_path, homepage, overview); this list is not
# applied while reading -- presumably it is used to subset df in a later step.
columns = [
    "adult", "belongs_to_collection", "budget", "genres", "id",
    "original_language", "release_date", "runtime", "spoken_languages",
    "status", "tagline", "title", "vote_average", "vote_count",
]

# The file is tab-separated despite its .csv extension.
df = pd.read_csv("tmdb_movies.csv", delimiter="\t")
df

Unnamed: 0.1,Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,0,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,79782,tt1684935,pl,...,2010-06-11,0,110,"[{'english_name': 'Czech', 'iso_639_1': 'cs', ...",Released,,Venice,False,7.000,13
1,1,False,,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",,141210,tt2250194,en,...,2012-10-12,0,6,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Sleepover,False,6.600,8
2,2,False,,,0,"[{'id': 18, 'name': 'Drama'}]",http://www.thefarmerswifefilm.co.uk/,143750,tt2140519,en,...,2012-06-20,0,18,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Farmer's Wife,False,10.000,1
3,3,False,,,0,"[{'id': 99, 'name': 'Documentary'}]",,84198,tt1736049,en,...,2012-03-22,0,84,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,One Nation. Underfed.,A Place at the Table,False,6.700,20
4,4,False,,,0,"[{'id': 10749, 'name': 'Romance'}, {'id': 18, ...",,171982,tt2378428,en,...,2012-10-09,0,27,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,Romance,False,6.000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19958,19958,False,/n7kr24jkZBg6EERpJBdKvOjMMdV.jpg,,0,"[{'id': 99, 'name': 'Documentary'}, {'id': 36,...",,646282,tt8132166,es,...,2019-11-08,0,107,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,What is happening in that room?,The Painting,False,8.000,2
19959,19959,False,/4evYVAzIHXSSVFxCQhBgkgj52pH.jpg,,0,"[{'id': 18, 'name': 'Drama'}, {'id': 36, 'name...",,595924,tt10199670,fr,...,2019-09-04,0,132,"[{'english_name': 'French', 'iso_639_1': 'fr',...",Released,,Liberte,False,5.400,22
19960,19960,False,/ekVWMz32hsrRuSLf5KTjg3PvcUa.jpg,,0,"[{'id': 36, 'name': 'History'}]",,622831,tt10551150,zh,...,2019-09-20,15030400,0,"[{'english_name': 'Mandarin', 'iso_639_1': 'zh...",Released,,Mao Zedong 1949,False,5.700,6
19961,19961,False,/3kb5b8IQCX4vd3baNBoZqAboP41.jpg,,0,"[{'id': 18, 'name': 'Drama'}]",,499546,tt6671244,nl,...,2018-07-12,0,100,"[{'english_name': 'Dutch', 'iso_639_1': 'nl', ...",Released,,We,False,5.938,56


In [5]:
# Peek at the free-text plot summaries; "overview" is not one of the names in
# `columns`, so this is a look at the raw data only.
df["overview"]

0        An atmospheric coming-of-age story featuring a...
1        The town of Derry has a secret, but no one tol...
2        As her surroundings are invaded by outsiders, ...
3        Using personal stories, this powerful document...
4        She's as hot as Britney Spears. Hotter. She pa...
                               ...                        
19958    For three and a half centuries, from the same ...
19959    1774, shortly before the French Revolution, so...
19960                                                  NaN
19961    During a scorching summer in a Belgian-Dutch b...
19962    33-years old Tamás Merthner is heartbroken, af...
Name: overview, Length: 19963, dtype: object

### Fix genres, belongs_to_collection, spoken_languages; remove missing data; fix data types

In [21]:
# Restrict the frame to the evaluation columns before inspecting it. Without
# this step a fresh kernel (Restart & Run All) would still carry every raw
# column, and the summary would not show the 14 columns listed in `columns`.
# .copy() yields an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning. Re-running the cell gives the same frame.
df = df[columns].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19963 entries, 0 to 19962
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  19963 non-null  bool   
 1   belongs_to_collection  1886 non-null   object 
 2   budget                 19963 non-null  int64  
 3   genres                 19963 non-null  object 
 4   id                     19963 non-null  int64  
 5   original_language      19963 non-null  object 
 6   release_date           19963 non-null  object 
 7   runtime                19963 non-null  int64  
 8   spoken_languages       19963 non-null  object 
 9   status                 19963 non-null  object 
 10  tagline                8793 non-null   object 
 11  title                  19963 non-null  object 
 12  vote_average           19963 non-null  float64
 13  vote_count             19963 non-null  int64  
dtypes: bool(1), float64(1), int64(4), object(8)
memory usa

In [22]:
# Parse the release_date strings into pandas datetimes so the column can be
# compared against dates in the filtering step.
df["release_date"] = pd.to_datetime(df["release_date"])

In [23]:
# Keep only films released up to and including 2020-12-31, i.e. drop every row
# dated 2021-01-01 or later. Comparing with a strict "<" against the first
# excluded day also keeps any 2020-12-31 value that carries a time of day.
# Rows with a missing date (NaT) compare as False and are dropped as well.
# .copy() yields an independent frame, so later column assignments do not
# trigger SettingWithCopyWarning.
df = df[df.release_date < pd.Timestamp("2021-01-01")].copy()

In [24]:
# Re-inspect the frame after the date conversion and filter: check the
# remaining row count and the dtype now reported for release_date.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 19949 entries, 0 to 19962
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   adult                  19949 non-null  bool          
 1   belongs_to_collection  1885 non-null   object        
 2   budget                 19949 non-null  int64         
 3   genres                 19949 non-null  object        
 4   id                     19949 non-null  int64         
 5   original_language      19949 non-null  object        
 6   release_date           19949 non-null  datetime64[ns]
 7   runtime                19949 non-null  int64         
 8   spoken_languages       19949 non-null  object        
 9   status                 19949 non-null  object        
 10  tagline                8785 non-null   object        
 11  title                  19949 non-null  object        
 12  vote_average           19949 non-null  float64       
 13  v