## TMDB - Systematische Evaluierung

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime

In [2]:
# Read the MovieLens link table (movieId / imdbId / tmdbId).
df_links = pd.read_csv("ml-25m/links.csv")

# Read the tab-separated TMDB export and expose its 'id' column as 'tmdbId'
# so that both frames share the key used for the merge later on.
df_tmdb_movies = pd.read_csv("tmdb_movies.csv", sep="\t").rename(columns={'id': 'tmdbId'})

In [3]:
# Join the MovieLens links with the TMDB metadata on the shared 'tmdbId' key.
# Inner join: only ids that occur in both frames survive.
# NOTE(review): the preview below repeats the same movieId with different
# 'Unnamed: 0' values, so the key is apparently not unique on the TMDB side -- confirm.
df_movies = df_links.merge(df_tmdb_movies, how='inner', on='tmdbId')
df_movies.head()

Unnamed: 0.1,movieId,imdbId,tmdbId,Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,895,105729,79782.0,0,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,2010-06-11,0,110,"[{'english_name': 'Czech', 'iso_639_1': 'cs', ...",Released,,Venice,False,7.0,13
1,895,105729,79782.0,13349,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,2010-06-11,0,110,"[{'english_name': 'Czech', 'iso_639_1': 'cs', ...",Released,,Venice,False,7.0,13
2,181393,1684935,79782.0,0,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,2010-06-11,0,110,"[{'english_name': 'Czech', 'iso_639_1': 'cs', ...",Released,,Venice,False,7.0,13
3,181393,1684935,79782.0,13349,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",,...,2010-06-11,0,110,"[{'english_name': 'Czech', 'iso_639_1': 'cs', ...",Released,,Venice,False,7.0,13
4,1115,114472,141210.0,1,False,,,0,"[{'id': 35, 'name': 'Comedy'}, {'id': 27, 'nam...",,...,2012-10-12,0,6,"[{'english_name': 'English', 'iso_639_1': 'en'...",Released,,The Sleepover,False,6.6,8


In [4]:
# extract data from dictionaries and list, separate key values by '|'

def extract_values(data):
    """Flatten stringified lists of dicts into '|'-separated name strings.

    Each cell is expected to hold the textual form of a Python list of
    dictionaries, e.g. "[{'id': 18, 'name': 'Drama'}]". The 'name' entries are
    joined with '|'; an empty list yields an empty string.

    Parameters
    ----------
    data : pandas.Series
        Column whose cells are stringified lists of dicts with a 'name' key.

    Returns
    -------
    pandas.Series
        Same index as ``data``; one '|'-joined string per cell. Missing cells
        (None / NaN) become an empty string.

    Raises
    ------
    ValueError, SyntaxError
        If a cell is not a plain Python literal.
    KeyError
        If an entry has no 'name' key.
    """
    # Local import keeps the cell self-contained without touching the import cell.
    import ast

    def _join_names(raw):
        # Missing values cannot be parsed; treat them like an empty list.
        if raw is None or (isinstance(raw, float) and raw != raw):
            return ''
        # literal_eval only accepts literals, so text coming from the CSV can
        # never execute code (unlike eval).
        return '|'.join(entry['name'] for entry in ast.literal_eval(raw))

    return data.apply(_join_names)

# Flatten every list-of-dicts column into a '|'-separated string of names.
for _list_col in ("genres", "spoken_languages", "production_companies", "production_countries"):
    df_movies[_list_col] = extract_values(df_movies[_list_col])

# TODO(review): belongs_to_collection is left unprocessed here.

In [5]:
# Display the merged frame to inspect it; the notebook renders a truncated view.
df_movies

Unnamed: 0.1,movieId,imdbId,tmdbId,Unnamed: 0,adult,backdrop_path,belongs_to_collection,budget,genres,homepage,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,895,105729,79782.0,0,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,Drama|Romance,,...,2010-06-11,0,110,Český|Deutsch|Polski|Pусский,Released,,Venice,False,7.000,13
1,895,105729,79782.0,13349,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,Drama|Romance,,...,2010-06-11,0,110,Český|Deutsch|Polski|Pусский,Released,,Venice,False,7.000,13
2,181393,1684935,79782.0,0,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,Drama|Romance,,...,2010-06-11,0,110,Český|Deutsch|Polski|Pусский,Released,,Venice,False,7.000,13
3,181393,1684935,79782.0,13349,False,/s2bpgVhpWODDfoADW78IpMDCMTR.jpg,,1783810,Drama|Romance,,...,2010-06-11,0,110,Český|Deutsch|Polski|Pусский,Released,,Venice,False,7.000,13
4,1115,114472,141210.0,1,False,,,0,Comedy|Horror,,...,2012-10-12,0,6,English,Released,,The Sleepover,False,6.600,8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19974,209143,8132166,646282.0,19958,False,/n7kr24jkZBg6EERpJBdKvOjMMdV.jpg,,0,Documentary|History|Animation,,...,2019-11-08,0,107,English|Français|Italiano|Español,Released,What is happening in that room?,The Painting,False,8.000,2
19975,209145,10199670,595924.0,19959,False,/4evYVAzIHXSSVFxCQhBgkgj52pH.jpg,,0,Drama|History,,...,2019-09-04,0,132,Français|Italiano|Deutsch,Released,,Liberte,False,5.400,22
19976,209151,10551150,622831.0,19960,False,/ekVWMz32hsrRuSLf5KTjg3PvcUa.jpg,,0,History,,...,2019-09-20,15030400,0,普通话,Released,,Mao Zedong 1949,False,5.700,6
19977,209157,6671244,499546.0,19961,False,/3kb5b8IQCX4vd3baNBoZqAboP41.jpg,,0,Drama,,...,2018-07-12,0,100,Nederlands,Released,,We,False,5.938,56


In [6]:
# 'adult' is dropped because there are only 3 adult films, which makes the
# column useless as a feature for our model.
cols = [
    "imdbId",
    "Unnamed: 0",
    "backdrop_path",
    "homepage",
    "poster_path",
    "imdb_id",
    "original_title",
    "video",
    "status",
    "adult"
]

# Drop unused columns. The labels are passed directly: handing drop() the
# sub-frame df_movies[cols] only worked because iterating a DataFrame yields
# its column names, and it built a throwaway copy of those columns first.
df_movies2 = df_movies.drop(columns=cols)

In [7]:
# Keep only the columns listed here, in exactly this order.
col_order = [
    "movieId", "title", "genres", "overview", "tagline",
    "release_date", "vote_average", "vote_count", "popularity", "runtime",
    "original_language", "spoken_languages",
    "production_companies", "production_countries",
    "revenue", "budget", "belongs_to_collection",
]

df_movies2 = df_movies2.loc[:, col_order]
df_movies2

Unnamed: 0,movieId,title,genres,overview,tagline,release_date,vote_average,vote_count,popularity,runtime,original_language,spoken_languages,production_companies,production_countries,revenue,budget,belongs_to_collection
0,895,Venice,Drama|Romance,An atmospheric coming-of-age story featuring a...,,2010-06-11,7.000,13,1.246,110,pl,Český|Deutsch|Polski|Pусский,Akson Studio|ITI Cinema,Poland,0,1783810,
1,895,Venice,Drama|Romance,An atmospheric coming-of-age story featuring a...,,2010-06-11,7.000,13,1.246,110,pl,Český|Deutsch|Polski|Pусский,Akson Studio|ITI Cinema,Poland,0,1783810,
2,181393,Venice,Drama|Romance,An atmospheric coming-of-age story featuring a...,,2010-06-11,7.000,13,1.246,110,pl,Český|Deutsch|Polski|Pусский,Akson Studio|ITI Cinema,Poland,0,1783810,
3,181393,Venice,Drama|Romance,An atmospheric coming-of-age story featuring a...,,2010-06-11,7.000,13,1.246,110,pl,Český|Deutsch|Polski|Pусский,Akson Studio|ITI Cinema,Poland,0,1783810,
4,1115,The Sleepover,Comedy|Horror,"The town of Derry has a secret, but no one tol...",,2012-10-12,6.600,8,0.600,6,en,English,,United States of America,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19974,209143,The Painting,Documentary|History|Animation,"For three and a half centuries, from the same ...",What is happening in that room?,2019-11-08,8.000,2,0.600,107,es,English|Français|Italiano|Español,TVE|TeleMadrid|Mare Films,Spain,0,0,
19975,209145,Liberte,Drama|History,"1774, shortly before the French Revolution, so...",,2019-09-04,5.400,22,3.829,132,fr,Français|Italiano|Deutsch,CNC|Medienboard Berlin-Brandenburg|Idéale Audi...,Germany|Spain|France|Portugal,0,0,
19976,209151,Mao Zedong 1949,History,,,2019-09-20,5.700,6,0.899,0,zh,普通话,博纳影业集团股份有限公司|阿里巴巴影业（北京）,China,15030400,0,
19977,209157,We,Drama,During a scorching summer in a Belgian-Dutch b...,,2018-07-12,5.938,56,6.512,100,nl,Nederlands,New AMS Film Company,Belgium|Netherlands,0,0,


In [8]:
# Keep a single row per movieId (the first occurrence wins).
df2 = df_movies2.drop_duplicates(subset='movieId', keep='first')

In [9]:
# Turn empty strings into NaN so that they are counted as missing values.
df2 = df2.replace(to_replace="", value=np.nan)

In [10]:
# Report how many missing values each column contains.
print(df2.isna().sum(axis=0))

movieId                      0
title                        0
genres                     320
overview                   161
tagline                  11170
release_date                 0
vote_average                 0
vote_count                   0
popularity                   0
runtime                      0
original_language            0
spoken_languages          1131
production_companies      3611
production_countries      1869
revenue                      0
budget                       0
belongs_to_collection    18077
dtype: int64


In [11]:
# Drop rows with a missing overview or missing genres -- these values are very
# important for the recommender model.
# dropna() returns a new frame, so the result is assigned back; without the
# assignment df2 would keep the incomplete rows for every later cell.
df2 = df2.dropna(subset=['overview', 'genres'])
df2

Unnamed: 0,movieId,title,genres,overview,tagline,release_date,vote_average,vote_count,popularity,runtime,original_language,spoken_languages,production_companies,production_countries,revenue,budget,belongs_to_collection
0,895,Venice,Drama|Romance,An atmospheric coming-of-age story featuring a...,,2010-06-11,7.000,13,1.246,110,pl,Český|Deutsch|Polski|Pусский,Akson Studio|ITI Cinema,Poland,0,1783810,
2,181393,Venice,Drama|Romance,An atmospheric coming-of-age story featuring a...,,2010-06-11,7.000,13,1.246,110,pl,Český|Deutsch|Polski|Pусский,Akson Studio|ITI Cinema,Poland,0,1783810,
4,1115,The Sleepover,Comedy|Horror,"The town of Derry has a secret, but no one tol...",,2012-10-12,6.600,8,0.600,6,en,English,,United States of America,0,0,
5,2223,The Farmer's Wife,Drama,"As her surroundings are invaded by outsiders, ...",,2012-06-20,10.000,1,0.600,18,en,English,,,0,0,
6,2679,A Place at the Table,Documentary,"Using personal stories, this powerful document...",One Nation. Underfed.,2012-03-22,6.700,20,2.337,84,en,English,,United States of America,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19973,209133,The Riot and the Dance: Earth,Documentary,"This nature/science documentary, showcases the...",A Cinematic Celebration of Creation,2018-03-19,3.500,2,0.600,110,en,,,,0,0,
19974,209143,The Painting,Documentary|History|Animation,"For three and a half centuries, from the same ...",What is happening in that room?,2019-11-08,8.000,2,0.600,107,es,English|Français|Italiano|Español,TVE|TeleMadrid|Mare Films,Spain,0,0,
19975,209145,Liberte,Drama|History,"1774, shortly before the French Revolution, so...",,2019-09-04,5.400,22,3.829,132,fr,Français|Italiano|Deutsch,CNC|Medienboard Berlin-Brandenburg|Idéale Audi...,Germany|Spain|France|Portugal,0,0,
19977,209157,We,Drama,During a scorching summer in a Belgian-Dutch b...,,2018-07-12,5.938,56,6.512,100,nl,Nederlands,New AMS Film Company,Belgium|Netherlands,0,0,


Der Dataframe ist nun vollständig bereinigt worden. 

## Testing

In [14]:
# we deleted movies without overview (160 movies). correct?
# delete movies without genres (320 movies). correct?
# tagline more relevant than overview?
# 300+ movies with 0 runtime
# amount of ratings --> drop movies with under 10 vote counts?? (4000 movies)
# Venice movie duplicate --> why is it here?? are there other duplicates?
# many movies with 0 revenue and 0 budget
# movie cast??
# read ratings.csv from GitHub --> git lfs

In [12]:
# Only movies that have at least 10 votes.
df2[df2["vote_count"].ge(10)]

Unnamed: 0,movieId,title,genres,overview,tagline,release_date,vote_average,vote_count,popularity,runtime,original_language,spoken_languages,production_companies,production_countries,revenue,budget,belongs_to_collection
0,895,Venice,Drama|Romance,An atmospheric coming-of-age story featuring a...,,2010-06-11,7.000,13,1.246,110,pl,Český|Deutsch|Polski|Pусский,Akson Studio|ITI Cinema,Poland,0,1783810,
2,181393,Venice,Drama|Romance,An atmospheric coming-of-age story featuring a...,,2010-06-11,7.000,13,1.246,110,pl,Český|Deutsch|Polski|Pусский,Akson Studio|ITI Cinema,Poland,0,1783810,
6,2679,A Place at the Table,Documentary,"Using personal stories, this powerful document...",One Nation. Underfed.,2012-03-22,6.700,20,2.337,84,en,English,,United States of America,0,0,
8,103208,A Place at the Table,Documentary,"Using personal stories, this powerful document...",One Nation. Underfed.,2012-03-22,6.700,20,2.337,84,en,English,,United States of America,0,0,
12,4484,"Camille Claudel, 1915",Drama,"Winter, 1915. Confined by her family to an asy...",,2013-03-13,6.200,58,0.600,95,fr,Français,ARTE France Cinéma|Canal+|3B Productions|Picta...,France,115860,3512454,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19968,209085,The Mistletoe Secret,Romance|TV Movie,Aria Eubank convinces a famous travel writer t...,,2019-11-10,6.722,18,2.336,80,en,English,,United States of America,0,0,
19969,209119,Up to the World,Comedy|Drama,Davide and Loris are brothers and live in a ve...,,2014-05-01,5.200,24,1.527,108,it,Italiano,,,0,0,
19975,209145,Liberte,Drama|History,"1774, shortly before the French Revolution, so...",,2019-09-04,5.400,22,3.829,132,fr,Français|Italiano|Deutsch,CNC|Medienboard Berlin-Brandenburg|Idéale Audi...,Germany|Spain|France|Portugal,0,0,
19977,209157,We,Drama,During a scorching summer in a Belgian-Dutch b...,,2018-07-12,5.938,56,6.512,100,nl,Nederlands,New AMS Film Company,Belgium|Netherlands,0,0,


In [13]:
# Lowest and highest popularity, one value per line.
print(df2["popularity"].min(), df2["popularity"].max(), sep="\n")

# Movies with a popularity below 1.
df2[df2["popularity"].lt(1)]

0.6
501.133


Unnamed: 0,movieId,title,genres,overview,tagline,release_date,vote_average,vote_count,popularity,runtime,original_language,spoken_languages,production_companies,production_countries,revenue,budget,belongs_to_collection
4,1115,The Sleepover,Comedy|Horror,"The town of Derry has a secret, but no one tol...",,2012-10-12,6.6,8,0.600,6,en,English,,United States of America,0,0,
5,2223,The Farmer's Wife,Drama,"As her surroundings are invaded by outsiders, ...",,2012-06-20,10.0,1,0.600,18,en,English,,,0,0,
12,4484,"Camille Claudel, 1915",Drama,"Winter, 1915. Confined by her family to an asy...",,2013-03-13,6.2,58,0.600,95,fr,Français,ARTE France Cinéma|Canal+|3B Productions|Picta...,France,115860,3512454,
14,112062,"Camille Claudel, 1915",Drama,"Winter, 1915. Confined by her family to an asy...",,2013-03-13,6.2,58,0.600,95,fr,Français,ARTE France Cinéma|Canal+|3B Productions|Picta...,France,115860,3512454,
22,47962,The Scar,Drama,The Scar draws an intense psychological suspen...,,2013-03-03,5.5,2,0.600,90,fr,Français,Cicatrice Film inc,Canada,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19971,209129,Destination Titan,Documentary,A story about four British scientists who have...,,2011-04-10,7.0,1,0.600,60,en,,,,0,0,
19972,209131,Last Days of the Arctic,Documentary,"The Icelandic photographer Ragnar Alexsson, a....",,2011-08-17,5.5,3,0.600,90,en,Dansk|English|Íslenska,ARTE|BBC|NDR|Gebrueder Beetz Filmproduktion|IT...,Iceland,0,0,
19973,209133,The Riot and the Dance: Earth,Documentary,"This nature/science documentary, showcases the...",A Cinematic Celebration of Creation,2018-03-19,3.5,2,0.600,110,en,,,,0,0,
19974,209143,The Painting,Documentary|History|Animation,"For three and a half centuries, from the same ...",What is happening in that room?,2019-11-08,8.0,2,0.600,107,es,English|Français|Italiano|Español,TVE|TeleMadrid|Mare Films,Spain,0,0,
