In [1]:
import pandas as pd
import ast
import numpy as np
import json

In [None]:
## Merging TMDB and IMDB datasets (up to Jan 1, 2020)


# Import both datasets.
# `movies` holds the IMDB table; `new_movies` is presumably the TMDB table
# (see the section title) - TODO confirm against new_movie.csv.
movies=pd.read_csv('IMDB movies.csv')
new_movies=pd.read_csv('new_movie.csv')

In [None]:
# Trim both tables down to the columns used downstream, then join them.

# IMDB side: drop unused columns, discard rows with any remaining null, and
# rename the key so that it matches the other table.
# NOTE(review): 'worlwide_gross_income' is kept verbatim; presumably it matches
# the CSV header - confirm before "fixing" the spelling.
movies = (
    movies
    .drop(columns=[
        'usa_gross_income', 'worlwide_gross_income', 'metascore',
        'reviews_from_users', 'reviews_from_critics', 'duration', 'language',
        'date_published', 'budget', 'title', 'writer', 'votes',
        'production_company',
    ])
    .dropna()
    .rename(columns={'imdb_title_id': 'imdb_id'})
)

# Other table: only column pruning is needed.
new_movies = new_movies.drop(columns=['original_title', 'adult', 'overview'])

# Inner join on imdb_id, keeping the first row for each id.
final = (
    pd.merge(new_movies, movies, on='imdb_id', how='inner')
    .drop_duplicates(subset='imdb_id', keep="first")
)

# Persist the merged table.
final.to_csv('movie_data.csv', index=False)

In [2]:
##Preprocessing TMDB data (after Jan 1, 2020)
# Reload the merged table. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk; the truncated warning
# captured under this cell appears to be the mixed-type DtypeWarning that
# chunked inference produces.
final = pd.read_csv('movie_data.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
# Load the TMDB export that the following cells reshape to match `final`.
df=pd.read_csv('tmdb_2020.csv')

In [4]:
# Align the column names with those of the merged data set.

# Drop the unused columns ('title' stays for now because it is used later),
# rename the remaining ones, and add a rating column initialised to 0.
df = (
    df
    .drop(columns=[
        'budget', 'adult', 'homepage', 'production_companies', 'revenue',
        'runtime', 'spoken_languages', 'tagline', 'video', 'vote_count',
    ])
    .rename(columns={
        'id': 'tmdb_id',
        'genres': 'genre',
        'credits.cast': 'actors',
        'credits.crew': 'director',
        'release_date': 'year',
        'production_countries': 'country',
        'overview': 'description',
        'vote_average': 'external_rating',
    })
    .assign(rating=0.)
)

df.head(1)

Unnamed: 0,genre,tmdb_id,imdb_id,original_language,original_title,description,popularity,poster_path,country,year,title,external_rating,actors,director,rating
0,"[{'id': 16, 'name': 'Animation'}]",747527,,en,From My Desert,​Temperature: 21. Humidity: 55 %. Optimum cond...,0.6,/y1EvAdEvto3UkXmJWYf6Pn1QG5.jpg,"[{'iso_3166_1': 'DE', 'name': 'Germany'}]",2019-07-12,From My Desert,0.0,"[{'cast_id': 4, 'character': '', 'credit_id': ...","[{'credit_id': '5f711f36e4b5760036e7eace', 'de...",0.0


In [5]:
#Get genres
def get_genre(inp):
    """Convert a stringified list of genre records into a comma-separated string.

    Parameters
    ----------
    inp : str
        Python-literal text of a list of dicts, e.g.
        "[{'id': 16, 'name': 'Animation'}]".

    Returns
    -------
    str or float
        The non-null ``name`` values joined with ', ', or ``np.nan`` when the
        input is missing (None/NaN), contains no records, or no record has a
        usable name.

    Raises
    ------
    ValueError, SyntaxError
        If ``inp`` is present but is not a valid Python literal.
    """
    # A missing cell arrives as NaN; ast.literal_eval would raise on it, so
    # propagate it as missing, as get_actors/get_director do.
    if inp is None or (isinstance(inp, float) and np.isnan(inp)):
        return np.nan
    records = pd.json_normalize(ast.literal_eval(inp))
    if 'name' not in records:
        return np.nan
    names = records['name'].dropna().tolist()
    # Joining an empty list yields '', which dropna() would not treat as missing.
    return ', '.join(names) if names else np.nan

In [6]:
#Get poster
def get_poster(inp):
    """Build the full TMDB image URL for a poster path.

    Parameters
    ----------
    inp : str or any
        Poster path such as '/abc.jpg'; any non-string value (e.g. NaN for a
        missing path) is treated as "no poster".

    Returns
    -------
    str or None
        The path prefixed with the TMDB image base URL, or ``None`` when
        ``inp`` is not a string.
    """
    # isinstance rather than an exact type() comparison, so that str
    # subclasses (e.g. numpy.str_) are accepted as well.
    if isinstance(inp, str):
        return 'http://image.tmdb.org/t/p/original' + inp
    return None

In [7]:
#Get actors (at most 5)
def get_actors(inp):
    """Extract up to five cast names from a stringified list of cast records.

    Parameters
    ----------
    inp : str
        Python-literal text of a list of dicts carrying at least ``name`` and
        ``cast_id`` keys.

    Returns
    -------
    str or float
        Names of the five records with the lowest ``cast_id`` (null names
        skipped), joined with ', '. ``np.nan`` is returned when the input is
        missing or cannot be parsed, has no ``name``/``cast_id`` field, or
        yields no names at all.
    """
    try:
        cast = pd.json_normalize(ast.literal_eval(inp))
        if 'name' not in cast:
            return np.nan
        cast = cast[cast['name'].notna()]
        names = cast.sort_values('cast_id')['name'].head(5).tolist()
        # Joining an empty list yields '', which dropna() would not treat as missing.
        return ', '.join(names) if names else np.nan
    except Exception:
        # Deliberate best effort: any unparsable row becomes missing.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # still interrupt a long-running apply.
        return np.nan
        

In [8]:
#Get Director

def get_director(inp):
    """Extract the director name(s) from a stringified list of crew records.

    Parameters
    ----------
    inp : str
        Python-literal text of a list of dicts carrying at least ``name`` and
        ``job`` keys.

    Returns
    -------
    str or float
        Names of all records whose ``job`` equals 'Director' (null names
        skipped), joined with ', '. ``np.nan`` is returned when the input is
        missing or cannot be parsed, has no ``name``/``job`` field, or lists
        no named director.
    """
    try:
        crew = pd.json_normalize(ast.literal_eval(inp))
        if 'name' not in crew:
            return np.nan
        directors = crew.loc[crew['job'] == 'Director', 'name'].dropna().tolist()
        # Covers both "no director record" and "director record without a name".
        return ', '.join(directors) if directors else np.nan
    except Exception:
        # Deliberate best effort: any unparsable row becomes missing.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # still interrupt a long-running apply.
        return np.nan

In [9]:
#Get Countries

def get_countries(inp):
    """Convert a stringified list of country records into a comma-separated string.

    Parameters
    ----------
    inp : str
        Python-literal text of a list of dicts, e.g.
        "[{'iso_3166_1': 'DE', 'name': 'Germany'}]".

    Returns
    -------
    str or float
        The non-null ``name`` values joined with ', ', or ``np.nan`` when the
        input is missing (None/NaN), contains no records, or no record has a
        usable name.

    Raises
    ------
    ValueError, SyntaxError
        If ``inp`` is present but is not a valid Python literal.
    """
    # A missing cell arrives as NaN; ast.literal_eval would raise on it, so
    # propagate it as missing, as get_actors/get_director do.
    if inp is None or (isinstance(inp, float) and np.isnan(inp)):
        return np.nan
    records = pd.json_normalize(ast.literal_eval(inp))
    if 'name' not in records:
        return np.nan
    names = records['name'].dropna().tolist()
    # Joining an empty list yields '', which dropna() would not treat as missing.
    return ', '.join(names) if names else np.nan

In [10]:
#Get year
def get_year(inp):
    """Extract the four-digit year from a date string such as '2019-07-12'.

    Parameters
    ----------
    inp : str
        Date text whose first four characters are the year.

    Returns
    -------
    int or float
        The year as an ``int``, or ``np.nan`` when the date is missing
        (None/NaN). A column containing missing dates will therefore come out
        of ``apply`` as float.

    Raises
    ------
    ValueError
        If the first four characters are not an integer.
    TypeError
        If ``inp`` is a non-missing value that cannot be sliced.
    """
    # A missing date arrives as NaN (a float), which cannot be sliced; report
    # it as missing instead of aborting the whole apply.
    if inp is None or (isinstance(inp, float) and np.isnan(inp)):
        return np.nan
    return int(inp[:4])

In [11]:
#Keep original_title for selected languages; use title for all others
def modify_title(lan,original,title):
    """Choose which of the two titles to keep for a movie.

    Returns ``original`` when the language code ``lan`` is one of
    'fr', 'es', 'it', 'ff' or 'tl'; returns ``title`` for every other value.
    """
    keep_original = lan in ['fr','es','it','ff','tl']
    return original if keep_original else title
    
    


In [12]:
# Replace each raw genre record list with its comma-separated names.
df['genre'] = df['genre'].map(get_genre)

In [13]:
# Replace each raw cast record list with its comma-separated actor names.
df['actors'] = df['actors'].map(get_actors)

In [14]:
# Replace each raw country record list with its comma-separated names.
df['country'] = df['country'].map(get_countries)

In [15]:
# Replace each raw crew record list with its director name(s).
df['director'] = df['director'].map(get_director)

In [16]:
# Derive the full poster URL from the relative poster path.
df['poster'] = df['poster_path'].map(get_poster)

In [17]:
# Reduce the release date (renamed to 'year' earlier) to its year.
df['year'] = df['year'].map(get_year)

In [18]:
# Row by row, choose between the original-language title and `title`.
df['original_title'] = df.apply(
    lambda row: modify_title(
        row['original_language'], row['original_title'], row['title']
    ),
    axis=1,
)

In [19]:
# Keep exactly the columns of `final`, in the same order, so that the two
# tables line up column for column.
df = df.loc[:, list(final.columns)]

In [25]:
# Discard rows that lack a genre, a description, or an original title.
df = df.dropna(subset=['genre', 'description', 'original_title'])

In [26]:
# Display (rows, columns) of the processed TMDB frame.
df.shape

(14636, 14)

In [27]:
# Save the processed TMDB rows. index=False matches the other exports in this
# notebook and keeps the row index (which has gaps after the dropna) from
# being written as an extra unnamed column.
df.to_csv('tmdb_2020_processed.csv', index=False)

In [55]:
# Display (rows, columns) of the merged table loaded from movie_data.csv.
final.shape

(77622, 14)

In [None]:
## 3. Convert language codes to names

In [28]:
#Append to movie_data.csv
# The rows are appended without a header, so the column order of `df` has to
# match the existing file.
# WARNING: mode='a' makes this cell non-idempotent - running it again appends
# the same rows a second time.

df.to_csv('movie_data.csv', mode='a',header=False,index=False)

In [29]:
# Reload the combined file. low_memory=False makes pandas infer each column's
# dtype from the whole file instead of chunk by chunk; the truncated warning
# captured under this cell appears to be the mixed-type DtypeWarning that
# chunked inference produces.
df = pd.read_csv('movie_data.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [33]:
#Initialise ratings to 0

# Bracket assignment always targets the column; attribute assignment
# (df.rating = ...) only does so when the column already exists.
df['rating'] = 0.

In [38]:
import os

import tmdbsimple as ts

# Take the TMDB key from the TMDB_API_KEY environment variable so that a real
# key never has to be saved in the notebook; the placeholder remains the
# fallback when the variable is not set.
ts.API_KEY = os.environ.get('TMDB_API_KEY', 'YOUR_API_KEY')

In [47]:
#Get list of language codes and corresponding names

# Build the frame straight from the API response instead of serialising it to
# JSON and parsing it back (passing literal JSON text to pd.read_json is
# deprecated in recent pandas).
# NOTE(review): assumes languages() returns a list of dicts carrying
# 'iso_639_1' and 'english_name', as the lookup below requires - confirm.
lan = pd.DataFrame(ts.Configuration().languages())

#Convert to dictionary (code:english_name)
lan_codes = dict(zip(lan['iso_639_1'], lan['english_name']))

In [54]:
#Convert codes to english names

# map() yields NaN for values that are not keys of lan_codes (missing codes,
# unknown codes, or names already converted by an earlier run); fillna() then
# restores the original value, so the cell neither raises KeyError nor loses
# data when it is re-run.
df['original_language'] = (
    df['original_language'].map(lan_codes).fillna(df['original_language'])
)

In [56]:
#Final movie data shape
# Displays (rows, columns) of the combined frame reloaded from movie_data.csv.

df.shape

(92258, 14)

In [57]:
#Save to disk
# Overwrites movie_data.csv with the current frame; index=False leaves the row
# index out of the file.

df.to_csv('movie_data.csv',index=False)