In [1]:
# basic python data handling analysis modules
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
import pickle, os, gc, re
# small utility functions
from utility import *

# interactive jupyter widgets!
# https://towardsdatascience.com/interactive-controls-for-jupyter-notebooks-f5c94829aee6
import ipywidgets as widgets
from ipywidgets import interact, interact_manual

%matplotlib inline

In [2]:
Dataset_directory = ".."+os.sep+"Datasets"+os.sep

In [3]:
## (1) PERSONALITY DATASET: PERS
## (2) SERENDIPTY DATASET: SER
## (3) LEARNING DATASET: LEARN
## (4) HETREC DATASET: HETREC
## (5) ML LATEST DATASET: ML

## (6) the-numbers.com DATASET: NUM

## (7) themoviedb.org DATASET: TMB (yes it's missing a D, but I'm not changing my scripts now!)


In [4]:
## (1) PERSONALITY DATASET

# Dataset Citation: Nguyen, T.T., Maxwell Harper, F., Terveen, L. et al. Inf Syst Front (2018) 20: 1173.
# https://doi.org/10.1007/s10796-017-9782-y

# The personality-data.csv file contains the data about the personalities and the
# movie preferences of 1834 users.
# The ratings.csv file contains the ratings of users in the personality-data.csv
# file contributed.
personality_dataset = "personality-isf2018"

# userid,openness,agreeableness,emotional_stability,conscientiousness,extraversion,assigned metric,
# assigned condition,movie_1,predicted_rating_1,movie_2,predicted_rating_2,movie_3,predicted_rating_3,movie_4,
# predicted_rating_4,movie_5,predicted_rating_5,movie_6,predicted_rating_6,movie_7,predicted_rating_7,movie_8,
# predicted_rating_8,movie_9,predicted_rating_9,movie_10,predicted_rating_10,movie_11,predicted_rating_11,
# movie_12,predicted_rating_12,is_personalized,enjoy_watching (1834, 34)
PERS_personality_data = pd.read_csv(Dataset_directory+ personality_dataset +os.sep+"personality-data.csv")
# Userid: the hashed user_id.
# Openness: an assessment score (from 1 to 7) assessing user tendency to prefer new experience. 1 means the user has tendency NOT to prefer new experience, 7 means the user has tendency to prefer new experience.
# Agreeableness: an assessment score (from 1 to 7) assessing user tendency to be compassionate and cooperative rather than suspicious and antagonistic towards others. 1 means the user has tendency to NOT be compassionate and cooperative. 7 means the user has tendency to be compassionate and cooperative.
# Emotional Stability: an assessment score (from 1 to 7) assessing user tendency to have psychological stress. 1 means the user has tendency to have psychological stress, and 7 means the user has tendency to NOT have psychological stress.
# Conscientiousness: an assessment score (from 1 to 7) assessing user tendency to be organized and dependable, and show self-discipline. 1 means the user does not have such a tendency, and 7 means the user has such tendency.
# Extraversion: an assessment score (from 1 to 7) assessing user tendency to be outgoing. 1 means the user does not have such a tendency, and 7 means the user has such a tendency.

# Assigned Metric: one of the follows (serendipity, popularity, diversity, default). Each user, besides being assessed their personality, was evaluated their preferences for a list of 12 movies manipulated with serendipity, popularity, diversity value or none (default option).
# Assigned Condition: one of the follows (high, medium, low). Based on the assigned metric, and this assigned condition, the list of movies was generated for the users. For example: if the assigned metric is serendipity and the assigned condition is high, the movies in the list are highly serendipitous. We document how we manipulated the movie list based on the assigned condition and assigned metric in page 6 of our research paper mentioned above.
# Movie_x (x is from 1 to 12): The list consists of 12 movies. These fields contain the ids of the twelve movies in the list.
# Predicted_rating_x (x is from 1 to 12): the predicted rating of the corresponding movie_x for the user.
# Is_Personalized:  The response of the user to the question `This list is personalized for me`. Users answered on the 5-point Likert scale. (1: Strongly Disagree, 5: Strongly Agree).
# Enjoy_watching: The response of the user to the question `This list contains movies I think I enjoyed watching`. Users answered on the 5-point Likert scale. (1: Strongly Disagree, 5: Strongly Agree)

# userid,movieId,rating,tstamp (1028751, 4)
PERS_ratings = pd.read_csv(Dataset_directory+ personality_dataset +os.sep+"ratings.csv")
# userId: the hashed user_id.
# movieId: the id of the movie that the user (corresponding to userId) rated.
# rating: the rating (from 0.5 to 5 stars) provided by the user.
# tstamp: when the user rated the movie.


In [5]:
# remove random spaces in column names!
PERS_personality_data.columns = [x.strip() for x in PERS_personality_data.columns]

# remove random spaces and deal with typo in userid
PERS_ratings.columns = ['userid', 'movieId', 'rating', 'tstamp']

In [6]:
# PERS_ratings -> PERS_movie_ratings
PERS_movie_ratings = PERS_ratings[["movieId","rating"]]\
                        .groupby("movieId").apply(lambda x: [x["rating"].count(), x["rating"].mean(), x["rating"].std()])\
                        .apply(pd.Series)
PERS_movie_ratings.columns = ["ratings_n","ratings_mean","ratings_std"]


In [7]:
big5 = ['openness', 'agreeableness', 'emotional_stability',
       'conscientiousness', 'extraversion']

def get_big5_corr_test(df):
    corr_fn = corr_simple_pearsonr
    list_of_outputs = {}
    if df.shape[0]>1:
        for pers in big5:
            (list_of_outputs[pers+"_r"], list_of_outputs[pers+"_p"], list_of_outputs[pers+"_n"]) = corr_fn( df["rating"] , df[pers] )
            list_of_outputs[pers+"_mean"]  = df[pers].mean()
            list_of_outputs[pers+"_std"]  = df[pers].std()
    else:
        for pers in big5:
            list_of_outputs[pers+"_r"] = np.nan
            list_of_outputs[pers+"_p"] = np.nan
            list_of_outputs[pers+"_n"] = 1
            (list_of_outputs[pers+"_r"], list_of_outputs[pers+"_p"], list_of_outputs[pers+"_n"]) = corr_fn( df["rating"] , df[pers] )
            list_of_outputs[pers+"_mean"]  = df[pers].mean()
            list_of_outputs[pers+"_std"]  = df[pers].std()        

    return list_of_outputs

# 35196 movieIds
# 11092 movies with only 1 rating
# 20311 movies with only <5 ratings

In [8]:
# average big5 rating for each user (they sometimes were sampled more than once)
# we're implicitly assuming these are static with noise
PERS_userid_by_big5 = PERS_personality_data[['userid']+big5].groupby('userid').mean()

# get ratings and add mean-big5 values by the userid
PERS_ratings_and_big5 = PERS_ratings[["movieId","rating"]].copy()
PERS_ratings_and_big5[big5] = PERS_userid_by_big5.loc[ PERS_ratings["userid"] ].reset_index()[big5]

In [9]:
%%time
# ~13 mins
df = PERS_ratings_and_big5.groupby('movieId').apply(lambda x: get_big5_corr_test(x) ).apply(pd.Series)

n_variables = [x for x in df.columns if "_n" in x]
df["N"] = df[n_variables[0]]
df.drop(n_variables, axis=1, inplace=True)
df["N"] = df["N"].astype('int')


  r = r_num / r_den


Wall time: 7min 57s


In [10]:
# OUTPUT: PERS_ratings_and_big5

# <timestamp on ratings/big5 data>
# <big5 experiment data - the list they assigned, their predicted ratings, user responses>
# we have ~1million ratings for those users ... but maybe ~180,000 are movies on lists they were given as part of the experiment
# maybe we should check/separate those?

PERS_ratings_and_big5 = df.copy()
PERS_ratings_and_big5[PERS_movie_ratings.columns] = PERS_movie_ratings
PERS_ratings_and_big5.drop("N",axis=1,inplace=True)
PERS_ratings_and_big5["movieId"] = PERS_ratings_and_big5.index
PERS_ratings_and_big5.to_csv(Dataset_directory+"Processed"+os.sep+"PERS_ratings_and_big5.csv")

In [11]:
# del PERS_movie_ratings,PERS_ratings

In [12]:
####################################################################################################################
#------------------------------------------------------------------------------------------------------------------#
####################################################################################################################

In [13]:
## (2) SERENDIPTY DATASET: SER

## (6) serendipity-sac2018
# https://grouplens.org/datasets/serendipity-2018/

dataset = "serendipity-sac2018"

# userId,movieId (3840, 2)
SER_recommendations = pd.read_csv(Dataset_directory+ dataset +os.sep+"recommendations.csv")

# userId,movieId,rating,timestamp,predictedRating,s1,s2,s3,s4,s5,s6,s7,s8,q,
# s_ser_rel,s_ser_find,s_ser_imp,s_ser_rec,m_ser_rel,m_ser_find,m_ser_imp,m_ser_rec (3840, 2)
# survey likert questions about 'serendipitous' (generally unpopular but they like) movies
# and then some inferred binary variables based on those answers
SER_answers = pd.read_csv(Dataset_directory+ dataset +os.sep+"answers.csv")

# ORIGINAL CSV HAS PROBLEMS (don't story comma separated list within "" ... and then *also* use "" for Jed "nickname" Jones)
# it's only 5-6 entries - MANUAL FIX is fine
# movieId,title,releaseDate,directedBy,starring,imdbId,tmdbId,genres (49174, 8)
# -> movieId,title,releaseDate,imdbId,tmdbId (good for identifying)
# genres -> comma separated string list
# starring -> comma separated string list
# directedBy -> comma separated string list
SER_movies = pd.read_csv(Dataset_directory+ dataset +os.sep+"movies.csv") # movieId is unique

# userId,movieId,tag,timestamp (628157, 4)
SER_tags = pd.read_csv(Dataset_directory+ dataset +os.sep+"tags.csv")

# userId,movieId,rating,timestamp (9997850, 4)
SER_training = pd.read_csv(Dataset_directory+ dataset +os.sep+"training.csv")

# movieId,tag,relevance (12413640, 3)
SER_tag_genome = pd.read_csv(Dataset_directory+ dataset +os.sep+"tag_genome.csv")

# pick the most relevant tags/tags above certain relevance
# SER_tag_genome[ SER_tag_genome["relevance"]>.5 ].groupby('movieId').count()["tag"].hist(bins=100)

SER_answers_by_movieId = SER_answers.groupby("movieId").mean().drop(["userId","timestamp"],axis=1)
SER_answers_by_movieId["n"] = SER_answers.groupby('movieId').count()["userId"]

SER_movie_ratings = SER_training[["movieId","rating"]].groupby("movieId").apply(lambda x: [x["rating"].count(), x["rating"].mean(), x["rating"].std()])
SER_movie_ratings = SER_movie_ratings.apply(pd.Series)
SER_movie_ratings.columns = ["ratings_n","ratings_mean","ratings_std"]
SER_movie_ratings["ratings_n"] = SER_movie_ratings["ratings_n"].astype('int')


In [14]:
num_directors = SER_movies["directedBy"].apply(lambda x: len(x.split(",")) if pd.notnull(x) else np.nan)
director_list = SER_movies["directedBy"].apply(lambda x: x.split(",") if pd.notnull(x) else np.nan)
# (0)1-30 directors, mode=1

SER_movies["director_list"] = director_list ## worth it for memory now, maybe shift to sparse later
SER_movies["num_directors"] = num_directors

num_genres = SER_movies["genres"].apply(lambda x: len(x.split(",")) if pd.notnull(x) else np.nan)
genres_list = SER_movies["genres"].apply(lambda x: x.split(",") if pd.notnull(x) else np.nan)
# (0)1-10 genres, mode =1

SER_movies["genres_list"] = genres_list
SER_movies["num_genres"] = num_genres

num_starring = SER_movies["starring"].apply(lambda x: len(x.split(",")) if pd.notnull(x) else np.nan)
starring_list = SER_movies["starring"].apply(lambda x: x.split(",") if pd.notnull(x) else np.nan)
# (0)1-70 starring, mode=5

SER_movies["starring_list"] = starring_list
SER_movies["num_starring"] = num_starring

# titles are form title (year) and not unique

# releaseDates = pd.to_datetime(SER_movies["releaseDate"]).apply(lambda x: x.date())
# throws error - we'll have to decide where we do all the date/release year processing

In [15]:
SER_genre_set = list(set(flatten(SER_movies["genres_list"].dropna() ) ) )

# INT_df["SER_genres_list"] = INT_df["SER_genres_list"].apply(lambda x: eval(x) if pd.notnull(x) else np.nan)
flat_list = [item for sublist in SER_movies["genres_list"].dropna().values for item in sublist]
genres = pd.DataFrame(flat_list).drop_duplicates()#.sort_values(by="movieId")

for gen_ind in genres.index:
    gen_name = "genres__"+genres.loc[gen_ind,0]
    SER_movies[gen_name] = SER_movies["genres_list"].apply(lambda x: genres.loc[gen_ind,0] in x if isinstance(x,list) or pd.notnull(x) else np.nan)

SER_movies.drop("genres_list",axis=1,inplace=True)

SER_genre_vars = search(SER_movies,"genres__[A-Z]",case_sensitive=True).index
SER_movies[SER_genre_vars] = SER_movies[SER_genre_vars].replace(True,1).replace(False,0).astype('float32') 

In [16]:
SER_movies.loc[SER_movies["movieId"]==8688,"releaseDate"]   = "1968-12-13" # Shalako 13 December 1968
SER_movies.loc[SER_movies["movieId"]==33471,"releaseDate"]  = "2002-04-05" # https://www.imdb.com/title/tt0292553/?ref_=fn_al_tt_1
SER_movies.loc[SER_movies["movieId"]==46121,"releaseDate"]  = "2001-06-01" # https://www.imdb.com/title/tt0135024/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==47991,"releaseDate"]  = "2005-06-09" # https://www.imdb.com/title/tt0383304/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==42941,"releaseDate"]  = "1974-08-08" # https://www.imdb.com/title/tt0071840/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==59300,"releaseDate"]  = "1986-09-12" # https://www.imdb.com/title/tt0090678/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==55631,"releaseDate"]  = "2006-03-17" # https://www.imdb.com/title/tt0443446/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==60034,"releaseDate"]  = "2007-01-22" # https://www.imdb.com/title/tt0893331/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==97703,"releaseDate"]  = "2012-05-07" # https://www.imdb.com/title/tt2209386/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==105159,"releaseDate"] = "2012-07-13" # https://www.imdb.com/title/tt1859446/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==105453,"releaseDate"] = "2013-09-19" # https://www.imdb.com/title/tt2717558/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==105855,"releaseDate"] = "2013-10-11" # https://www.imdb.com/title/tt2286990/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==109455,"releaseDate"] = "2014-02-21" # https://www.imdb.com/title/tt2980794/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==57486,"releaseDate"]  = "1970-10-23" # https://www.imdb.com/title/tt0066578/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==53502,"releaseDate"]  = "1948-04-27" # https://www.imdb.com/title/tt0040098/releaseinfo?ref_=tt_dt_dt
SER_movies.loc[SER_movies["movieId"]==60049,"releaseDate"]  = "2007-05-30" # https://www.imdb.com/title/tt0832937/releaseinfo?ref_=tt_dt_dt

SER_movies["releaseDate"] = SER_movies["releaseDate"].replace("0000-00-00",np.nan)
## another problem - whole load of SER_releaseDates *just happen* to be the UNIX start date -> statistically improbable
# (yes, I checked whether it happened to be the data of a film festival - it was not!)
# also, more on one single date by an order of magnitude
SER_movies["releaseDate"] = SER_movies["releaseDate"].replace("1969-12-31",np.nan)

In [17]:
# OUTPUT: SER_movies

SER_movies.index = SER_movies["movieId"]
SER_movies[SER_movie_ratings.columns] = SER_movie_ratings
SER_movies[SER_answers_by_movieId.columns] = SER_answers_by_movieId


# tags/tag genome/recommendations - we don't use yet
# we only take the 'answers' df because it's really easily to output a per movie average
# not finished - some cleaning/prep still required

SER_movies.drop(["directedBy","starring","genres"],axis=1,inplace=True)
SER_movies.to_csv(Dataset_directory+"Processed"+os.sep+"SER_movies"+".csv")

# SER_answers_by_movieId.to_csv(Dataset_directory+"Processed"+os.sep+"SER_answers_by_movieId"+".csv")
# SER_movie_ratings.to_csv(Dataset_directory+"Processed"+os.sep+"SER_movie_ratings"+".csv")

In [18]:
# del SER_movie_ratings,SER_answers_by_movieId, SER_tag_genome, SER_training, person_ids, SER_tags,

In [19]:
####################################################################################################################
#------------------------------------------------------------------------------------------------------------------#
####################################################################################################################

In [20]:
## (3) LEARNING DATASET: LEARN

# https://grouplens.org/datasets/learning-from-sets-of-items-2019/
# For this paper: https://www.thinkmind.org/download.php?articleid=eknow_2017_4_10_68011

dataset = "learning-from-sets-2019"

# userId,movieId_1,movieId_2,movieId_3,movieId_4,movieId_5,rating,timestamp (29516, 8)
LEARN_set_ratings = pd.read_csv(Dataset_directory+ dataset +os.sep+"set_ratings.csv")

# userId,movieId,rating,timestamp (458970, 4)
LEARN_item_ratings = pd.read_csv(Dataset_directory+ dataset +os.sep+"item_ratings.csv")


LEARN_movie_ratings = LEARN_item_ratings[["movieId","rating"]].groupby("movieId").apply(lambda x: [x["rating"].count(), x["rating"].mean(), x["rating"].std()])
LEARN_movie_ratings = LEARN_movie_ratings.apply(pd.Series)
LEARN_movie_ratings.columns = ["n","ratings_mean","ratings_std"]


In [21]:
# OUTPUT: LEARN_movie_ratings

# we're not doing anything with the (ratings of sets of movies) set data here, just the accompanying item (one movie-at-a-time data)

LEARN_movie_ratings["n"] = LEARN_movie_ratings["n"].astype('int')
LEARN_movie_ratings.to_csv(Dataset_directory+"Processed"+os.sep+"LEARN_movie_ratings"+".csv")

In [22]:
# del LEARN_movie_ratings

In [23]:
####################################################################################################################
#------------------------------------------------------------------------------------------------------------------#
####################################################################################################################

In [24]:
## (4) HETREC DATASET: HETREC
# http://files.grouplens.org/datasets/hetrec2011/hetrec2011-movielens-readme.txt
# MovieLens + IMDb/Rotten Tomatoes

# Some restrictions on commercial use -> Have sent them a begging letter!

dataset = "hetrec2011-movielens-2k-v2"

# This file contains the main actores and actresses of the movies.
# A ranking is given to the actors of each movie according to the order in which 
# they appear on the movie IMDb cast web page. (same as "order" in tmdb)
# movieID,actorID,actorName,ranking (231742, 4)
# movieID *not* unique
HETREC_movie_actors = pd.read_csv(Dataset_directory+ dataset +os.sep+"movie_actors.dat", sep=r'\t', engine='python')

# movieID,country (10197, 2)
# movieID unique, country only 71 different options, but don't look like iso standard! ("palestinian occupied territories")
HETREC_movie_countries = pd.read_csv(Dataset_directory+ dataset +os.sep+"movie_countries.dat", sep=r'\t', engine='python')

# movieID,directorID,directorName (10155, 3)
# movieID unique, directorName mostly refers to a *single director*
# Some weird values = Director Ridley Scott, Daniel Davis [Edward D. Wood Jr.], Grigori Chukhraj & Valentin Yezhov
HETREC_movie_directors = pd.read_csv(Dataset_directory+ dataset +os.sep+"movie_directors.dat", sep=r'\t', engine='python')

# movieID,genre (20809, 2)
# movieID *not* unique, 20 different genres (some pretty empty short=1, IMAX=25)
HETREC_movie_genres = pd.read_csv(Dataset_directory+ dataset +os.sep+"movie_genres.dat", sep=r'\t', engine='python')

# movieID,location1,location2,location3,location4 (49167, 5)
# movieID *not* unique, location1-4 operate like address ... a lot of the time location1 is a country
# but sometimes it's a ship! or "Israeli-Jordanian Border"
# Lot of work to get anything more consistent than "number of locations"
HETREC_movie_locations = pd.read_csv(Dataset_directory+ dataset +os.sep+"movie_locations.dat", sep=r'\t', engine='python')

# movieID,tagID,tagWeight (51795, 3)
# powerlaw distribution with tagWeight -could use to really thin down
# movieID *not* unique
HETREC_movie_tags = pd.read_csv(Dataset_directory+ dataset +os.sep+"movie_tags.dat", sep=r'\t', engine='python')

# id,title,imdbID,spanishTitle,imdbPictureURL,year,rtID,rtAllCriticsRating,rtAllCriticsNumReviews,
# rtAllCriticsNumFresh,rtAllCriticsNumRotten,rtAllCriticsScore,rtTopCriticsRating,rtTopCriticsNumReviews,
# rtTopCriticsNumFresh,rtTopCriticsNumRotten,rtTopCriticsScore,rtAudienceRating,rtAudienceNumRatings,
# rtAudienceScore,rtPictureURL (10197, 21)
HETREC_movies = pd.read_csv(Dataset_directory+ dataset +os.sep+"movies.dat", sep=r'\t', engine='python')

# id,value (13222, 2)
# id -> content for all tags (1 = "earth")
HETREC_tags = pd.read_csv(Dataset_directory+ dataset +os.sep+"tags.dat", sep=r'\t', engine='python')

# userID,movieID,rating,date_day,date_month,date_year,date_hour,date_minute,date_second (855598, 9)
# full time stamp for each individual rating, broken down into separate columns
HETREC_user_ratedmovies = pd.read_csv(Dataset_directory+ dataset +os.sep+"user_ratedmovies.dat", sep=r'\t', engine='python')

# userID,movieID,rating,timestamp (855598, 4)
# same but with timestamp object
HETREC_user_ratedmovies_timestamps = pd.read_csv(Dataset_directory+ dataset +os.sep+"user_ratedmovies-timestamps.dat", sep=r'\t', engine='python')

# userID,movieID,tagID,date_day,date_month,date_year,date_hour,date_minute,date_second (47957, 9)
# full time stamp for each individual tagging, broken down into separate columns
HETREC_user_taggedmovies = pd.read_csv(Dataset_directory+ dataset +os.sep+"user_taggedmovies.dat", sep=r'\t', engine='python')

# userID,movieID,tagID,timestamp (47957, 4)
# same but with timestamp object
HETREC_user_taggedmovies_timestamps = pd.read_csv(Dataset_directory+ dataset +os.sep+"user_taggedmovies-timestamps.dat", sep=r'\t', engine='python')



In [25]:
# HETREC_user_ratedmovies_timestamps.groupby("movieId")
HETREC_user_ratedmovies_timestamps = HETREC_user_ratedmovies_timestamps.rename(columns = {'movieID':'movieId'})
HETREC_movie_ratings = HETREC_user_ratedmovies_timestamps[["movieId","rating"]].groupby("movieId").apply(lambda x: [x["rating"].count(), x["rating"].mean(), x["rating"].std()])
# ML_movie_ratings = ML_ratings[["movieId","rating"]].groupby("movieId").apply(lambda x: [x["rating"].count(), x["rating"].mean(), x["rating"].std()])
HETREC_movie_ratings = HETREC_movie_ratings.apply(pd.Series)
HETREC_movie_ratings.columns = ["ratings_n","ratings_mean","ratings_std"]
HETREC_movie_ratings['ratings_n'] = HETREC_movie_ratings['ratings_n'].astype('int')

In [26]:
# harmonise imdbId column to match all others
HETREC_movies = HETREC_movies.rename(columns = {'imdbID':'imdbId'})

# harmonise id columns to movieID and set index equal to that
HETREC_movies = HETREC_movies.rename(columns = {'id':'movieId'})
HETREC_movies.index = HETREC_movies["movieId"]

# add country data to HETREC_movies
HETREC_movie_countries = HETREC_movie_countries.rename(columns = {'movieID':'movieId'})
HETREC_movie_countries.index = HETREC_movie_countries["movieId"]
# not standard country set!!!
HETREC_movies["country"] = HETREC_movie_countries["country"].astype('category')


# add director data to HETREC_movies
HETREC_movie_directors = HETREC_movie_directors.rename(columns = {'movieID':'movieId'})
HETREC_movie_directors.index = HETREC_movie_directors["movieId"]
HETREC_movies[["directorID","directorName"]] = HETREC_movie_directors[["directorID","directorName"]].astype('category')
# just worth turning these into categories

# add movie_ratings to HETREC_movies
HETREC_movies[HETREC_movie_ratings.columns] = HETREC_movie_ratings

# imdbPictureURL 181 nulls
# rtID 311 nulls
# id unique
# imdbID -> not unique
# rtID -> not unique

In [27]:
numerical_cols = ['rtAllCriticsRating', 'rtAllCriticsNumReviews',
       'rtAllCriticsNumFresh', 'rtAllCriticsNumRotten', 'rtAllCriticsScore',
       'rtTopCriticsRating', 'rtTopCriticsNumReviews', 'rtTopCriticsNumFresh',
       'rtTopCriticsNumRotten', 'rtTopCriticsScore', 'rtAudienceRating',
       'rtAudienceNumRatings', 'rtAudienceScore']
HETREC_movies[numerical_cols] = HETREC_movies[numerical_cols].replace("\\N",np.nan).apply(lambda x: pd.to_numeric(x))

# "\\N" appears to be the code for nan (maybe in same 230??)

# imdbPictureURL -> full url for jpg, not unique
# rtPictureURL -> \\N for 230, post_default for 57, otherwise similar non-unique pattern

# rtId -> string id, similar-but-not-identical pattern of not-uniqueness
# rtAllCriticsRating -> 0 is very common, \\N also present (0.0-9.5999)
# rtAllCriticsNumReviews -> integers
# rtAllCriticsNumFresh -> integers how many gave a "fresh" rating
# rtAllCriticsNumRotten -> integers how many gave a "rotten" rating (skewed v against!)
# rtTopCriticsScore -> integer score 0-100 (lot of 0s!)
# rtAudienceRating -> float 0.0-5.0 (lot of 0s!)
# rtAudienceNumRatings-> integers lotof 0s!
# rtAudienceScore -> integeers 0-100, lot of 0s!
# HETREC_movies["rtAudienceNumRatings"].value_counts()
# HETREC_movies["rtAudienceRating"].replace("\\N",np.nan).astype('float').max()


In [28]:
# rotten tomatoes data, adding some extra variables and setting missing values appropriately

rtAllCriticsNoReviews = HETREC_movies["rtAllCriticsNumReviews"]==0.0
HETREC_movies["rtAllCriticsFracRotten"] = HETREC_movies["rtAllCriticsNumRotten"]/HETREC_movies["rtAllCriticsNumReviews"]
HETREC_movies.loc[rtAllCriticsNoReviews,"rtAllCriticsRating"] = np.nan
HETREC_movies.loc[rtAllCriticsNoReviews,"rtAllCriticsScore"] = np.nan
HETREC_movies.loc[rtAllCriticsNoReviews,"rtAllCriticsFracRotten"] = np.nan

rtTopCriticsNoReviews = HETREC_movies["rtTopCriticsNumReviews"]==0.0
HETREC_movies["rtTopCriticsFracRotten"] = HETREC_movies["rtTopCriticsNumRotten"]/HETREC_movies["rtTopCriticsNumReviews"]
HETREC_movies.loc[rtTopCriticsNoReviews,"rtTopCriticsRating"] = np.nan
HETREC_movies.loc[rtTopCriticsNoReviews,"rtTopCriticsScore"] = np.nan
HETREC_movies.loc[rtTopCriticsNoReviews,"rtTopCriticsFracRotten"] = np.nan

rtAudienceNoRatings = HETREC_movies["rtAudienceNumRatings"]==0.0
# HETREC_movies["rtTopCriticsFracRotten"] = HETREC_movies["rtTopCriticsNumRotten"]/HETREC_movies["rtTopCriticsNumReviews"]
HETREC_movies.loc[rtAudienceNoRatings,"rtAudienceRating"] = np.nan
HETREC_movies.loc[rtAudienceNoRatings,"rtAudienceScore"] = np.nan
# HETREC_movies["rtTopCriticsFracRotten"][rtAudienceNoRatings] = np.nan

HETREC_movies.loc[ HETREC_movies["rtAllCriticsRating"]==0.0 , "rtAllCriticsRating"] = np.nan
HETREC_movies.loc[ HETREC_movies["rtTopCriticsRating"]==0.0 , "rtTopCriticsRating"] = np.nan

In [29]:
# add genre data to HETREC_movies
genres = pd.DataFrame(HETREC_movie_genres['genre'].unique())

for gen_ind in genres.index:
    gen_name = "genres__"+genres.loc[gen_ind,0]    
    genre_indices = HETREC_movie_genres[ HETREC_movie_genres["genre"]==genres.loc[gen_ind,0] ]["movieID"]
    HETREC_movies[gen_name] = np.nan
    
    HETREC_movies.loc[HETREC_movie_genres["movieID"].unique(),gen_name] = 0
    HETREC_movies.loc[genre_indices,gen_name] = 1
    

In [30]:
# OUTPUT: HETREC_movies, HETREC_movie_actors
# output per movie dataset
HETREC_movies.to_csv(Dataset_directory+"Processed"+os.sep+"HETREC_movies"+".csv")

# we're not using HETREC_movie_locations because it looks like junk/a nightmare to extract something useful from

# HETREC_tags/HETREC_user_taggedmovies/HETREC_user_taggedmovies_timestamps can wait until we try do something with tags

# this will be work to integrate - we'll have to check whether it's worth it (overlap with tmbd_cast?)
HETREC_movie_actors["order"] = HETREC_movie_actors["ranking"]-1
# ranking same as tmdb "order" but starts at 1 not 0
HETREC_movie_actors.drop("ranking",axis=1,inplace=True)
HETREC_movie_actors.to_csv(Dataset_directory+"Processed"+os.sep+"HETREC_cast"+".csv")



In [31]:
# del HETREC_movie_actors,HETREC_movie_directors,
# del HETREC_user_taggedmovies_timestamps,HETREC_user_taggedmovies,HETREC_user_ratedmovies_timestamps,
# del HETREC_user_ratedmovies,HETREC_tags,HETREC_movie_locations,HETREC_movie_tags,

In [32]:
####################################################################################################################
#------------------------------------------------------------------------------------------------------------------#
####################################################################################################################

In [33]:
## (5) ML LATEST DATASET: ML
## The Movie Lens latest 58k movies

dataset = "Movie_Lens_Latest"
# informative README.txt!

# tagId,tag (1128,  2)
# id -> id content
ML_genome_tags = pd.read_csv(Dataset_directory+ dataset +os.sep+"genome-tags.csv")

# movieId,imdbId,tmdbId (58,098,  3)
ML_links = pd.read_csv(Dataset_directory+ dataset +os.sep+"links.csv")

# movieId,title,genres (58,098,  3)
# genres -> | separated list
#    Romance,Fantasy,IMAX,Action,Sci-Fi,Western,Drama,Horror,Mystery,Adventure,War,Comedy,Musical,Documentary,Thriller,
#    Film-Noir,(no genres listed),Crime,Animation,Children'
ML_movies = pd.read_csv(Dataset_directory+ dataset +os.sep+"movies.csv")

# userId,movieId,tag,timestamp (1,108,997,  4)
# tags look user submitted - 74714 different tags - not consistent with genome_tags length/tagId range
ML_tags = pd.read_csv(Dataset_directory+ dataset +os.sep+"tags.csv")

# movieId,tagId,relevance (14,862,528,  3)
# each of 1128 tagIds x 13176 movieIds (dense matrix)
# relevance is machine learning output (inverse distribution)
ML_genome_scores = pd.read_csv(Dataset_directory+ dataset +os.sep+"genome-scores.csv")

# userId,movieId,rating,timestamp (27,753,444,  4)
ML_ratings = pd.read_csv(Dataset_directory+ dataset +os.sep+"ratings.csv")


In [34]:
ML_movie_ratings = ML_ratings[["movieId","rating"]].groupby("movieId").apply(lambda x: [x["rating"].count(), x["rating"].mean(), x["rating"].std()])
ML_movie_ratings = ML_movie_ratings.apply(pd.Series)
ML_movie_ratings.columns = ["ratings_n","ratings_mean","ratings_std"]
ML_movie_ratings['ratings_n'] = ML_movie_ratings['ratings_n'].astype('int')

In [35]:
ML_movies["genres_list"] = ML_movies["genres"].apply(lambda x: x.split("|") if pd.notnull(x) else np.nan)

flat_list = [item for sublist in ML_movies["genres_list"].dropna().values for item in sublist]
genres = pd.DataFrame(flat_list).drop_duplicates()#.sort_values(by="movieId")
drop_ind = genres[genres[0]=="(no genres listed)"].index[0]
genres.drop(drop_ind , inplace=True)
for gen_ind in genres.index: # ditch "no genre" index!
    gen_name = "genres__"+genres.loc[gen_ind,0] 
    ML_movies[gen_name] = ML_movies["genres_list"].apply(lambda x: genres.loc[gen_ind,0] in x if isinstance(x,list) or pd.notnull(x) else np.nan).astype('float')
    
# ML_genre_vars = search(ML_movies,"ML_genres__[A-Z]",case_sensitive=True)
# ML_genre_vars

In [36]:
# OUTPUT: ML_movies, ML_links


# deal with tags/relevance scores later

ML_movies.index = ML_movies["movieId"]
ML_links.index = ML_links["movieId"]
ML_movies[ML_movie_ratings.columns] = ML_movie_ratings
ML_movies.drop(["genres","genres_list"],axis=1,inplace=True)
ML_movies[ML_links.columns] = ML_links


ML_movies.to_csv(Dataset_directory+"Processed"+os.sep+"ML_movies.csv")

# ML_movie_ratings.to_csv(Dataset_directory+"Processed"+os.sep+"ML_movie_ratings.csv") 
# ML_links.index = ML_links["movieId"]
# ML_links.to_csv(Dataset_directory+"Processed"+os.sep+"ML_links.csv")
# could split up genres


In [37]:
# del ML_movie_ratings,ML_genome_tags,ML_tags,ML_genome_scores,ML_ratings,

In [38]:
####################################################################################################################
#------------------------------------------------------------------------------------------------------------------#
####################################################################################################################

In [39]:
## (6) the-numbers.com DATASET: NUM

# superficial, polite, respectful scrape of freely available financial data from the-numbers.com
# 5769 films
# title|link|production_budget|domestic_gross|worldwide_gross|release_year|release_month|release_day|movieId|worldwide_gross_divided_by_budget|international_gross|domestic_gross_fraction

In [40]:
# OUTPUT: NUM_movies
# del NUM_movies
NUM_movies = pd.read_csv(Dataset_directory+"Processed"+os.sep+"NUM_movies"+".csv", index_col = "scrape_count.1")
NUM_movies.index.name = "scrape_count"
# NUM_movies.drop( ["scrape_count","date","page_no"], axis=1, inplace=True )

NUM_movies[ ['production_budget', 'domestic_gross', 'worldwide_gross'] ].replace(0, np.nan, inplace=True)

NUM_movies["worldwide_gross_divided_by_budget"] = NUM_movies["worldwide_gross"] / NUM_movies["production_budget"]
NUM_movies["international_gross"] = (NUM_movies["worldwide_gross"] - NUM_movies["domestic_gross"])
NUM_movies["domestic_gross_fraction"] = (NUM_movies["domestic_gross"] / NUM_movies["worldwide_gross"])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  method=method)


In [41]:
# until we fix this

# suspicious_dates = (NUM_movies["release_day"]==31) & (NUM_movies["release_month"]==12)
#suspicious_dates.sum()

# NUM_movies.loc[suspicious_dates,["release_day","release_month","release_year"]] = [np.nan,np.nan,np.nan]

In [42]:
# NUM_movies.to_msgpack(Dataset_directory+"Processed"+os.sep+"NUM_movies"+".msgpack")

In [43]:
# NUM_movies[NUM_date == "31-12-2015"]
# yeah, there's a problem here - some blank stuff not recognised

# the-numbers.com has running time - why we not grab????
#
# https://www.the-numbers.com/movie/Not-Safe-For-Work-(2014)#tab=summary

In [44]:
# NUM_date = NUM_movies["release_day"].astype('str')+"-"+NUM_movies["release_month"].astype('str')+"-"+NUM_movies["release_year"].astype('str')
# NUM_date.value_counts()

# # lots of movies released 31-12 ???

# # NUM_movies["release_day"].isnull().sum()

In [45]:
# NUM_movies.shape, NUM_date.shape

In [46]:
####################################################################################################################
#------------------------------------------------------------------------------------------------------------------#
####################################################################################################################

In [47]:
## (7) themoviedatabase.com API DATASET: TMB

tmdb_dir = create_subdir(Dataset_directory, "themoviedb")


company = pd.read_msgpack(tmdb_dir + "company_cleaned.msgpack")
collections = pd.read_msgpack(tmdb_dir + "collections_cleaned.msgpack")
person_ids = pd.read_msgpack(tmdb_dir + "person_ids_cleaned.msgpack")
person_ids.index = person_ids.id

keywords_categories = pd.read_msgpack(tmdb_dir + "keywords_cleaned.msgpack")
keywords_categories.index = keywords_categories.id
genre_categories = pd.read_msgpack( tmdb_dir + "genres.msgpack")
genre_categories.index = genre_categories.id
production_country_categories = pd.read_msgpack( tmdb_dir + "production_countries.msgpack")
language_categories = pd.read_msgpack( tmdb_dir + "languages.msgpack") ## list of tmdb spoken languages


In [63]:
# 'adult|alternative_titles|backdrop_path|belongs_to_collection|budget|genres|homepage|tmbdId|imdbId|keywords|
# original_language|original_title|overview|popularity|poster_path|production_companies|production_countries|
# release_date|revenue|runtime|spoken_languages|status|tagline|title|video|vote_average|vote_count|
# facebook_id|instagram_id|twitter_id|num_genres|num_production_companies|num_production_countries|num_spoken_languages|
# num_alternative_titles|num_keywords|cast_size|crew_size|cast_credit_list|crew_credit_list'
movies = pd.read_msgpack(tmdb_dir + "movies_full_final.msgpack")
movies.rename(columns = {'id':'tmdbId'},inplace=True)
movies.rename(columns = {'imdb_id':'imdbId'},inplace=True)
movies.index.name = 'tmdbId'

movies["adult"] = movies["adult"].astype('float32')
movies["video"] = movies["video"].astype('float32')

# replace () with np.nan
movies.loc[ movies["alternative_titles"]==(), "alternative_titles" ] = np.nan
movies.loc[ movies["keywords"]==(), "keywords" ] = np.nan
movies.loc[ movies["production_companies"]==(), "production_companies" ] = np.nan
movies.loc[ movies["production_countries"]==(), "production_countries" ] = np.nan
movies.loc[ movies["spoken_languages"]==(), "spoken_languages" ] = np.nan




# 'cast_id|character|credit_id|gender|id|order|tmdb_id|max_order|fractional_order'
TMB_cast = pd.read_msgpack( tmdb_dir+"TMB_cast"+".msgpack")
# 'credit_id|department|gender|id|job|tmdb_id|order|max_order|fractional_order'
TMB_crew = pd.read_msgpack( tmdb_dir+"TMB_crew"+".msgpack")


# 'gender|id|name|profile_path|credit_list|credit_number|mean_order|fractional_mean_order|order_list|gender_guesser|
# importulence|profile_path_cast|gender_guess_integrated|cast_crew_overlap|adult|known_for_department|popularity|
# death_day|death_month|death_year|birth_day|birth_month|birth_year'
TMB_cast_individuals = pd.read_msgpack( tmdb_dir+"TMB_cast_individuals"+".msgpack")

# 'gender|id|name|profile_path|credit_list|credit_number|mean_order|fractional_mean_order|
# order_list|department_list|department_number|job_list|job_number|gender_guesser|importulence|
# gender_guess_integrated|cast_crew_overlap|adult|known_for_department|popularity|
# death_day|death_month|death_year|birth_day|birth_month|birth_year'
TMB_crew_individuals = pd.read_msgpack( tmdb_dir+"TMB_crew_individuals"+".msgpack")


# cast_crew_aggregate variables represent per-movie summary variables across ... the whole of the cast or crew
# index tmdb
# (Numerical Variable)_(cast|crew)_(lead|second|lead5|mean)

# credit_number
# gender_guess_integrated
# adult
# popularity
# cast_crew_overlap
# mean_order
# fractional_mean_order
# importulence
# birth_yrs_rel_1900
# death_yrs_rel_1900
# life_span
# job_number
# department_number

secondary_tmdb_dir = "e:\\Datasets\\themoviedb\\"
TMB_cast_crew_aggregates = pd.read_msgpack(secondary_tmdb_dir+"TMB_cast_crew_aggregates_latest"+".msgpack")
TMB_cast_crew_aggregates.index.name = "tmdbId"


# very chunky!
# release_dates = pd.read_msgpack(tmdb_dir + "release_dates.msgpack")
movies[TMB_cast_crew_aggregates.columns]= TMB_cast_crew_aggregates

In [64]:
# SER_genre_set = list(set(flatten(INT_df["SER_genres_list"].dropna() ) ) )
TMB_genre_set = list(genre_categories["name"].values)
# INT_df["SER_genres_list"] = INT_df["SER_genres_list"].apply(lambda x: eval(x) if pd.notnull(x) else np.nan)
# flat_list = [item for sublist in INT_df["TMB_genres"].dropna().values for item in sublist]
genres = pd.DataFrame(TMB_genre_set).drop_duplicates()#.sort_values(by="movieId")

for gen_ind in genres.index:
    gen_name = "genres__"+genres.loc[gen_ind,0]
    movies[gen_name] = movies["genres"].apply(lambda x: genre_categories.index[gen_ind] in x if isinstance(x,list) or pd.notnull(x) else np.nan)


movies.drop( ["genres"] , axis=1, inplace=True)    
TMB_genre_vars = search(movies,"genres__[A-Z]",case_sensitive=True)
movies[ TMB_genre_vars.index ] = movies[ TMB_genre_vars.index].replace(True,1).replace(False,0).astype('float32')    
# TMB_genre_vars




In [65]:
# movies.dtypes

In [66]:
# movies.shape

In [67]:
## not going to save this again as it's already huge/most of the size of the integrated dataset

In [68]:
# TMB_cast_crew_aggregates.columns

In [69]:
# movies[TMB_cast_crew_aggregates.columns].notnull().sum()

In [70]:
####################################################################################################################
#------------------------------------------------------------------------------------------------------------------#
####################################################################################################################

In [71]:
## (8) themoviedatabase.com Kaggle DATASET: TMDB_KAGGLE
dataset = "The_Movies_Dataset"

# userId,movieId,rating,timestamp (26024289, 4)
TMDB_KAGGLE_ratings = pd.read_csv(Dataset_directory+ dataset +os.sep+"ratings.csv")

# movieId, imdbId, tmdbId (45843, 3)
TMDB_KAGGLE_links_large = pd.read_csv(Dataset_directory+ dataset +os.sep+"links.csv")

# movieId, imdbId, tmdbId (9125, 3)
TMDB_KAGGLE_links_small = pd.read_csv(Dataset_directory+ dataset +os.sep+"links_small.csv")

# movieId, imdbId, tmdbId (45853, 3) + 10 (prob causing overlaps!)
TMDB_KAGGLE_links = TMDB_KAGGLE_links_large.append(TMDB_KAGGLE_links_small).drop_duplicates()

TMDB_KAGGLE_links.drop([5173,7527],inplace=True)
TMDB_KAGGLE_links["imdbId"] = TMDB_KAGGLE_links["imdbId"].astype('int')

# TMB_links_amalgamated.isnull().sum()

drop_indices = [13446, 16880, 5574, 9378, 29684, 20100, 14023, 16278, 848, 20065, 12131, 45641, 13676, 34144, 2594, 11217, 45197, 33499, 21093, 14089, 9215, 15871, 4390, 1485, 15165, 17350, 14077, 9631, 34061, 13289]
TMDB_KAGGLE_links.loc[drop_indices,"tmdbId"] = np.nan

In [72]:
# del TMDB_KAGGLE_ratings
# del TMB_cast_crew_aggregates,TMB_crew_individuals,TMB_cast_individuals,TMB_crew,TMB_cast,








In [73]:
gc.collect()

0

In [74]:
####################################################

In [75]:
## linking the whole dataset

# SER_links -> movieId,imdbId,tmdbId (49174,  3)
SER_links = SER_movies[["movieId","imdbId","tmdbId"]].copy()
# HETREC_links -> movieId,imdbId (10197,  2)
HETREC_links = HETREC_movies[["movieId","imdbId"]].copy() # could add rtID on if you like
# TMB_links -> tmdbId,imdbId (467917,  2)
TMB_links = movies[["tmdbId","imdbId"]].copy()

# ML_links -> movieId,imdbId,tmdbId (58098,  3)
ML_links = ML_links[["movieId","imdbId","tmdbId"]].copy()
# TMDB_KAGGLE_links -> movieId,imdbId,tmdbId (45853,  3)

# NUM -> pre-existing links to movieId
# PERS -> movieId
# LEARN -> movieId

In [76]:
all_links = TMDB_KAGGLE_links.copy()
all_links = all_links.append(ML_links,ignore_index=True,sort=False)
all_links = all_links.append(TMB_links,ignore_index=True,sort=False)
all_links = all_links.append(SER_links,ignore_index=True,sort=False)
all_links = all_links.append(HETREC_links,ignore_index=True,sort=False)
all_links = all_links.drop_duplicates()
# movieId now unique (when you drop tmdbId nulls)

In [77]:
movieId_by_tmdbId = all_links[["movieId","tmdbId"]].dropna().drop_duplicates()
movieId_by_tmdbId.index = movieId_by_tmdbId["movieId"]
null_tmbdIds = all_links["tmdbId"].isnull()
all_links.loc[null_tmbdIds, "tmdbId"] = all_links[ null_tmbdIds ].apply(lambda x: movieId_by_tmdbId.loc[x["movieId"],"tmdbId"] if x["movieId"] in movieId_by_tmdbId.index else np.nan, axis=1)

In [78]:
all_links.isnull().sum()

movieId    467917
imdbId     132736
tmdbId        977
dtype: int64

In [79]:
all_links.dtypes

movieId    float64
imdbId     float64
tmdbId     float64
dtype: object

In [80]:
# movies.drop("movieId",axis=1, inplace=True)
movies["tmdbId"].isnull().sum()

0

In [81]:
#

In [97]:
# TMDB_KAGGLE_links["movieId"].loc[movies["tmdbId"][0:100]]
temp = TMDB_KAGGLE_links[["movieId","tmdbId"]].dropna().drop_duplicates(subset = ["tmdbId"])
temp.index = temp["tmdbId"]

movies["movieId"] = movies["tmdbId"].apply(lambda x:temp["movieId"].loc[x] if x in temp.index else np.nan )


In [98]:
# get a set of link unique in tmdbId
tmdbId_links = all_links[all_links["tmdbId"].notnull()].copy()
tmdbId_links.index = range(0,tmdbId_links.shape[0])
tmdbId_links.sort_values(by="movieId", inplace=True)
tmdbId_links.drop_duplicates(subset = ["tmdbId"],inplace=True)
tmdbId_links.index = tmdbId_links["tmdbId"]

movies.loc[movies["movieId"].isnull(),"movieId"] = movies.loc[movies["movieId"].isnull(),"tmdbId"].apply(lambda x:tmdbId_links["movieId"].loc[x] if x in tmdbId_links.index else np.nan )

In [99]:
# del tmdbId_links

In [100]:
movies["movieId"].isnull().sum()

410590

In [107]:
# MOVIEID_TMDBID MATCH ERROR
# 143750.0	2223.0 should be 35832-2223 (The Farmers Wife)
movies.loc[ 143750 , "movieId" ] = np.nan
movies.loc[ 35832  , "movieId" ] = 2223

# 125409.0	98643.0	Kiss and Tell	should be 256300-98643
movies.loc[ 125409 , "movieId" ] = np.nan
movies.loc[ 256300 , "movieId" ] = 98643

# 141210.0	1115.0	The Sleepover -> should be 277102-1115
movies.loc[ 141210 , "movieId" ] = np.nan
movies.loc[ 277102 , "movieId" ] = 1115         
           

In [108]:
INT_df = movies.copy()
INT_df.columns = ["TMB_"+x for x in INT_df.columns]
INT_df["movieId"] = INT_df['TMB_movieId']
INT_df["TMB_dataset"] = True

temp = HETREC_movies.copy()
temp.columns = ["HETREC_"+x for x in temp.columns]
temp.index.name = "movieId"
temp["HETREC_dataset"] = True
INT_df = INT_df.merge(temp, how ='outer', on ='movieId')

temp = SER_movies.copy()
temp.columns = ["SER_"+x for x in temp.columns]
temp.index.name = "movieId"
temp["SER_dataset"] = True
INT_df = INT_df.merge(temp, how ='outer', on ='movieId')

temp = PERS_ratings_and_big5.copy()
temp.columns = ["PERS_"+x for x in temp.columns]
temp.index.name = "movieId"
temp["PERS_dataset"] = True
INT_df = INT_df.merge(temp, how ='outer', on ='movieId')

temp = ML_movies.copy()
temp.columns = ["ML_"+x for x in temp.columns]
temp.index.name = "movieId"
temp["ML_dataset"] = True
INT_df = INT_df.merge(temp, how ='outer', on ='movieId')

temp = NUM_movies[NUM_movies["movieId"].notnull()].copy()
temp.index = temp["movieId"]
temp.columns = ["NUM_"+x for x in temp.columns]
temp["NUM_dataset"] = True
INT_df = INT_df.merge(temp, how ='outer', on ='movieId')

# set all _dataset source booleans to False where they are nan
INT_df[ search(INT_df,"_dataset").index ] = INT_df[ search(INT_df,"_dataset").index ].fillna(False)

# INT_df.to_csv(Dataset_directory+"Processed"+os.sep+"INT_df"+".csv")

In [109]:
tmdbId_vars = search(INT_df,"tmdbId").index
INT_df["tmdbId"]  = INT_df[ tmdbId_vars ].fillna(method="ffill",axis=1)[tmdbId_vars[-1]]
# movieId_vars = search(INT_df,"movieId").index
# INT_df["movieId"] = INT_df[ movieId_vars ].fillna(method="ffill",axis=1)[movieId_vars[-1]]
INT_df["uId"] =  INT_df["tmdbId"].apply(lambda x: str(int(x)) if pd.notnull(x) else "NA")+"-"\
                +INT_df["movieId"].apply(lambda x: str(int(x)) if pd.notnull(x) else "NA")

# amalgamated tmbdId and movieIds

In [110]:
# actually unique!
INT_df["uId"].value_counts()

555528-NA        1
575632-NA        1
188658-152707    1
561714-NA        1
87520-NA         1
354568-NA        1
557955-NA        1
53299-NA         1
22543-8828       1
513130-NA        1
447803-NA        1
73584-NA         1
578403-NA        1
426920-NA        1
306225-NA        1
270464-NA        1
608402-NA        1
128607-NA        1
170833-NA        1
75233-90947      1
358765-NA        1
211677-NA        1
604796-NA        1
413163-NA        1
32845-121459     1
503948-NA        1
544806-NA        1
96846-118113     1
294027-NA        1
432563-NA        1
                ..
307725-NA        1
400190-164731    1
76688-NA         1
223220-NA        1
575390-NA        1
510047-NA        1
373722-NA        1
323470-NA        1
509829-NA        1
320141-NA        1
84198-2679       1
60940-NA         1
496026-NA        1
587965-NA        1
524651-NA        1
350997-NA        1
565542-NA        1
353101-NA        1
590966-NA        1
414763-NA        1
442694-NA        1
431027-NA   

In [76]:
# pd.DataFrame.from_dict( json.loads(res) )
# ~7 hrs -> 86,000 rows out of 450,000
# 36.4 -> another 30hrs
# 1/5 -> 300MB -> 1.5Gb (maybe save to other hd!)
# ram not an issue - 200MB for tmbd_df


def pretty_print(df):
    return display( HTML( df.to_html().replace("\\n","<br>") ) )

def sizeof_fmt(num, suffix='B'):
    ''' By Fred Cirera, after https://stackoverflow.com/a/1094933/1870254'''
    for unit in ['','Ki','Mi','Gi','Ti','Pi','Ei','Zi']:
        if abs(num) < 1024.0:
            return "%3.1f%s%s" % (num, unit, suffix)
        num /= 1024.0
    return "%.1f%s%s" % (num, 'Yi', suffix)


def memory_use(locs = locals().items()):
    gc.collect()
    # locals().items()
    for name, size in sorted(((name, sys.getsizeof(value)) for name,value in locs),
                             key= lambda x: -x[1])[:10]:
        print("{:>30}: {:>8}".format(name,sizeof_fmt(size)))
    

In [77]:
# import sys
# memory_use(locs = locals().items())

In [78]:
# INT_df.columns[INT_df.dtypes=='object']

In [111]:
drop_ind = INT_df[INT_df["uId"]=="527218-NA"].index
INT_df.drop(drop_ind,inplace=True)

In [112]:
secondary_tmdb_dir = "E:\\Datasets\\themoviedb"
INT_df.to_msgpack(secondary_tmdb_dir+os.sep+"INT_df.msgpack")

In [113]:
# secondary_tmdb_dir = "E:\\Datasets\\themoviedb"
# INT_df = pd.read_msgpack(secondary_tmdb_dir+os.sep+"INT_df.msgpack")

In [115]:
# mem_usage(INT_df)

In [82]:
INT_df.isnull().sum().sort_values()

uId                                           0
NUM_dataset                                   0
ML_dataset                                    0
PERS_dataset                                  0
SER_dataset                                   0
HETREC_dataset                                0
TMB_dataset                                   0
tmdbId                                     1428
TMB_genres__Action                         2079
TMB_genres__Fantasy                        2079
TMB_genres__Family                         2079
TMB_genres__Adventure                      2079
TMB_crew_size                              2079
TMB_crew_credit_list                       2079
TMB_cast_credit_list                       2079
TMB_genres__Thriller                       2079
TMB_cast_size                              2079
TMB_genres__Science Fiction                2079
TMB_genres__History                        2079
TMB_genres__Horror                         2079
TMB_genres__Drama                       

In [77]:
# #  (zlib or blosc)

# INT_df.to_msgpack(secondary_tmdb_dir+os.sep+"INT_df.msgpack.zlib",compress = 'zlib')
# # INT_df.to_msgpack(secondary_tmdb_dir+os.sep+"INT_df.msgpack.blosc",compress = 'blosc')

In [9]:
# import pyarrow

# INT_df.to_parquet(secondary_tmdb_dir+os.sep+"INT_df.parquet", engine='pyarrow')

In [None]:
# INT_df["TMB_tmdbId"].value_counts()
# TMB_video
# HETREC_country -> category?
# TMB_genres

In [None]:
# !conda install pyarrow -c conda-forge

In [96]:
## losslessly compress dataframe

# include option not to use null datatypes (older panda setups won't be able to read them!)

        
def mem_usage(pandas_obj):
    if isinstance(pandas_obj,pd.DataFrame):
        usage_b = pandas_obj.memory_usage(deep=True).sum()
    else: # we assume if not a df it's a series
        usage_b = pandas_obj.memory_usage(deep=True)
    usage_mb = usage_b / 1024 ** 2 # convert bytes to megabytes
    return "{:03.2f} MB".format(usage_mb)    

def compress_df(df):
    for col in df:
        mem = mem_usage(df[col])
        dt = df[col].dtype
        has_nans = df[col].isnull().sum()>0

        if dt=="object":
            descr = df[col].describe()
            # case where boolean with nulls gets turn into an object
            if (descr["unique"]==2) and (True in df[col].unique()) and (False in df[col].unique()):
                if has_nans:
                    df[col] = df[col].astype('float32')
                else:
                    df[col] = df[col].astype('uint8')
            elif descr["unique"]<(descr["count"]/2):
                df[col] = df[col].astype('category')
        elif dt in ["int16","int32","int64"]:
            df[col] = df[col].apply(pd.to_numeric,downcast='signed')
        elif dt in ["uint16","uint32","uint64"]:
            df[col] = df[col].apply(pd.to_numeric,downcast='unsigned')
        elif dt == "float64":
            df[col] = df[col].apply(pd.to_numeric,downcast='float')

        dt_new = df[col].dtype
        if dt_new!=dt:
            mem_new = mem_usage(df[col])
            print(col,dt, mem, dt_new, mem_new )
            
    return df



In [169]:
for col in INT_df.columns[ INT_df.dtypes=="float64" ]:
    if (INT_df[col].dropna() != INT_df[col].astype('float32').dropna()).sum()>0 :
        print(col,"not the same")
    else:
        INT_df[col] = INT_df[col].astype('float32')
        print(col,"the same, column reduced")
    

TMB_budget not the same
TMB_popularity not the same
TMB_revenue not the same
TMB_vote_average not the same
TMB_gender_guess_integrated_crew_mean not the same
TMB_gender_guess_integrated_cast_mean not the same
TMB_gender_guess_integrated_cast_lead not the same
TMB_gender_guess_integrated_crew_lead5 not the same
TMB_gender_guess_integrated_cast_lead5 not the same
TMB_credit_number_crew_mean not the same
TMB_credit_number_cast_mean not the same
TMB_credit_number_cast_lead not the same
TMB_credit_number_crew_lead5 not the same
TMB_credit_number_cast_lead5 not the same
TMB_job_number_crew_mean not the same
TMB_job_number_crew_lead5 not the same
TMB_department_number_crew_mean not the same
TMB_department_number_crew_lead5 not the same
TMB_mean_order_crew_mean not the same
TMB_mean_order_cast_mean not the same
TMB_mean_order_crew_lead not the same
TMB_mean_order_cast_lead not the same
TMB_mean_order_crew_lead5 not the same
TMB_mean_order_cast_lead5 not the same
TMB_fractional_mean_order_crew_

In [166]:
INT_df[ INT_df.columns[ INT_df.dtypes=="object" ] ]

Unnamed: 0,TMB_alternative_titles,TMB_backdrop_path,TMB_homepage,TMB_keywords,TMB_original_title,TMB_overview,TMB_poster_path,TMB_tagline,TMB_title,TMB_facebook_id,...,HETREC_imdbPictureURL,HETREC_rtID,HETREC_rtPictureURL,SER_title,SER_releaseDate,SER_starring_list,ML_title,NUM_title,NUM_link,uId
0,"((US, E T The Extra Terrestrial, ), (US, ET Th...",/tNpJuz8NEG0DsGG8SN0dL2kbCzs.jpg,http://www.et20.com/,"(455, 697, 1007, 1432, 1603, 1604, 1826, 2430,...",E.T. the Extra-Terrestrial,After a gentle alien becomes stranded on Earth...,/8htLKK03TJjKZOXJgihZCu8v0P.jpg,He is afraid. He is alone. He is three million...,E.T. the Extra-Terrestrial,,...,http://ia.media-imdb.com/images/M/MV5BMTc1NTQ0...,et_the_extraterrestrial,http://content8.flixster.com/movie/16/90/48/16...,E.T. the Extra-Terrestrial (1982),1982-06-11,"(Henry Thomas, Dee Wallace-Stone, Robert Mac...",E.T. the Extra-Terrestrial (1982),ET: The Extra-Terrestrial,/movie/ET-The-Extra-Terrestrial#tab=summary,601-1097
1,"((US, ID4, promotional abbreviation), (US, Ind...",/4E2xKGrU2qcqUE2S3Nl27hwZdqy.jpg,,"(1612, 1627, 1825, 4097, 4278, 6091, 9738, 973...",Independence Day,"On July 2, a giant alien mothership enters orb...",/bqLlWZJdhrS0knfEJRkquW7L8z2.jpg,Earth. Take a good look. It might be your last.,Independence Day,,...,http://ia.media-imdb.com/images/M/MV5BMTYyNjA2...,1071806-independence_day,http://content9.flixster.com/movie/10/94/47/10...,Independence Day (a.k.a. ID4) (1996),1996-07-03,"(Will Smith, Bill Pullman, Jeff Goldblum, M...",Independence Day (a.k.a. ID4) (1996),Independence Day,/movie/Independence-Day#tab=summary,602-780
2,"((EG, المتمرد, ), (US, The Matrix 1, faux titl...",/icmmSD4vTTDKOq2vvdulafOGw93.jpg,http://www.warnerbros.com/matrix,"(83, 310, 312, 490, 530, 779, 1430, 1721, 3074...",The Matrix,"Set in the 22nd century, The Matrix tells the ...",/hEpWvX6Bp79eLxY1kX5ZZJcme5U.jpg,Welcome to the Real World.,The Matrix,TheMatrixMovie,...,http://ia.media-imdb.com/images/M/MV5BMjEzNjg1...,matrix,http://content7.flixster.com/movie/16/90/52/16...,"Matrix, The (1999)",1999-03-31,"(Laurence Fishburne, Keanu Reeves, Hugo Weav...","Matrix, The (1999)",The Matrix,/movie/Matrix-The#tab=summary,603-2571
3,"((US, The Matrix 2, working title), (US, The M...",/Fp3piEuHXxKnPBO5R0Wj4wjZHg.jpg,,"(83, 310, 312, 779, 780, 1001, 1566, 1701, 172...",The Matrix Reloaded,Six months after the events depicted in The Ma...,/ezIurBz2fdUc68d98Fp9dRf5ihv.jpg,Free your mind.,The Matrix Reloaded,,...,http://ia.media-imdb.com/images/M/MV5BMjA0NDM5...,matrix_reloaded,http://content8.flixster.com/movie/59/86/57/59...,"Matrix Reloaded, The (2003)",2003-05-15,"(Keanu Reeves, Laurence Fishburne, Carrie-An...","Matrix Reloaded, The (2003)",The Matrix Reloaded,/movie/Matrix-Reloaded-The#tab=summary,604-6365
4,"((US, The Matrix 3, working title), (US, The M...",/pdVHUsb2eEz9ALNTr6wfRJe5xVa.jpg,,"(83, 310, 312, 334, 490, 663, 779, 780, 1001, ...",The Matrix Revolutions,The human city of Zion defends itself against ...,/2aJvwc4zXqtVUDbEu62e14J0mhe.jpg,Everything that has a beginning has an end.,The Matrix Revolutions,,...,http://ia.media-imdb.com/images/M/MV5BNzg2NTA1...,matrix_revolutions,http://content9.flixster.com/movie/35/84/17/35...,"Matrix Revolutions, The (2003)",2003-11-05,"(Keanu Reeves, Laurence Fishburne, Carrie-An...","Matrix Revolutions, The (2003)",The Matrix Revolutions,/movie/Matrix-Revolutions-The#tab=summary,605-6934
5,"((HK, 非洲之旅, ), (JP, 愛と哀しみの果て, ), (NO, Mitt Afr...",/uMPrCi5DA0kuIeZGjoE7FPGqETW.jpg,,"(409, 1326, 2043, 3513, 4414, 4932, 5944, 9840...",Out of Africa,Out of Africa tells the story of the life of D...,/gYNfg38sM4aSpxfC8gPkwg5UZHN.jpg,Based on a true story.,Out of Africa,,...,http://ia.media-imdb.com/images/M/MV5BMjA4Mzcy...,out_of_africa,http://content9.flixster.com/movie/10/85/35/10...,Out of Africa (1985),1985-12-18,"(Klaus Maria Brandauer, Michael Kitchen, Rob...",Out of Africa (1985),Out of Africa,/movie/Out-of-Africa#tab=summary,606-1959
6,"((IL, Gvarim B'Shahor, Romanized title), (US, ...",/agCihVGrXk3hmmwMIgsn2ao1eEa.jpg,http://www.sonypictures.com/homevideo/meninblack,"(1308, 1568, 1826, 2173, 2428, 2547, 3240, 324...",Men in Black,After a police chase with an otherworldly bein...,/f24UVKq3UiQWLqGWdqjwkzgB8j8.jpg,Protecting the Earth from the scum of the univ...,Men in Black,,...,http://ia.media-imdb.com/images/M/MV5BMTYzNzg5...,men_in_black,http://content7.flixster.com/movie/83/03/83032...,Men in Black (a.k.a. MIB) (1997),1997-07-04,"(Tommy Lee Jones, Will Smith, Linda Fiorenti...",Men in Black (a.k.a. MIB) (1997),Men in Black,/movie/Men-in-Black#tab=summary,607-1580
7,"((US, MIB II, promotional abbreviation), (US, ...",/cITp4EM8HEKrlhXDEeGNy2IB8D9.jpg,http://www.sonypictures.com/homevideo/meninbla...,"(83, 1308, 1568, 1826, 2173, 2428, 2547, 3243,...",Men in Black II,"Kay and Jay reunite to provide our best, last ...",/qWjRfBwr4VculczswwojXgoU0mq.jpg,Same Planet. New Scum.,Men in Black II,,...,http://ia.media-imdb.com/images/M/MV5BMTQwMjA5...,men_in_black_ii,http://content6.flixster.com/movie/55/06/05/55...,Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,2002-07-03,"(Tommy Lee Jones, Will Smith, Rip Torn, Lar...",Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (...,Men in Black 2,/movie/Men-in-Black-2#tab=summary,608-5459
8,"((SV, Juegos Diabolicos, ), (TW, 鬼哭神號, ), (CZ,...",/i998vFSbMKlLL5EfMVY1kvDDUuO.jpg,,"(970, 2723, 3093, 3350, 3358, 3737, 10235, 108...",Poltergeist,"Steve Freeling lives with his wife, Diane, and...",/hKs6umpuLSgZhOiZI1pxpO0iVTQ.jpg,They're here.,Poltergeist,,...,http://ia.media-imdb.com/images/M/MV5BMTU4NzMy...,1016513-poltergeist,http://content7.flixster.com/movie/25/42/25429...,Poltergeist (1982),1982-06-04,"(JoBeth Williams, Craig T. Nelson, Dominique...",Poltergeist (1982),Poltergeist,/movie/Poltergeist#tab=summary,609-1994
9,"((EE, München, ), (HR, Munchen, ), (IL, Minche...",/4P5yVNTA3RWXLgXWPAxCWRTJ4le.jpg,,"(90, 441, 536, 922, 1228, 1562, 2051, 2070, 24...",Munich,"During the 1972 Olympic Games in Munich, eleve...",/7vVNo6KNonhUYYcT40Zb6QnxmY5.jpg,The world was watching in 1972 as 11 Israeli a...,Munich,,...,http://ia.media-imdb.com/images/M/MV5BMTcwODgx...,munich,http://content6.flixster.com/movie/11/12/73/11...,Munich (2005),2005-12-23,"(Eric Bana, Daniel Craig, Ciarán Hinds, Mat...",Munich (2005),Munich,/movie/Munich#tab=summary,612-41997


In [170]:
INT_df.dtypes.value_counts()

float32           131
float64            94
object             26
bool                6
category            1
category            1
datetime64[ns]      1
category            1
category            1
category            1
category            1
category            1
category            1
category            1
dtype: int64

In [173]:
search(INT_df,"release")

TMB_release_date     430767
SER_releaseDate       46049
NUM_release_year       5164
NUM_release_month      5164
NUM_release_day        5164
dtype: int64

In [174]:
INT_df.shape

(469996, 267)

In [176]:
INT_df["TMB_dataset"].value_counts()

True     467917
False      2079
Name: TMB_dataset, dtype: int64

In [178]:
INT_df[ (INT_df["TMB_dataset"]==True) & ( INT_df["TMB_release_date"].isnull() ) ]

Unnamed: 0,TMB_adult,TMB_alternative_titles,TMB_backdrop_path,TMB_belongs_to_collection,TMB_budget,TMB_homepage,TMB_tmdbId,TMB_imdbId,TMB_keywords,TMB_original_language,...,NUM_release_month,NUM_release_day,NUM_movieId,NUM_worldwide_gross_divided_by_budget,NUM_international_gross,NUM_domestic_gross_fraction,NUM_dataset,tmdbId,uId,TMB_dataset
143,0.0,,,,,,1489.0,990406.0,"(41645,)",en,...,,,,,,,False,1489.0,1489-NA,True
855,1.0,,,,,,5740.0,493560.0,"(190370,)",en,...,,,,,,,False,5740.0,5740-NA,True
1203,0.0,,,,,,8089.0,1189315.0,"(378, 2109, 3398, 5882, 5933)",en,...,,,,,,,False,8089.0,8089-NA,True
1211,0.0,,,,,,8317.0,4032552.0,"(242,)",en,...,,,,,,,False,8317.0,8317-NA,True
1272,0.0,,,,,,8947.0,473472.0,"(279, 1157, 4240)",en,...,,,,,,,False,8947.0,8947-NA,True
1279,0.0,,,,,,9083.0,760314.0,,en,...,,,,,,,False,9083.0,9083-NA,True
1624,0.0,"((US, Giallo, ), (US, North Woods, working tit...",,,,,12675.0,1904981.0,,en,...,,,,,,,False,12675.0,12675-NA,True
1764,0.0,,,,,,13494.0,800175.0,,en,...,,,,,,,False,13494.0,13494-NA,True
2403,0.0,,/fVeccIVjF7hQqt81xZRREz1SjMf.jpg,,,,17384.0,,,en,...,,,,,,,False,17384.0,17384-NA,True
4130,1.0,,,,,,26228.0,,,en,...,,,,,,,False,26228.0,26228-NA,True


In [146]:
# INT_df2 = INT_df.copy()
# INT_df2 = compress_df(INT_df2)

SER_releaseDate object 19.47 MB category 6.14 MB


In [100]:
# INT_df[["TMB_production_companies","TMB_production_countries","TMB_spoken_languages"]] = INT_df[["TMB_production_companies","TMB_production_countries","TMB_spoken_languages"]].astype('category')

In [153]:
descr = INT_df["SER_releaseDate"].describe()
descr

count          46049
unique         16240
top       1994-01-01
freq             156
Name: SER_releaseDate, dtype: object

In [120]:
# search(INT_df,"date")

TMB_release_date    430767
SER_releaseDate      46049
dtype: int64

In [121]:
# INT_df["TMB_release_date"].value_counts()

2006-01-01    1890
2010-01-01    1857
2005-01-01    1833
2008-01-01    1819
2009-01-01    1806
2007-01-01    1783
2011-01-01    1745
2012-01-01    1686
2013-01-01    1643
2014-01-01    1639
2004-01-01    1605
2003-01-01    1505
2002-01-01    1308
2015-01-01    1156
2001-01-01    1141
1995-01-01    1080
1998-01-01    1079
1997-01-01    1075
1988-01-01    1062
2000-01-01    1057
1989-01-01    1054
1999-01-01    1051
1996-01-01    1033
1991-01-01    1028
1987-01-01    1016
1992-01-01    1001
1994-01-01     997
1993-01-01     981
2017-01-01     978
1990-01-01     965
              ... 
1891-01-02       1
1954-01-26       1
1950-12-18       1
1941-09-08       1
1901-03-20       1
1941-05-19       1
1921-10-18       1
1947-06-22       1
1902-05-24       1
1961-02-17       1
1933-06-04       1
1920-08-29       1
1924-08-06       1
1915-06-19       1
1916-12-06       1
1910-12-09       1
1952-11-22       1
1943-10-05       1
1912-04-14       1
1957-09-28       1
1933-02-21       1
1994-06-27  

In [147]:
# INT_df["SER_movieId"]

In [148]:
# # until we fix this

# suspicious_dates = (INT_df["NUM_release_day"]==31) & (INT_df["NUM_release_month"]==12)
# #suspicious_dates.sum()

# INT_df.loc[suspicious_dates,["NUM_release_day","NUM_release_month","NUM_release_year"]] = [np.nan,np.nan,np.nan]

In [117]:
# INT_df.loc[INT_df["SER_movieId"]==8688,"SER_releaseDate"]   = "1968-12-13" # Shalako 13 December 1968
# INT_df.loc[INT_df["SER_movieId"]==33471,"SER_releaseDate"]  = "2002-04-05" # https://www.imdb.com/title/tt0292553/?ref_=fn_al_tt_1
# INT_df.loc[INT_df["SER_movieId"]==46121,"SER_releaseDate"]  = "2001-06-01" # https://www.imdb.com/title/tt0135024/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==47991,"SER_releaseDate"]  = "2005-06-09" # https://www.imdb.com/title/tt0383304/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==42941,"SER_releaseDate"]  = "1974-08-08" # https://www.imdb.com/title/tt0071840/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==59300,"SER_releaseDate"]  = "1986-09-12" # https://www.imdb.com/title/tt0090678/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==55631,"SER_releaseDate"]  = "2006-03-17" # https://www.imdb.com/title/tt0443446/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==60034,"SER_releaseDate"]  = "2007-01-22" # https://www.imdb.com/title/tt0893331/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==97703,"SER_releaseDate"]  = "2012-05-07" # https://www.imdb.com/title/tt2209386/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==105159,"SER_releaseDate"] = "2012-07-13" # https://www.imdb.com/title/tt1859446/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==105453,"SER_releaseDate"] = "2013-09-19" # https://www.imdb.com/title/tt2717558/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==105855,"SER_releaseDate"] = "2013-10-11" # https://www.imdb.com/title/tt2286990/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==109455,"SER_releaseDate"] = "2014-02-21" # https://www.imdb.com/title/tt2980794/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==57486,"SER_releaseDate"]  = "1970-10-23" # https://www.imdb.com/title/tt0066578/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==53502,"SER_releaseDate"]  = "1948-04-27" # https://www.imdb.com/title/tt0040098/releaseinfo?ref_=tt_dt_dt
# INT_df.loc[INT_df["SER_movieId"]==60049,"SER_releaseDate"]  = "2007-05-30" # https://www.imdb.com/title/tt0832937/releaseinfo?ref_=tt_dt_dt

# INT_df["SER_releaseDate"] = INT_df["SER_releaseDate"].replace("0000-00-00",np.nan)
# ## another problem - whole load of SER_releaseDates *just happen* to be the UNIX start date -> statistically improbable
# # (yes, I checked whether it happened to be the data of a film festival - it was not!)
# # also, more on one single date by an order of magnitude
# INT_df["SER_releaseDate"] = INT_df["SER_releaseDate"].replace("1969-12-31",np.nan)

In [113]:
# mem_usage(INT_df["SER_releaseDate"]),mem_usage( pd.to_datetime( INT_df["SER_releaseDate"] ).apply(lambda x: x.date()) )


In [109]:
# pd.to_datetime( INT_df["SER_releaseDate"] ).apply(lambda x: x.date())

In [89]:
# # I wonder if MovieLens would like to pay me for fixing their data?
# # INT_df.loc[INT_df["movieId"]==8688,"SER_releaseDate"] = "1968-12-13" # Shalako 13 December 1968
# # INT_df.loc[INT_df["movieId"]==33471,"SER_releaseDate"] = "2002-04-05" # https://www.imdb.com/title/tt0292553/?ref_=fn_al_tt_1
# # INT_df.loc[INT_df["movieId"]==46121,"SER_releaseDate"] = "2001-06-01" # https://www.imdb.com/title/tt0135024/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==47991,"SER_releaseDate"] = "2005-06-09" # https://www.imdb.com/title/tt0383304/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==42941,"SER_releaseDate"] = "1974-08-08" # https://www.imdb.com/title/tt0071840/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==59300,"SER_releaseDate"] = "1986-09-12" # https://www.imdb.com/title/tt0090678/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==55631,"SER_releaseDate"] = "2006-03-17" # https://www.imdb.com/title/tt0443446/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==60034,"SER_releaseDate"] = "2007-01-22" # https://www.imdb.com/title/tt0893331/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==97703,"SER_releaseDate"] = "2012-05-07" # https://www.imdb.com/title/tt2209386/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==105159,"SER_releaseDate"] = "2012-07-13" # https://www.imdb.com/title/tt1859446/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==105453,"SER_releaseDate"] = "2013-09-19" # https://www.imdb.com/title/tt2717558/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==105855,"SER_releaseDate"] = "2013-10-11" # https://www.imdb.com/title/tt2286990/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==109455,"SER_releaseDate"] = "2014-02-21" # https://www.imdb.com/title/tt2980794/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==57486,"SER_releaseDate"] = "1970-10-23" # https://www.imdb.com/title/tt0066578/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==53502,"SER_releaseDate"] = "1948-04-27" # https://www.imdb.com/title/tt0040098/releaseinfo?ref_=tt_dt_dt
# # INT_df.loc[INT_df["movieId"]==60049,"SER_releaseDate"] = "2007-05-30" # https://www.imdb.com/title/tt0832937/releaseinfo?ref_=tt_dt_dt

# INT_df["SER_releaseDate"] = pd.to_datetime(INT_df["SER_releaseDate"].replace("0000-00-00",np.nan)).apply(lambda x: x.date())
# ## another problem - whole load of SER_releaseDates *just happen* to be the UNIX start date -> statistically improbable
# # (yes, I checked whether it happened to be the data of a film festival - it was not!)
# # also, more on one single date by an order of magnitude
# matching_UNIX_start_date = INT_df["SER_releaseDate"]==pd.to_datetime("1969-12-31").date()
# INT_df.loc[matching_UNIX_start_date,"SER_releaseDate"] = np.datetime64('NaT')

HETREC_directorID      10155
HETREC_directorName    10155
SER_director_list      47712
SER_num_directors      47712
dtype: int64

In [90]:
INT_df["SER_num_directors"].value_counts()

1.0     45814
2.0      1600
3.0       167
4.0        45
5.0        23
7.0        19
6.0        16
8.0         8
11.0        4
10.0        3
9.0         3
13.0        2
26.0        2
27.0        1
36.0        1
25.0        1
12.0        1
30.0        1
15.0        1
Name: SER_num_directors, dtype: int64

In [35]:
ad = pd.Series( pd.array( INT_df["TMB_adult"].astype('float32') , dtype="UInt8") )

In [36]:
mem_usage( INT_df["TMB_adult"]), mem_usage( ad), 

('18.02 MB', '0.90 MB')

In [55]:
ad = INT_df["TMB_adult"].copy()
ad.drop(2,inplace=True)
ad = pd.Series( pd.array( ad.astype('float32') , dtype="UInt8"), ad.index )

In [None]:
df = person_ids
for col in df.columns:
    print(col, df[col].dtype, mem_usage(df[col]) )

In [63]:
ad.replace(np.nan,0.0).astype('uint8').value_counts()#.apply(pd.to_numeric,downcast='unsigned')

0    445758
1     24237
Name: TMB_adult, dtype: int64

In [44]:
INT_df["TMB_adult"].shape

(469996,)

In [58]:
INT_df["TMB_adult"].apply(pd.to_numeric,downcast='float')

0         0.0
1         0.0
2         0.0
3         0.0
4         0.0
5         0.0
6         0.0
7         0.0
8         0.0
9         0.0
10        0.0
11        0.0
12        0.0
13        0.0
14        0.0
15        0.0
16        0.0
17        0.0
18        0.0
19        0.0
20        0.0
21        0.0
22        0.0
23        0.0
24        0.0
25        0.0
26        0.0
27        0.0
28        0.0
29        0.0
         ... 
469966    NaN
469967    NaN
469968    NaN
469969    NaN
469970    NaN
469971    NaN
469972    NaN
469973    NaN
469974    NaN
469975    NaN
469976    NaN
469977    NaN
469978    NaN
469979    NaN
469980    NaN
469981    NaN
469982    NaN
469983    NaN
469984    NaN
469985    NaN
469986    NaN
469987    NaN
469988    NaN
469989    NaN
469990    NaN
469991    NaN
469992    NaN
469993    NaN
469994    NaN
469995    NaN
Name: TMB_adult, Length: 469996, dtype: float32

In [5]:
tmdb_dir = create_subdir(Dataset_directory, "themoviedb")
TMB_cast_individuals = pd.read_msgpack( tmdb_dir+"TMB_cast_individuals"+".msgpack")

In [10]:
TMB_crew_individuals = pd.read_msgpack( tmdb_dir+"TMB_crew_individuals"+".msgpack")

In [7]:
TMB_cast_individuals.columns

Index(['gender', 'id', 'name', 'profile_path', 'credit_list', 'credit_number',
       'mean_order', 'fractional_mean_order', 'order_list', 'gender_guesser',
       'importulence', 'profile_path_cast', 'gender_guess_integrated',
       'cast_crew_overlap', 'adult', 'known_for_department', 'popularity',
       'death_day', 'death_month', 'death_year', 'birth_day', 'birth_month',
       'birth_year'],
      dtype='object')

In [8]:
TMB_cast_individuals["adult"].value_counts(dropna=False)

False    722447
True      30134
Name: adult, dtype: int64

In [9]:
TMB_cast_individuals.shape

(752581, 23)

In [None]:
TMB_cast_individuals["adult"].value_counts(dropna=False)

In [11]:
TMB_cast_individuals.isnull().sum()

gender                          0
id                              0
name                            0
profile_path               587155
credit_list                     0
credit_number                   0
mean_order                      0
fractional_mean_order           0
order_list                      0
gender_guesser                  0
importulence                11081
profile_path_cast          752577
gender_guess_integrated    103344
cast_crew_overlap               0
adult                           0
known_for_department          663
popularity                    660
death_day                  725314
death_month                725314
death_year                 724476
birth_day                  650912
birth_month                650912
birth_year                 644464
dtype: int64

In [13]:
TMB_crew_individuals.isnull().sum()

gender                          0
id                              0
name                            0
profile_path               550847
credit_list                     0
credit_number                   0
mean_order                      0
fractional_mean_order           0
order_list                      0
department_list                 0
department_number               0
job_list                        0
job_number                      0
gender_guesser                  0
importulence                53457
gender_guess_integrated     79087
cast_crew_overlap               0
adult                           0
known_for_department          438
popularity                    433
death_day                  578775
death_month                578775
death_year                 578317
birth_day                  553425
birth_month                553425
birth_year                 550067
dtype: int64

In [12]:
search(TMB_cast_individuals,"gender")

gender                     752581
gender_guesser             752581
gender_guess_integrated    649237
dtype: int64

In [None]:
## add aggregations for
# cast_crew_overlap
# adult
# popularity
# born X years before 2000
# dead for X years before 2000



# time consuming (hrs!) - set up and test, leave for overnight run!