In [1]:
import os
import json
import numpy as np
import pandas as pd
from pandas_profiling import ProfileReport

In [2]:
# When True, the dataset-loading cell also writes a pandas-profiling HTML
# report for every table it reads; leave False to skip report generation.
isGenerateReport = False

In [3]:
# Load every CSV in ./original-dataset into a DataFrame, keyed by the file's
# base name (e.g. "genome-scores.csv" -> "genome-scores").
dataset_tables = {}

# sorted() makes the load order deterministic (os.listdir order is arbitrary).
for file in sorted(os.listdir("./original-dataset")):
    fileName, extension = os.path.splitext(file)
    # Skip anything that is not a CSV (hidden files, checkpoint folders, ...)
    # instead of handing it to pd.read_csv.
    if extension.lower() != ".csv":
        continue

    df = pd.read_csv(f"./original-dataset/{file}")
    dataset_tables[fileName] = df

    if isGenerateReport:
        # The report directory may not exist yet on a fresh checkout.
        os.makedirs("./profile-reports", exist_ok=True)
        title = "{title} Report".format(title=fileName.replace("-", " ").title())
        ProfileReport(df, title=title, explorative=True).to_file(f"./profile-reports/{fileName}-report.html")

In [4]:
# Display the loaded tables, keyed by CSV base name.
dataset_tables

{'genome-scores':           movieId  tagId  relevance
 0               1      1    0.02500
 1               1      2    0.02500
 2               1      3    0.05775
 3               1      4    0.09675
 4               1      5    0.14675
 ...           ...    ...        ...
 11709763   131170   1124    0.58775
 11709764   131170   1125    0.01075
 11709765   131170   1126    0.01575
 11709766   131170   1127    0.11450
 11709767   131170   1128    0.02175
 
 [11709768 rows x 3 columns],
 'genome-tags':       tagId           tag
 0         1           007
 1         2  007 (series)
 2         3  18th century
 3         4         1920s
 4         5         1930s
 ...     ...           ...
 1123   1124       writing
 1124   1125         wuxia
 1125   1126          wwii
 1126   1127        zombie
 1127   1128       zombies
 
 [1128 rows x 2 columns],
 'links':        movieId   imdbId    tmdbId
 0            1   114709     862.0
 1            2   113497    8844.0
 2            3   113228  

In [5]:
# Report how many rows each loaded table contains.
for tableName, table in dataset_tables.items():
    rowCount = len(table.index)
    print(f"Number of rows in {tableName}: {rowCount}")

Number of rows in genome-scores: 11709768
Number of rows in genome-tags: 1128
Number of rows in links: 27278
Number of rows in movies: 27278
Number of rows in ratings: 20000263
Number of rows in tags: 465564


In [6]:
def getMissingKeys(primaryTable, foreignTable, keyName, isShowProgress=False):
    """Find the keys of ``primaryTable`` that never occur in ``foreignTable``.

    Args:
        primaryTable: DataFrame whose ``keyName`` column holds the keys to check.
        foreignTable: DataFrame whose ``keyName`` column is searched.
        keyName: Name of the column that exists in both tables.
        isShowProgress: When True, print the running index of each distinct
            primary key as it is checked.

    Returns:
        A tuple ``(numMissingKeys, missingKeys)`` where ``missingKeys`` lists
        each distinct value of ``primaryTable[keyName]`` that is absent from
        ``foreignTable[keyName]``, in order of first appearance in
        ``primaryTable``, and ``numMissingKeys`` is the length of that list.
    """
    foreignKeys = set(foreignTable[keyName].to_list())
    # dict.fromkeys de-duplicates while keeping first-appearance order, so the
    # result does not depend on set hashing and is reproducible between runs.
    primaryKeys = dict.fromkeys(primaryTable[keyName].to_list())

    missingKeys = []

    for i, primaryKey in enumerate(primaryKeys):
        if primaryKey not in foreignKeys:
            missingKeys.append(primaryKey)

        if isShowProgress:
            print(i)

    return len(missingKeys), missingKeys

In [7]:
# Movies without any rating: returns (count, movieIds present in "movies"
# but absent from "ratings").
getMissingKeys(dataset_tables["movies"], dataset_tables["ratings"], "movieId")

(534,
 [26018,
  26580,
  27249,
  27396,
  31797,
  32773,
  33019,
  33229,
  33573,
  45994,
  63280,
  65078,
  66622,
  69332,
  69565,
  69864,
  72681,
  72897,
  72908,
  77451,
  78713,
  79363,
  80226,
  80592,
  80787,
  80827,
  84291,
  85476,
  86314,
  86372,
  86949,
  87266,
  87466,
  88471,
  88833,
  89100,
  89135,
  89215,
  89341,
  89482,
  89806,
  90035,
  90116,
  90493,
  92268,
  92516,
  92845,
  92925,
  93002,
  94076,
  94435,
  94725,
  94808,
  96086,
  96176,
  96193,
  96275,
  96576,
  96598,
  98389,
  98398,
  98452,
  98583,
  98924,
  99020,
  99515,
  99562,
  99717,
  100463,
  100577,
  101216,
  101224,
  101237,
  101407,
  101472,
  101505,
  102156,
  102327,
  102417,
  102821,
  102823,
  103177,
  103472,
  103641,
  104095,
  104748,
  104750,
  105111,
  105279,
  105542,
  105792,
  105796,
  105961,
  105963,
  106028,
  106188,
  106245,
  106268,
  106423,
  106483,
  106521,
  106859,
  107226,
  107289,
  107640,
  108046,
  

In [8]:
# Tables that reference each primary table through a shared key column.
movieForeignTables = ["links", "ratings", "tags", "genome-scores"]
genomeTagForeignTables = ["genome-scores"]


def _reportForeignKeys(primaryTableName, foreignTableNames, keyName):
    """Cross-check one primary table against every table that references it.

    For each name in ``foreignTableNames`` this prints a one-line summary and
    records two lists:

    * ``missingForeignKeys``: keys of the primary table that the foreign
      table never uses.
    * ``nonExistentKeys``: keys used by the foreign table that do not exist
      in the primary table.

    Args:
        primaryTableName: Key into ``dataset_tables`` for the primary table.
        foreignTableNames: Keys into ``dataset_tables`` for the referencing tables.
        keyName: Column shared by the primary table and every foreign table.

    Returns:
        ``{"missingForeignKeys": {table: [...]}, "nonExistentKeys": {table: [...]}}``
    """
    report = {
        "missingForeignKeys": {},
        "nonExistentKeys": {}
    }
    primaryTable = dataset_tables[primaryTableName]

    for tableName in foreignTableNames:
        foreignTable = dataset_tables[tableName]
        numMissingForeignKeys, missingForeignKeys = getMissingKeys(primaryTable, foreignTable, keyName)
        numNonExistentPrimaryKeys, nonExistentPrimaryKeys = getMissingKeys(foreignTable, primaryTable, keyName)

        report["missingForeignKeys"][tableName] = missingForeignKeys
        report["nonExistentKeys"][tableName] = nonExistentPrimaryKeys

        print(f"{tableName} has {numMissingForeignKeys} missing foreign keys | {numNonExistentPrimaryKeys} non-existent keys")

    return report


print("Movies:")
movieForeignKeys = _reportForeignKeys("movies", movieForeignTables, "movieId")

print("\nGenome Tags:")
genomeTagForeignKeys = _reportForeignKeys("genome-tags", genomeTagForeignTables, "tagId")

# Persist both reports so the key lists can be inspected outside the notebook.
with open("movie-missing-foreign-keys.json", "w") as outfile:
    json.dump(movieForeignKeys, outfile)

with open("genome-tag-foreign-keys.json", "w") as outfile:
    json.dump(genomeTagForeignKeys, outfile)

Movies:
links has 0 missing foreign keys | 0 non-existent keys
ratings has 534 missing foreign keys | 0 non-existent keys
tags has 7733 missing foreign keys | 0 non-existent keys
genome-scores has 16897 missing foreign keys | 0 non-existent keys

Genome Tags:
genome-scores has 0 missing foreign keys | 0 non-existent keys


In [9]:
# When True, recompute each movie's most relevant tags from genome-scores and
# rewrite movie_to_relevant_tags.json; when False, load that JSON file instead.
isGenerateRelevantTags = False

In [10]:
# Maps movieId (as a string, matching JSON object keys) to the names of the
# genome tags with the highest relevance scores for that movie.
movie_to_tag_relevance = {}

if isGenerateRelevantTags:
    num_relevant_tags = 4  # number of top-scoring tags kept per movie

    genome_scores = dataset_tables["genome-scores"]
    movie_ids = np.sort(genome_scores["movieId"].unique()).tolist()
    print("Number of movie ids:", len(movie_ids))

    # One row per movie, one column per tag. This relies on genome-scores
    # being ordered by movieId and then tagId, with the same number of tag
    # rows for every movie (TODO confirm for other datasets); reshape raises
    # if the rows cannot be divided evenly between the movies.
    tags_scores_matrix = genome_scores["relevance"].to_numpy().reshape(len(movie_ids), -1)
    print("Tag scores matrix shape:", tags_scores_matrix.shape)

    genome_tags = dataset_tables["genome-tags"]
    tag_id_to_name = dict(zip(genome_tags["tagId"].tolist(), genome_tags["tag"].tolist()))

    for i, movie_id in enumerate(movie_ids):
        # argpartition returns the indices of the top scores in no guaranteed order.
        top_columns = np.argpartition(tags_scores_matrix[i], -num_relevant_tags)[-num_relevant_tags:].tolist()
        # Column x is taken to hold tagId x + 1 (assumes tag ids are contiguous
        # and start at 1). Keys are stored as str so the freshly generated dict
        # has the same key type as the one loaded back from JSON below.
        movie_to_tag_relevance[str(movie_id)] = [tag_id_to_name[x + 1] for x in top_columns]

    with open("movie_to_relevant_tags.json", "w") as outfile:
        json.dump(movie_to_tag_relevance, outfile)
else:
    # Reuse the previously generated mapping instead of recomputing it.
    with open('movie_to_relevant_tags.json', 'r') as fp:
        movie_to_tag_relevance = json.load(fp)

In [11]:
# Display the movieId -> relevant tag names mapping.
movie_to_tag_relevance

{'1': ['kids and family', 'pixar animation', 'computer animation', 'toys'],
 '2': ['kids', 'adventure', 'jungle', 'children'],
 '3': ['comedy', 'sequels', 'good sequel', 'sequel'],
 '4': ['romantic', 'girlie movie', 'chick flick', 'women'],
 '5': ['sequel', 'sequels', 'good sequel', 'father daughter relationship'],
 '6': ['bank robbery', 'gunfight', 'heist', 'crime'],
 '7': ['romance', 'romantic comedy', 'romantic', 'remake'],
 '8': ['childhood', 'unlikely friendships', 'based on a book', 'adventure'],
 '9': ['video game adaptation', 'lone hero', 'good action', 'action'],
 '10': ['franchise', 'bond', '007 (series)', '007'],
 '11': ['world politics', 'political', 'politics', 'president'],
 '12': ['goofy', 'comedy', 'spoof', 'parody'],
 '13': ['dog', 'animated', 'animation', 'talking animals'],
 '14': ['politics', 'biographical', 'world politics', 'president'],
 '15': ['action', 'swashbuckler', 'treasure hunt', 'pirates'],
 '16': ['gangster', 'mob', 'organized crime', 'mafia'],
 '17': ['