In [8]:
"""
Author: Ra Cohen (ra.q.cohen@gmail.com)
Date: May 15, 2023
Purpose: Parse IMDB supplied data sets (https://developer.imdb.com/non-commercial-datasets/) 
for movie information from matched data courtesy of (https://github.com/dhruvilgala/tvtropes)
"""

import pandas as pd
import json

In [2]:
imdb_ratings = pd.read_csv("data/imdb_ratings.tsv", sep = '\t', low_memory=False)

In [3]:
imdb_ratings.head()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1972
1,tt0000002,5.8,263
2,tt0000003,6.5,1819
3,tt0000004,5.6,178
4,tt0000005,6.2,2614


In [4]:
with open('data/id_to_title.json') as json_file:
    id_to_title = json.load(json_file)

In [6]:
test_id = [x for x in id_to_title.keys()][0]

In [7]:
imdb_ratings[imdb_ratings['tconst'] == test_id]

Unnamed: 0,tconst,averageRating,numVotes
52259,tt0075617,6.4,3613


In [25]:
id_to_rating = dict()
failed = 0
for i, id_ in enumerate(id_to_title.keys()):
    row = imdb_ratings[imdb_ratings['tconst'] == id_]
    try:
        id_to_rating[id_] = float(row['averageRating'])
    except:
        failed += 1
    if i % 100 == 0:
        print("Completed {} out of {}".format(i, len(id_to_title.keys())))
print("Completed with {} failures".format(failed))

Completed 0 out of 12967
Completed 100 out of 12967
Completed 200 out of 12967
Completed 300 out of 12967
Completed 400 out of 12967
Completed 500 out of 12967
Completed 600 out of 12967
Completed 700 out of 12967
Completed 800 out of 12967
Completed 900 out of 12967
Completed 1000 out of 12967
Completed 1100 out of 12967
Completed 1200 out of 12967
Completed 1300 out of 12967
Completed 1400 out of 12967
Completed 1500 out of 12967
Completed 1600 out of 12967
Completed 1700 out of 12967
Completed 1800 out of 12967
Completed 1900 out of 12967
Completed 2000 out of 12967
Completed 2100 out of 12967
Completed 2200 out of 12967
Completed 2300 out of 12967
Completed 2400 out of 12967
Completed 2500 out of 12967
Completed 2600 out of 12967
Completed 2700 out of 12967
Completed 2800 out of 12967
Completed 2900 out of 12967
Completed 3000 out of 12967
Completed 3100 out of 12967
Completed 3200 out of 12967
Completed 3300 out of 12967
Completed 3400 out of 12967
Completed 3500 out of 12967
Comp

In [26]:
with open('data/id_to_rating.json', 'w+') as f:
    json.dump(id_to_rating, f)

In [27]:
imdb_info = pd.read_csv("data/imdb_basics.tsv", sep = '\t', low_memory=False)

In [40]:
id_to_genre = dict()
all_genres = set()
failures = 0
for i, id_ in enumerate(id_to_title.keys()):
    row = imdb_info[imdb_info['tconst'] == id_]
    try:
        raw_genres = str(row['genres'].values)
        genres = raw_genres[2:-2].split(',')
        id_to_genre[id_] = genres
        for genre in genres:
            all_genres.add(genre)
    except:
        print(row)
        failures += 1
    if i % 500 == 0:
        print("Completed {} out of {}".format(i, len(id_to_title.keys())))
print("Completed with {} failures".format(failures))

Completed 0 out of 12967
Completed 500 out of 12967
Completed 1000 out of 12967
Completed 1500 out of 12967
Completed 2000 out of 12967
Completed 2500 out of 12967
Completed 3000 out of 12967
Completed 3500 out of 12967
Completed 4000 out of 12967
Completed 4500 out of 12967
Completed 5000 out of 12967
Completed 5500 out of 12967
Completed 6000 out of 12967
Completed 6500 out of 12967
Completed 7000 out of 12967
Completed 7500 out of 12967
Completed 8000 out of 12967
Completed 8500 out of 12967
Completed 9000 out of 12967
Completed 9500 out of 12967
Completed 10000 out of 12967
Completed 10500 out of 12967
Completed 11000 out of 12967
Completed 11500 out of 12967
Completed 12000 out of 12967
Completed 12500 out of 12967
Completed with 0 failures


In [41]:
with open('data/id_to_genres.json', 'w+') as f:
    json.dump(id_to_genre, f)

In [43]:
all_genres_l = list(all_genres)
all_genres_l.sort()
all_genres_l

['',
 'Action',
 'Adult',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Documentary',
 'Drama',
 'Family',
 'Fantasy',
 'Film-Noir',
 'Game-Show',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'News',
 'Reality-TV',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Talk-Show',
 'Thriller',
 'War',
 'Western',
 '\\\\N']

In [None]:
with open('data/id_to_genre_features.json', 'w+') as f:
    json.dump(id_to_genre_features, f)