In [1]:
import pathlib
import os
import sys
from collections import defaultdict
from statistics import mean
from py2neo import Graph
from py2neo.bulk import merge_nodes, merge_relationships

parent_path = pathlib.Path(os.getcwd()).parent.absolute()
sys.path.append(str(parent_path))

from utils.general import read_csv, df_to_json
from tqdm import tqdm
from tabulate import tabulate


### Load CSVs

In [2]:
data_dir = "movies_with_metadata"

In [3]:
movies_json = df_to_json(
    read_csv(
        filename="movies_metadata",
        parent_dir_name=data_dir,
        low_memory=False,
    )
)

Reading from: /Users/ioannisathanasiou/diploma/model/movies_with_metadata/movies_metadata.csv


In [4]:
ratings_json = df_to_json(
    read_csv(
        filename="ratings_small",
        parent_dir_name=data_dir,
    )
)


Reading from: /Users/ioannisathanasiou/diploma/model/movies_with_metadata/ratings_small.csv


### Movies

In [5]:
movies_json[0]

{'adult': 'False',
 'belongs_to_collection': "{'id': 10194, 'name': 'Toy Story Collection', 'poster_path': '/7G9915LfUQ2lVfwMEEhDsn3kT4B.jpg', 'backdrop_path': '/9FBwqcd9IRruEDUrTdcaafOMKUq.jpg'}",
 'budget': '30000000',
 'genres': "[{'id': 16, 'name': 'Animation'}, {'id': 35, 'name': 'Comedy'}, {'id': 10751, 'name': 'Family'}]",
 'homepage': 'http://toystory.disney.com/toy-story',
 'id': '862',
 'imdb_id': 'tt0114709',
 'original_language': 'en',
 'original_title': 'Toy Story',
 'overview': "Led by Woody, Andy's toys live happily in his room until Andy's birthday brings Buzz Lightyear onto the scene. Afraid of losing his place in Andy's heart, Woody plots against Buzz. But when circumstances separate Buzz and Woody from their owner, the duo eventually learns to put aside their differences.",
 'popularity': '21.946943',
 'poster_path': '/rhIRbceoE9lR4veEXuwCC2wARtG.jpg',
 'production_companies': "[{'name': 'Pixar Animation Studios', 'id': 3}]",
 'production_countries': "[{'iso_3166_1':

In [6]:
len(movies_json)

45466

#### Ratings

In [7]:
ratings_json[0]

{'userId': 1, 'movieId': 31, 'rating': 2.5, 'timestamp': 1260759144}

In [8]:
len(ratings_json)

100004

In [28]:
def count_average_ratings_per_movie(kept_ratings):
    ratings_per_movie_count = defaultdict(int)
    for rating in kept_ratings:
        ratings_per_movie_count[rating["movieId"]] += 1
    return mean(ratings_per_movie_count.values())

def count_average_ratings_per_user(kept_ratings):
    ratings_per_user_count = defaultdict(int)
    for rating in kept_ratings:
        ratings_per_user_count[rating["userId"]] += 1
    return mean(ratings_per_user_count.values())

In [29]:
count_average_ratings_per_movie(ratings_json)

11.030664019413193

In [30]:
count_average_ratings_per_user(ratings_json)

149.03725782414307

### Filter ratings based on movies

I will find the average ratings per movie, and observe how the size of the dataset changes if I keep only the movies that have more ratings than a specific threshold

In [9]:
rated_movies_ids = set([
    rating["movieId"]
    for rating in ratings_json
])

In [10]:
len(rated_movies_ids)

9066

In [11]:
ratings_per_movie = defaultdict(list)
for rating in ratings_json:
    ratings_per_movie[rating["movieId"]].append(rating)

In [12]:
ratings_per_movie_count = { movieId: len(ratings) for movieId, ratings in ratings_per_movie.items() }

In [13]:
avg_ratings_count = mean(ratings_per_movie_count.values())
avg_ratings_count

11.030664019413193

In [18]:
def count_movies_ratings_by_threshold(multiplier):
    threshold = avg_ratings_count * multiplier
    very_rated_movies = set(
        [movieId for movieId, ratings in ratings_per_movie_count.items() if ratings >= threshold])
    little_rated_movies = set(
        [movieId for movieId, ratings in ratings_per_movie_count.items() if ratings < threshold])
    ratings_on_very_rated_movies = [
        rating for rating in ratings_json if rating["movieId"] in very_rated_movies]
    ratings_on_little_rated_movies = [
        rating for rating in ratings_json if rating["movieId"] in little_rated_movies]
    return [multiplier, threshold, len(very_rated_movies), len(little_rated_movies), len(ratings_on_very_rated_movies), len(ratings_on_little_rated_movies)]


In [20]:
results = []
for i in range(1, 40):
    print("Multipler:", i/10)
    results.append(count_movies_ratings_by_threshold(i/10))
print("OK")
print(tabulate(results, headers=["High Movies", "Low movies", "Ratings on High", "Ratings on Low"], tablefmt="github"))

Multipler: 0.1
Multipler: 0.2
Multipler: 0.3
Multipler: 0.4
Multipler: 0.5
Multipler: 0.6
Multipler: 0.7
Multipler: 0.8
Multipler: 0.9
Multipler: 1.0
Multipler: 1.1
Multipler: 1.2
Multipler: 1.3
Multipler: 1.4
Multipler: 1.5
Multipler: 1.6
Multipler: 1.7
Multipler: 1.8
Multipler: 1.9
Multipler: 2.0
Multipler: 2.1
Multipler: 2.2
Multipler: 2.3
Multipler: 2.4
Multipler: 2.5
Multipler: 2.6
Multipler: 2.7
Multipler: 2.8
Multipler: 2.9
Multipler: 3.0
Multipler: 3.1
Multipler: 3.2
Multipler: 3.3
Multipler: 3.4
Multipler: 3.5
Multipler: 3.6
Multipler: 3.7
Multipler: 3.8
Multipler: 3.9
OK
|     |          |   High Movies |   Low movies |   Ratings on High |   Ratings on Low |
|-----|----------|---------------|--------------|-------------------|------------------|
| 0.1 |  1.10307 |          6003 |         3063 |             96941 |             3063 |
| 0.2 |  2.20613 |          4801 |         4265 |             94537 |             5467 |
| 0.3 |  3.3092  |          4046 |         5020 |       

### Filter ratings based on users

I will find the average ratings per user, and observe how the size of the dataset changes if I keep only the users that have more ratings than a specific threshold

In [21]:
users_ids = set([
    rating["userId"]
    for rating in ratings_json
])

In [22]:
len(users_ids)

671

In [23]:
ratings_per_user = defaultdict(list)
for rating in ratings_json:
    ratings_per_user[rating["userId"]].append(rating)

In [24]:
ratings_per_user_count = { userId: len(ratings) for userId, ratings in ratings_per_user.items() }

In [25]:
avg_ratings_count = mean(ratings_per_user_count.values())
avg_ratings_count

149.03725782414307

In [26]:
def count_users_ratings_by_threshold(multiplier):
    threshold = avg_ratings_count * multiplier
    very_active_users = set(
        [userId for userId, ratings in ratings_per_user_count.items() if ratings >= threshold])
    little_active_users = set(
        [userId for userId, ratings in ratings_per_user_count.items() if ratings < threshold])
    ratings_of_very_active_users = [
        rating for rating in ratings_json if rating["userId"] in very_active_users]
    ratings_of_little_active_users = [
        rating for rating in ratings_json if rating["userId"] in little_active_users]
    return [multiplier, threshold, len(very_active_users), len(little_active_users), len(ratings_of_very_active_users), len(ratings_of_little_active_users)]


In [27]:
results = []
for i in range(1, 40):
    print("Multipler:", i/10)
    results.append(count_users_ratings_by_threshold(i/10))
print("OK")
print(tabulate(results, headers=["Active Users", "Low Users", "Ratings of Active", "Ratings of Low"], tablefmt="github"))

Multipler: 0.1
Multipler: 0.2
Multipler: 0.3
Multipler: 0.4
Multipler: 0.5
Multipler: 0.6
Multipler: 0.7
Multipler: 0.8
Multipler: 0.9
Multipler: 1.0
Multipler: 1.1
Multipler: 1.2
Multipler: 1.3
Multipler: 1.4
Multipler: 1.5
Multipler: 1.6
Multipler: 1.7
Multipler: 1.8
Multipler: 1.9
Multipler: 2.0
Multipler: 2.1
Multipler: 2.2
Multipler: 2.3
Multipler: 2.4
Multipler: 2.5
Multipler: 2.6
Multipler: 2.7
Multipler: 2.8
Multipler: 2.9
Multipler: 3.0
Multipler: 3.1
Multipler: 3.2
Multipler: 3.3
Multipler: 3.4
Multipler: 3.5
Multipler: 3.6
Multipler: 3.7
Multipler: 3.8
Multipler: 3.9
OK
|     |          |   Active Users |   Low Users |   Ratings of Active |   Ratings of Low |
|-----|----------|----------------|-------------|---------------------|------------------|
| 0.1 |  14.9037 |            671 |           0 |              100004 |                0 |
| 0.2 |  29.8075 |            553 |         118 |               97264 |             2740 |
| 0.3 |  44.7112 |            450 |         221 