### Goal of the notebook

* Build a smaller sub-graph from the dataset explored in the `large_dataset.ipynb` notebook, and save the `users ids` and `movies ids` that should be kept in this new graph

### Imports

In [1]:
import pathlib
import os
import sys
from collections import defaultdict
from statistics import mean
from py2neo import Graph
from py2neo.bulk import merge_nodes, merge_relationships
import random

parent_path = pathlib.Path(os.getcwd()).parent.absolute()
sys.path.append(str(parent_path))

from utils.general import read_csv, df_to_json
from tqdm import tqdm
from tabulate import tabulate


### Load CSVs

In [2]:
# Name of the dataset directory; passed to read_csv as parent_dir_name by the loading cells below.
data_dir = "movies_with_metadata"

In [3]:
# Read movies_metadata from data_dir and convert the result with df_to_json.
# low_memory=False is forwarded to the project's read_csv helper
# (presumably passed on to pandas.read_csv — TODO confirm in utils.general).
# NOTE(review): movies_json is not referenced by any later cell in this notebook.
movies_json = df_to_json(
    read_csv(
        filename="movies_metadata",
        parent_dir_name=data_dir,
        low_memory=False,
    )
)

Reading from: /Users/ioannisathanasiou/diploma/model/movies_with_metadata/movies_metadata.csv


In [4]:
# Read the ratings table from data_dir and convert it with df_to_json.
# Later cells iterate over ratings_json and read rating["userId"] and
# rating["movieId"] from each element.
ratings_json = df_to_json(
    read_csv(
        filename="ratings",
        parent_dir_name=data_dir,
    )
)


Reading from: /Users/ioannisathanasiou/diploma/model/movies_with_metadata/ratings.csv


In [42]:
# Read the links table from data_dir and convert it with df_to_json.
# It is used further down to look up link["imdbId"] for every kept link["movieId"].
links_json = df_to_json(
    read_csv(
        filename="links",
        parent_dir_name=data_dir,
    )
)

Reading from: /Users/ioannisathanasiou/diploma/model/movies_with_metadata/links.csv


### Filter ratings based on users

I will find the average number of ratings per user, and observe how the size of the dataset changes if I keep only the users whose number of ratings falls within a chosen range around that average

In [5]:
# Distinct user ids that appear in the ratings data.
users_ids = {entry["userId"] for entry in ratings_json}

In [6]:
# Number of distinct users in the ratings data.
len(users_ids)

270896

In [7]:
# Group the rating records by the user who submitted them.
ratings_per_user = defaultdict(list)
for entry in ratings_json:
    submitted_by_user = ratings_per_user[entry["userId"]]
    submitted_by_user.append(entry)

In [8]:
# Number of ratings submitted by each user.
ratings_per_user_count = {
    user: len(ratings_per_user[user])
    for user in ratings_per_user
}

In [9]:
# Mean number of ratings submitted per user over the full ratings data.
avg_ratings_count = mean(ratings_per_user_count.values())
avg_ratings_count

96.06745393065974

In [10]:
def count_average_ratings_per_movie(kept_ratings):
    """Return the mean number of ratings per distinct movie.

    Args:
        kept_ratings: iterable of rating records, each indexable by "movieId".

    Returns:
        The mean, over the distinct "movieId" values, of how many records
        carry that id.

    Raises:
        statistics.StatisticsError: if ``kept_ratings`` is empty.
    """
    tally = {}
    for entry in kept_ratings:
        movie = entry["movieId"]
        tally[movie] = tally.get(movie, 0) + 1
    return mean(tally.values())

def count_average_ratings_per_user(kept_ratings):
    """Return the mean number of ratings per distinct user.

    Args:
        kept_ratings: iterable of rating records, each indexable by "userId".

    Returns:
        The mean, over the distinct "userId" values, of how many records
        carry that id.

    Raises:
        statistics.StatisticsError: if ``kept_ratings`` is empty.
    """
    tally = {}
    for entry in kept_ratings:
        user = entry["userId"]
        tally[user] = tally.get(user, 0) + 1
    return mean(tally.values())

In [11]:
def get_statistics_by_kept_users(kept_users):
    """Filter the global ``ratings_json`` down to ``kept_users`` and summarise it.

    Args:
        kept_users: container of user ids, tested with ``in`` once per rating
            (callers in this notebook pass a set).

    Returns:
        A tuple ``(kept_ratings, kept_movies, avg_ratings_per_movie,
        avg_ratings_per_user)``: the list of retained rating records, the set
        of movie ids they mention, and the two averages computed over the
        retained ratings.
    """
    kept_ratings = []
    kept_movies = set()
    # Single pass: collect the retained ratings and the movies they reference.
    for rating in ratings_json:
        if rating["userId"] in kept_users:
            kept_ratings.append(rating)
            kept_movies.add(rating["movieId"])
    return (
        kept_ratings,
        kept_movies,
        count_average_ratings_per_movie(kept_ratings),
        count_average_ratings_per_user(kept_ratings),
    )

In [13]:
# Bucket users by their ratings count expressed as a percentage of the average
# ratings count: the percentage is rounded to an integer and then floored to a
# multiple of 5, which becomes the group key.
users_groups = defaultdict(list)

avg_ratings_count = mean(ratings_per_user_count.values())
for user, n_ratings in ratings_per_user_count.items():
    percent_of_avg = round(n_ratings / avg_ratings_count * 100)
    users_groups[percent_of_avg - percent_of_avg % 5].append(user)

len(users_groups)

643

In [14]:
# Number of users in each group, with the groups ordered by ascending group key.
users_groups_count = {
    group_key: len(users_groups[group_key])
    for group_key in sorted(users_groups)
}

In [19]:
def random_squeeze_group(group, max_users):
    """Randomly shrink a group of users.

    A cap is drawn with ``random.randrange(max_users)``, so it is always
    strictly below ``max_users`` and may be 0.

    Args:
        group: sequence of user ids.
        max_users: exclusive upper bound for the randomly drawn cap.

    Returns:
        ``group`` itself when it has no more members than the cap, otherwise
        a random sample of exactly ``cap`` members.
    """
    cap = random.randrange(max_users)
    fits_under_cap = len(group) <= cap
    return group if fits_under_cap else random.sample(group, cap)

def get_statistics_by_groups_kept(min_group, max_group, max_users_per_group, randomize=True):
    """Pick users from the groups in ``[min_group, max_group]`` and summarise them.

    Reads the global ``users_groups`` mapping (group key -> list of user ids).

    Args:
        min_group: smallest group key to consider (inclusive).
        max_group: largest group key to consider (inclusive).
        max_users_per_group: bound handed to ``random_squeeze_group`` when
            ``randomize`` is True; hard cap on users taken per group otherwise.
        randomize: when True (default, the original behaviour) every in-range
            group is kept only if ``random.random() > 0.2`` and each kept
            group is shrunk with ``random_squeeze_group``. When False the
            selection is deterministic: every in-range group is kept and its
            first ``max_users_per_group`` users are taken.

    Returns:
        A tuple ``(users_to_keep, kept_ratings, kept_movies,
        avg_ratings_per_movie, avg_ratings_per_user)``.
    """
    in_range_groups = [
        group
        for group_key, group in users_groups.items()
        if min_group <= group_key <= max_group
    ]
    if randomize:
        # One random draw per in-range group, in dict order, before any
        # squeezing happens -- the same RNG consumption order as before.
        groups = [group for group in in_range_groups if random.random() > 0.2]
        users_to_keep = [
            userId
            for group in groups
            for userId in set(random_squeeze_group(group, max_users_per_group))
        ]
    else:
        users_to_keep = [
            userId
            for group in in_range_groups
            for userId in group[:max_users_per_group]
        ]
    kept_ratings, kept_movies, avg_ratings_per_movie, avg_ratings_per_user = get_statistics_by_kept_users(set(users_to_keep))
    return users_to_keep, kept_ratings, kept_movies, avg_ratings_per_movie, avg_ratings_per_user

### Average Dataset with around 2M ratings

In [21]:
# Seed the global RNG so the sampled sub-graph (and the CSV files written
# further down) can be reproduced on a fresh kernel run.
random.seed(42)

# The average user has submitted ~96 ratings.
# Keep users whose ratings_count is between 10% and 300% of that average.
min_group = 10
max_group = 300
max_users_per_group = 700
# Users are split into groups by 5% steps of (ratings_count / average ratings_count).
# About 80% of the in-range groups are retained, and from each retained group a
# random number of users (fewer than max_users_per_group) is kept.
users_to_keep, kept_ratings, kept_movies, avg_ratings_per_movie, avg_ratings_per_user = get_statistics_by_groups_kept(
    min_group, max_group, max_users_per_group)

print(tabulate([[len(users_to_keep), len(kept_ratings), len(kept_movies), avg_ratings_per_movie, avg_ratings_per_user]], headers=[
      "Kept users", "Kept Ratings", "Kept Movies", "Avg ratings/movie", "Avg ratings/user"], tablefmt="github"))


|   Kept users |   Kept Ratings |   Kept Movies |   Avg ratings/movie |   Avg ratings/user |
|--------------|----------------|---------------|---------------------|--------------------|
|        16665 |        2543358 |         18041 |             140.977 |            152.617 |


### Save the kept users and movies of the corresponding graph

In [40]:
# Write the kept user ids as a single ", "-separated line.
# The context manager closes the file even if the write raises.
with open(os.path.join("..", "movies_with_metadata", "users_subgraph.csv"), "w") as f:
    f.write(", ".join(str(i) for i in users_to_keep))

In [41]:
# Write the kept movie ids as a single ", "-separated line.
# The context manager closes the file even if the write raises.
with open(os.path.join("..", "movies_with_metadata", "movies_subgraph.csv"), "w") as f:
    f.write(", ".join(str(i) for i in kept_movies))

In [45]:
# Look up the imdbId of every links record whose movieId was kept.
movies_imdbIds_to_keep = [
    entry["imdbId"]
    for entry in links_json
    if entry["movieId"] in kept_movies
]

In [49]:
# Write the kept IMDb ids as a single ", "-separated line.
# The context manager closes the file even if the write raises.
with open(os.path.join("..", "movies_with_metadata", "movies_imdbIds_subgraph.csv"), "w") as f:
    f.write(", ".join(str(i) for i in movies_imdbIds_to_keep))

### Conclusion

| graph | users | movies | ratings | avg ratings per movie | avg ratings per user |
| --- | --- | --- | --- | --- | --- |
| small | 700 | 9k | 100k | 11 | 149 |
| large | 270k | 45k | 25M | 576 | 96 |
| sub-graph | 10k-20k | 10k-20k | 2M | 100-200 | 100-200 |


* To build the sub-graph, I filtered the ratings on the large graph based on the users, following the steps:
    * count the ratings submitted by each user
    * find the average number of ratings per user in the initial graph (96)
    * split the users in groups, based on their ratings count in comparison to the average ratings count per user (96)
    * randomly keep about 80% of the groups whose users have submitted from 10 (96 * 10%) to 288 (96 * 300%) ratings
    * from each remaining group, randomly keep at most 700 users (the per-group cap is itself drawn at random and is always below 700)