# Comparisons tool 2: Reducing distances from video to video

This tool gives a Tournesol user a list of suggested comparisons.

- Suggests comparisons between 2 videos marked as "Public" (it will not suggest comparisons involving "Private" videos)
- Computes the distance between compared videos, i.e. the minimum number of comparisons to go through to get from one video to the other
- Finds good new comparisons to reduce these distances as much as possible

Requirements:
- Knowing how to get the authentication JWT from the Tournesol website
- Having more than 20 compared videos (with few videos, this tool is not useful and may not suggest any comparison)

Special rules:

- Will not suggest adding comparisons to a video that has fewer contributors than the number of comparisons you already made with it\
  (It prefers improving popular videos, to improve global Tournesol connectivity as much as possible)

In [None]:
# Imports
import os
import math
import time
import random
import numpy as np
import networkx as nx
import plotly.graph_objects as go
from datetime import datetime, timezone

# Make sure the notebook runs from the src/ directory: when the current path
# contains a 'src' component, move up to the deepest one so that the relative
# imports and data paths used below resolve.
_pwd = os.path.realpath('.').split(os.sep)
if 'src' in _pwd:
	# Drop every path component located after the last 'src'
	del _pwd[len(_pwd) - _pwd[::-1].index('src'):]
	os.chdir(os.sep.join(_pwd))
# Show the directory actually in use
print(os.path.realpath('.'))

# Local project requirements
from model.tournesol_api import TournesolAPI, get, get_individual_score, pretty_print_vdata

## Parameters

JWT: Get it from tournesol.app

- Open the [website](https://tournesol.app), authenticate, then open the browser dev tools, pick any request to the Tournesol API, look at its Request Headers and copy the value of the `Authorization` header (`Bearer ...`)
- DO NOT SHARE THIS TOKEN WITH ANYONE. NEVER. UNDER ANY CIRCUMSTANCES. It is as sensitive as your password.
- This token expires after some time of inactivity (about a day or so). If the tool fails, try updating the token first.

In [None]:
# PARAMETERS

# API client, built from the JWT typed in at run time
TOURNESOL_API = TournesolAPI(input('JWT (example: "Bearer xxxxxxxxx")'))
# Per-user cache file, named after the username exposed by the API client
_api_cache_path = f"../data/Tournesol_API_cache-{TOURNESOL_API.username}.json.gz"
TOURNESOL_API.loadCache(_api_cache_path)

# Will not suggest to compare videos having more public comparisons than this
# NOTE(review): this value is not read anywhere in the visible cells - confirm it is still needed
MAX_PUBLIC_COMPARISONS = 10
# Number of suggested comparisons to generate
MAX_NUMBER_OF_SUGGESTIONS = 25

In [None]:
# Load dataset: fetch the user's comparisons and build the comparison graphs
TOURNESOL_API.getMyComparedVideos()
comparisons = TOURNESOL_API.getAllMyComparisons()

# public_undgraph: one undirected edge per comparison flagged 'is_public'
# private_digraph: one directed edge per comparison (public or not), oriented
#   according to the sign of the 'largely_recommended' score
public_undgraph = nx.Graph()
private_digraph = nx.DiGraph()
for comparison in comparisons:
	if comparison.get('is_public', False):
		public_undgraph.add_edge(comparison['entity_a'], comparison['entity_b'])

	main_scores = [
		criterion['score']
		for criterion in comparison['criteria_scores']
		if criterion['criteria'] == 'largely_recommended'
	]
	score = main_scores[0]
	# A score of exactly 0 satisfies both tests and yields an edge in each direction
	if score >= 0:
		private_digraph.add_edge(comparison['entity_a'], comparison['entity_b'])
	if score <= 0:
		private_digraph.add_edge(comparison['entity_b'], comparison['entity_a'])

# Live undirected view over every comparison, whatever its direction
private_undgraph = private_digraph.to_undirected(as_view=True)

# Video data for every compared video, keyed by video id
videos = {}
for vid in private_undgraph.nodes:
	videos[vid] = TOURNESOL_API.getVData(vid, useCache=True, saveCache=False)

# Videos rated publicly but without any public comparison still belong to the public graph
for vid, video in videos.items():
	if not get(video, False, 'individual_rating', 'is_public'):
		continue
	if vid in public_undgraph:
		continue
	public_undgraph.add_node(vid)

print('Videos', len(videos))
print('Comparisons', len(comparisons))
print('Public', public_undgraph)
print('Private', private_undgraph)
print('Directed', private_digraph)

In [None]:
# Suggest comparisons
#
# Greedy loop: at each step, pick the candidate video that is globally the
# farthest from the other candidates, pair it with the farthest candidate it
# is not yet linked to in the directed graph, print the pair, then simulate
# the new comparison in copies of the graphs before picking the next pair.

def _parse_iso_datetime(text):
	"""Parse an ISO 8601 timestamp into a datetime.

	`datetime.fromisoformat` only accepts a trailing 'Z' (UTC designator) since
	Python 3.11, so it is rewritten as an explicit '+00:00' offset first.
	"""
	if text.endswith('Z'):
		text = text[:-1] + '+00:00'
	return datetime.fromisoformat(text)


def _pubpriv(pub, priv, cutoff=None):
	"""Format a value measured on the public graph next to its private counterpart.

	pub: value measured on the public graph
	priv: same value measured on the private graph
	cutoff: `pub` values at or above it are displayed as infinite. Defaults to
		MAX_DISTANCE_CUTOFF, the value used for unreachable distances. Pass
		math.inf for plain counts, which must never be displayed as infinite.

	Returns 'pub' alone when both values are equal, 'pub(delta)' otherwise, or
	' ∞(priv)' when only the public value is at or beyond the cutoff.
	"""
	if cutoff is None:
		cutoff = MAX_DISTANCE_CUTOFF
	if pub == priv:
		if pub >= cutoff:
			return ' ∞    '
		return f"{pub:2d}    "

	if pub >= cutoff:
		return f" ∞({priv:2d})"

	return f"{pub:2d}({priv-pub:+1d})"


# Public video with the most comparisons (public or private); default=None avoids
# a ValueError when the public graph is empty (no candidate is evaluated then)
centernode = max(public_undgraph.nodes, key=lambda v: private_undgraph.degree[v], default=None)
contribs = {vid: get(videos[vid], 0, 'collective_rating', 'n_contributors') for vid in public_undgraph.nodes}
# Single reference instant, so every video is tested against the same "now"
_now = datetime.now(timezone.utc)
candidates = {vid: public_undgraph.degree[vid] for vid in public_undgraph.nodes
	if (
		# more contributors than the user's own comparisons (public or private) with the video
		contribs[vid] > private_undgraph.degree[vid]
		# Or if has no connection to the highest degree node
		or not nx.has_path(private_undgraph, vid, centernode)
	)
	# days since last comparison >= public comparisons (x public comparisons: will not be suggested for x days after a comparison)
	and (_now - _parse_iso_datetime(get(videos[vid], '2000-01-01T00:00:00Z', 'individual_rating', 'last_compared_at'))).days >= public_undgraph.degree[vid]
}

# Get all candidates scores
indiv_score = {vid: get_individual_score(videos[vid]) or 0 for vid in candidates}

# Distances are explored up to this depth; unreachable videos count as being at this distance
MAX_DISTANCE_CUTOFF = int(math.sqrt(public_undgraph.number_of_nodes())+1)

# Work on copies so that suggested comparisons can be simulated without altering the real graphs
sim_public_undgraph = public_undgraph.copy()
sim_private_digraph = private_digraph.copy()
# Live view: follows the edges added to sim_private_digraph
sim_private_undgraph = sim_private_digraph.to_undirected(as_view=True)

suggested = 0

print(' num:                        Comparison URL                  distance & individual comparisons: public(private)')
while suggested < MAX_NUMBER_OF_SUGGESTIONS and len(candidates) > 1:
	# distances[a][b] = (public distance, private distance) between candidates a and b
	distances:dict[str,dict[str,tuple[int,int]]] = {}
	# Sum of the distances from each candidate to all the other candidates
	dist_public:dict[str,int] = {}
	dist_private:dict[str,int] = {}
	clist = sorted(candidates)
	for c in clist:
		private_dists_from_c = nx.single_source_shortest_path_length(sim_private_undgraph, source=c, cutoff=MAX_DISTANCE_CUTOFF)
		public_dists_from_c = nx.single_source_shortest_path_length(sim_public_undgraph, source=c, cutoff=MAX_DISTANCE_CUTOFF)
		distances[c] = {}
		dist_public[c] = 0
		dist_private[c] = 0
		for d in clist:
			# clist is sorted: only visit the candidates preceding c, so each pair is handled once
			if d >= c: break
			prv_dist = private_dists_from_c.get(d, MAX_DISTANCE_CUTOFF)
			pub_dist = public_dists_from_c.get(d, MAX_DISTANCE_CUTOFF)
			dist_private[c] += prv_dist
			dist_private[d] += prv_dist
			dist_public[c] += pub_dist
			dist_public[d] += pub_dist
			distances[c][d] = (pub_dist, prv_dist)
			distances[d][c] = (pub_dist, prv_dist)

	# Get as the first video, the one that:
	# - (1) has the most distance to others in public comparisons
	# - (2) has the most distance to others in private comparisons
	# - (3) has the largest difference of contributors - indiv_public_comparisons
	cmp1 = max(distances, key=lambda c1: (dist_public[c1], dist_private[c1], contribs[c1] - sim_public_undgraph.degree[c1]))

	# Second candidate must not already be connected to the first candidate in the directed graph
	connected_nodes = {cmp1, *nx.descendants(sim_private_digraph, cmp1), *nx.ancestors(sim_private_digraph, cmp1)}
	candidates_2 = [vid for vid in candidates if vid not in connected_nodes]
	if candidates_2:
		# Get as the second video, the one that:
		# - (1) has the most distance to cmp1 in public graph
		# - (2) has the most distance to cmp1 in private graph
		# - (3) has the indiv_score the most similar to cmp1
		cmp2 = max(candidates_2, key=lambda c2: (distances[cmp1][c2], -abs(indiv_score[c2] - indiv_score[cmp1])))

		# Shuffle left&right
		(cmp1, cmp2) = random.choice([(cmp1, cmp2), (cmp2, cmp1)])
		pair_dist_pub, pair_dist_prv = distances[cmp1][cmp2]

		suggested += 1
		# Comparison counts are formatted with an infinite cutoff: a video with many
		# comparisons must show its real count, never the "unreachable" symbol
		cmps1 = _pubpriv(public_undgraph.degree[cmp1], private_undgraph.degree[cmp1], cutoff=math.inf)
		cmps2 = _pubpriv(public_undgraph.degree[cmp2], private_undgraph.degree[cmp2], cutoff=math.inf)
		print(f"{suggested:4d}: https://tournesol.app/comparison?uidA={cmp1}&uidB={cmp2} dist:{_pubpriv(pair_dist_pub,pair_dist_prv)} cmps:{cmps1} & {cmps2}")
		# Simulate the suggested comparison (both directions, its outcome being unknown)
		sim_private_digraph.add_edge(cmp1, cmp2)
		sim_private_digraph.add_edge(cmp2, cmp1)
		sim_public_undgraph.add_edge(cmp1, cmp2)
		candidates.pop(cmp2)
	# The first video leaves the candidates whether or not a partner was found
	candidates.pop(cmp1)