In [None]:
# Imports
import time
import math
import requests
import random
import dateutil.parser as dateparse
import datetime

# Parameters

JWT: Get it from tournesol.app
	- open the website, open the browser dev tools, pick any request sent to the Tournesol API, look at its Request Headers and copy the value of the Authorization header ("Bearer ...")
	- DO NOT SHARE THIS TOKEN WITH ANYONE. NEVER. UNDER ANY CIRCUMSTANCES. Even support will never need it.
	- This token expires after some time of inactivity. If the tool fails, try updating the token first.

LNGS: Pick all the languages of the videos to keep

In [None]:
# PARAMETERS
# Value of the "Authorization" request header, sent as-is with every API call (keep it secret)
JWT = "Bearer XXXXXXXXXXXXXXXXXXXXXXXXXXXXXX"
# Consolidation phase will only suggest videos in these languages
LNGS = ['fr', 'en']


# Technical

Below is some technical stuff used for the suggestion mechanism.

Run all the following cells one by one; you do not need to change anything.

In [None]:
# Functions utils
def callTournesol(path: str):
	"""
	Perform an authenticated GET on the Tournesol API and return the decoded JSON body.

	path: URL path (query string included) appended to the API base URL.

	Waits one second before every request.
	Raises requests.HTTPError when the API answers with a 4xx/5xx status (for
	example when the JWT is rejected), instead of handing an error payload to
	the caller.
	"""
	BASE_URL='https://api.tournesol.app/'
	time.sleep(1)  # Pause before each call so the API is not flooded
	response = requests.get(
		BASE_URL + path,
		headers={'Authorization': JWT},
		timeout=30,  # Never hang forever on a stalled connection
	)
	# Fail loudly here: an error body has no 'count'/'results' keys and would
	# otherwise surface later as an obscure KeyError
	response.raise_for_status()
	return response.json()

def callTournesolMulti(path: str, args:str=None):
	"""
	Fetch every page of a paginated Tournesol endpoint and return all the results in one list.

	path: endpoint path, without query string.
	args: optional extra query string, without leading '?' or '&' (e.g. 'a=1&b=2').

	Pages of 100 items are requested through callTournesol until as many items
	as announced by the 'count' field have been collected. The progress is
	printed as percentages on the current line, followed by the number of items.
	"""
	def ratio(done: int, expected: int):
		# Nothing expected means everything is already there (and avoids dividing by zero)
		return 1 if expected == 0 else done / expected

	url = path + '?limit=100' + (('&' + args) if args else '')
	page = callTournesol(url)
	total = page['count']
	allRes = list(page['results'])
	print(f'{ratio(len(allRes), total):.0%}', end=' ')

	while len(allRes) < total:
		page = callTournesol(url + f'&offset={len(allRes)}')
		total = page['count']
		if not page['results']:
			# The API has nothing more to give although 'count' announces more:
			# stop here instead of requesting the same offset forever
			break
		allRes += page['results']
		print(f'{ratio(len(allRes), total):.0%}', end=' ')
	print(f'(=> {len(allRes)})')
	return allRes

def get(json, default, *fields):
	"""
	Walk nested containers following fields and return the value found at the end.

	json: container to read from (returned untouched when no field is given)
	default: value returned as soon as a field is absent, or leads to a falsy
		value (None, 0, '', empty container...)
	fields: successive keys to follow
	"""
	node = json
	for key in fields:
		if key not in node:
			return default
		node = node[key]
		# A falsy value is handled like a missing one
		if not node:
			return default
	return node

def magic_contribs(contrib: int):
	"""
	Turn a number of contributors into a sort rank (the smallest rank comes first).

	Resulting order:
	 1. 2 contributors  (rank -2)
	 2. 1 contributor   (rank -1)
	 3. 3 contributors  (rank 3)
	 4. 4 contributors  (rank 4), and so on
	 last. no contributor at all (rank 999999)
	"""
	if contrib == 0:
		rank = 999999
	elif contrib < 3:
		rank = -contrib
	else:
		rank = contrib
	return rank


def rndAB():
	"""Return the letters 'A' and 'B' as a pair, in a random order."""
	if random.random() > 0.5:
		return ('A', 'B')
	return ('B', 'A')


def get_individual_score(vdata):
	"""
	Return the 'largely_recommended' score found in the individual rating of vdata.

	Returns None when vdata holds no individual criteria scores, or none for
	that criterion.
	"""
	criteria_scores = get(vdata, [], 'individual_rating', 'criteria_scores')
	matching = list(filter(lambda entry: entry['criteria'] == 'largely_recommended', criteria_scores))
	if not matching:
		return None
	return matching[0]['score']


In [None]:
# Get Already Compared list (COMPARED)
def get_already_compared():
	"""
	Download the videos already rated by the user.

	Returns a tuple (all videos, selected videos):
	 - all videos: every rated video, sorted by increasing last comparison
	   date (videos without such a date sort under the text 'unknown')
	 - selected videos: the ones of that list rated by at least 3 contributors
	   and compared more than once by the user
	"""
	def last_compared(video):
		return get(video, 'unknown', 'individual_rating', 'last_compared_at')

	def is_selected(video):
		enough_contributors = get(video, 0, 'collective_rating', 'n_contributors') >= 3
		return enough_contributors and get(video, 0, 'individual_rating', 'n_comparisons') > 1

	print('Extracting compared videos...', end=' ')
	rated = callTournesolMulti('users/me/contributor_ratings/videos')
	rated.sort(key=last_compared)
	return (rated, list(filter(is_selected, rated)))

In [None]:
# Get Rate Later list (RATE_LATER)
def get_rate_later(exclude):
	"""
	Download the rate-later list of the user, without the videos listed in exclude.

	exclude: videos to leave out (matched on their entity uid)

	The result is sorted by, in this order:
	 - rank of the number of contributors (see magic_contribs)
	 - number of collective comparisons
	 - publication date
	"""
	def uid_of(video):
		return get(video, '?', 'entity', 'uid')

	def priority(video):
		return (
			magic_contribs(get(video, 0, 'collective_rating', 'n_contributors')),
			get(video, 0, 'collective_rating', 'n_comparisons'),
			get(video, 'unknown', 'entity', 'metadata', 'publication_date'),
		)

	print('Extracting rate_later list...', end=' ')
	fetched = callTournesolMulti('users/me/rate_later/videos')

	excluded_uids = set(map(uid_of, exclude))
	kept = filter(lambda video: uid_of(video) not in excluded_uids, fetched)
	return sorted(kept, key=priority)


In [None]:
# Cache DistComparisonChecker
class DistComparisonChecker:
	"""
	Tells whether two videos are far enough from each other to be compared.

	Two videos are accepted as a pair when they are distinct, were never
	compared with each other, and share no comparison partner. The partners of
	a video are downloaded once and kept in `cache`.
	"""
	def __init__(self):
		# uid of a video -> uids of every video it has been compared with
		self.cache: dict[str,set[str]] = dict()

	def _get_data_cached(self, vid):
		"""Download every comparison involving the uid `vid` and cache the uids of its partners."""
		print(f'Obtaining comparisons with {vid}...', end=' ')
		allRes = callTournesolMulti(f"users/me/comparisons/videos/{vid}/")
		partners = set()
		for g in allRes:
			if g['entity_a']['uid'] == vid:
				partners.add(g['entity_b']['uid'])
			elif g['entity_b']['uid'] == vid:
				partners.add(g['entity_a']['uid'])
		self.cache[vid] = partners

	def _ensure_cached(self, vid):
		"""Download the partners of `vid` unless they are already cached."""
		if vid not in self.cache:
			self._get_data_cached(vid)

	@staticmethod
	def _are_distant(vid1, partners1, vid2, partners2):
		"""Apply the pairing rule to two uids and their sets of partners."""
		if vid1 == vid2:
			# A video with no comparison yet would otherwise be accepted as its own pair
			return False
		return vid1 not in partners2 and vid2 not in partners1 and partners1.isdisjoint(partners2)

	def check(self, vdata1, vdata2):
		"""Return True when both videos can be paired, downloading their comparisons when not cached yet."""
		vid1 = get(vdata1, None, 'entity', 'uid')
		vid2 = get(vdata2, None, 'entity', 'uid')
		self._ensure_cached(vid1)
		self._ensure_cached(vid2)
		return self._are_distant(vid1, self.cache[vid1], vid2, self.cache[vid2])

	def checkfast(self, vdata1, vdata2):
		"""
		Same as check but without any download: a video missing from the cache
		is handled as if it had no comparison at all.
		"""
		vid1 = get(vdata1, None, 'entity', 'uid')
		vid2 = get(vdata2, None, 'entity', 'uid')
		return self._are_distant(vid1, self.cache.get(vid1, set()), vid2, self.cache.get(vid2, set()))

	def addAsCompared(self, vdata1, vdata2):
		"""
		Record in the cache that both videos are now compared with each other,
		and stamp their 'individual_rating' with the current UTC time as
		'last_compared_at' (vdata1 and vdata2 are modified in place).
		"""
		vid1 = get(vdata1, None, 'entity', 'uid')
		vid2 = get(vdata2, None, 'entity', 'uid')
		self._ensure_cached(vid1)
		self._ensure_cached(vid2)
		self.cache[vid1].add(vid2)
		self.cache[vid2].add(vid1)

		# Naive UTC timestamp, same text format as the deprecated datetime.utcnow()
		now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat()
		for vdata in (vdata1, vdata2):
			if not vdata.get('individual_rating', None):
				vdata['individual_rating'] = dict()
			vdata['individual_rating']['last_compared_at'] = now

	def clear(self):
		"""Forget every cached comparison."""
		self.cache.clear()

In [None]:
# Phase Consolidate
def phase_consolidate(index:int, all_compared:list, DCC: 'DistComparisonChecker'):
	"""
	Suggest a comparison between the two most similar already compared videos.

	Candidates are the videos of all_compared that:
		- have an individual 'largely_recommended' score
		- have at least 2 contributors and at least 3 comparisons made by the user
		- are in one of the LNGS languages

	A pair of candidates is eligible when both videos have:
		- Same language
		- Exact same number of (individual) comparisons made
		- No comparison in common, nor with each other (DCC)

	The eligible pair with the smallest fitness is picked (the first one found on ties).
	The fitness is the weighted sum of the squared differences of the values below,
	each difference being divided by the range of the value among the candidates:
		- Individual score (weight 2)
		- Collective tournesol score (weight 1)
		- Sqrt(duration) (weight 1)
		- Sqrt(number of days since publication) (weight 1)

	Eligibility is evaluated with DCC.checkfast; the picked pair is then confirmed
	with DCC.check before being recorded with DCC.addAsCompared.
	The videos are NOT removed from the list.

	index: number printed in front of the suggestion
	Returns the line to display (with the comparison URL), or None when no pair is
	found or when the picked pair is refused by DCC.check.
	"""
	# Naive UTC "now", to be subtracted from publication dates parsed without timezone
	now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
	weights = (('ind', 2), ('col', 1), ('dur', 1), ('air', 1))

	# Collect the candidates, with fast access to the data used for matching
	vdata = []
	for v in all_compared:
		lng = get(v, None, 'entity', 'metadata', 'language')
		cmp = get(v, 0, 'individual_rating', 'n_comparisons')
		cnt = get(v, 0, 'collective_rating', 'n_contributors')
		indiv_score = get_individual_score(v)
		if indiv_score is None or cnt < 2 or cmp < 3 or lng not in LNGS:
			continue

		# Metadata is only read for candidates, so a discarded video with
		# missing metadata cannot abort the whole phase
		published = get(v, None, 'entity', 'metadata', 'publication_date')
		# A missing publication date counts as 0 day, like a missing duration counts as 0
		age_days = (now - dateparse.parse(published, ignoretz=True)).days if published else 0
		vdata.append({
			'lng': lng,
			'cmp': cmp,
			'ind': indiv_score,
			'col': get(v, 0, 'collective_rating', 'tournesol_score'),
			# max(0, ...) keeps sqrt away from a negative value (e.g. date in the future)
			'dur': math.sqrt(max(0, get(v, 0, 'entity', 'metadata', 'duration'))),
			'air': math.sqrt(max(0, age_days)),
			'full': v
		})

	if len(vdata) < 2:
		return None

	# Range of each value among the candidates. A null range means the value is
	# the same everywhere (all differences are 0): 1 is used to avoid dividing by zero
	spans = dict()
	for (key, _) in weights:
		values = [candidate[key] for candidate in vdata]
		spans[key] = (max(values) - min(values)) or 1

	# Find best pair
	bestpair = None
	bestfitness = math.inf
	for i1 in range(1, len(vdata)):
		v1 = vdata[i1]

		for v2 in vdata[:i1]:
			if (v1['lng'] != v2['lng']
				or v1['cmp'] != v2['cmp']
				or (not DCC.checkfast(v1['full'], v2['full']))
			):
				continue

			# Get pair score
			fitness = sum(weight * ((v1[key] - v2[key]) / spans[key]) ** 2 for (key, weight) in weights)

			if fitness < bestfitness:
				bestfitness = fitness
				bestpair = (v1['full'], v2['full'])

	if not bestpair or not DCC.check(bestpair[0], bestpair[1]):
		return None

	DCC.addAsCompared(bestpair[0], bestpair[1])
	ab = rndAB()
	return f'{index:4d}. [*] https://tournesol.app/comparison?uid{ab[0]}=' + get(bestpair[0], None, 'entity', 'uid') + f"&uid{ab[1]}=" + get(bestpair[1], None, 'entity', 'uid')


In [None]:
# Comparisons Generator

def phase_init(compared:list, all_compared:list, rate_later:list):
	"""
	Open the session with the first video of RATE_LATER.

	The video is removed from rate_later and appended to both compared and
	all_compared. Only half of a comparison URL is built: the user completes
	it with an already compared video of their choice.

	Returns (the video, the line to display).
	"""
	vid_new = rate_later.pop(0)
	for target in (compared, all_compared):
		target.append(vid_new)
	(side_new, side_other) = rndAB()
	uid_new = get(vid_new, None, 'entity', 'uid')
	return (vid_new, f'{0:4d}. [+] https://tournesol.app/comparison?uid{side_new}=' + uid_new + f"&uid{side_other}=")


def phase_intricate(index: int, vid_new, compared:list, DCC: 'DistComparisonChecker'):
	"""
	Pair vid_new with an already compared video.

	The videos of compared are tried in order; the first one accepted by
	DCC.check together with vid_new is picked (vid_new itself is skipped if it
	is part of the list). The comparisons used by that check come from
	https://api.tournesol.app/users/me/comparisons/videos/yt:<vid>/

	The picked video is removed from compared and the pair is recorded with
	DCC.addAsCompared. The video is pushed back at the end of compared when its
	'n_comparisons' is at most 4 (a missing count is read as 999, hence not
	pushed back).

	index: number printed in front of the suggestion
	Returns (vid_new, line to display), or (None, None) when no video of
	compared can be paired with vid_new.
	"""
	uid_new = get(vid_new, None, 'entity', 'uid')
	for (i, candidate) in enumerate(compared):
		# Never suggest comparing a video with itself
		if get(candidate, None, 'entity', 'uid') == uid_new:
			continue
		if DCC.check(vid_new, candidate):
			break
	else:
		# A pair is returned even on failure, because the caller always unpacks the result
		return (None, None)

	vid_old = compared.pop(i)
	DCC.addAsCompared(vid_new, vid_old)
	if get(vid_old, 999, 'individual_rating', 'n_comparisons') <= 4:
		compared.append(vid_old)
	ab = rndAB()
	return (vid_new, f'{index:4d}. [#] https://tournesol.app/comparison?uid{ab[0]}=' + get(vid_old, None, 'entity', 'uid') + f"&uid{ab[1]}=" + uid_new)


def phase_expand(index:int, vid_old, all_compared:list, compared:list, rate_later:list, DCC: 'DistComparisonChecker'):
	"""
	Introduce the next video of RATE_LATER by comparing it with vid_old.

	The first video of rate_later is removed from that list, recorded as
	compared with vid_old in DCC and appended to all_compared; vid_old is
	appended to compared. No compatibility check is made between both videos.

	index: number printed in front of the suggestion
	Returns (the new video, line to display), or (None, None) when rate_later
	is empty.
	"""
	if not rate_later:
		# Nothing left to introduce: tell the caller instead of raising IndexError
		return (None, None)

	vid_new = rate_later.pop(0)
	DCC.addAsCompared(vid_new, vid_old)
	compared.append(vid_old)
	all_compared.append(vid_new)
	ab = rndAB()
	return (vid_new, f'{index:4d}. [+] https://tournesol.app/comparison?uid{ab[0]}=' + get(vid_new, None, 'entity', 'uid') + f"&uid{ab[1]}=" + get(vid_old, None, 'entity', 'uid'))


def getComparisons(all_compared:list, low_compared: list, rate_later: list, DCC: 'DistComparisonChecker'):
	"""
	Generator of the lines to display, one suggested comparison per line.

	After the initial half comparison (phase_init), it alternates between
	phase_intricate and phase_expand, each of them preceded by a
	phase_consolidate suggestion when one is available. It ends with the line
	'NO MORE' as soon as a phase has nothing left to suggest.

	all_compared: every already compared video. This list is NOT copied: the
		phases append the newly introduced videos to it.
	low_compared: already compared videos to pair the new ones with (copied)
	rate_later: videos to introduce, in order (copied)
	DCC: comparison checker shared by all the phases
	"""
	# Copy the lists consumed by the phases
	low_compared = list(low_compared)
	rate_later = list(rate_later)

	if not rate_later:
		# Nothing to introduce: phase_init would fail on the empty list
		yield 'NO MORE'
		return

	# Phase 0
	(vid, cmp) = phase_init(low_compared, all_compared, rate_later)
	yield cmp

	i=1
	while True:
		consolidated = phase_consolidate(i, all_compared, DCC)
		if consolidated:
			yield consolidated
			i+=1

		# "or (None, None)": a phase answering a bare None must end the session, not break the unpacking
		(vid, cmp) = phase_intricate(i, vid, low_compared, DCC) or (None, None)
		if not vid:
			break
		yield cmp
		i+=1

		consolidated = phase_consolidate(i, all_compared, DCC)
		if consolidated:
			yield consolidated
			i+=1

		if not rate_later:
			# Every video of the rate-later list has been introduced
			break
		(vid, cmp) = phase_expand(i, vid, all_compared, low_compared, rate_later, DCC) or (None, None)
		if not vid:
			break
		yield cmp
		i+=1
	yield 'NO MORE'

# Main part

There are 2 cells below:
- "INIT/RESET": To run once to initialize the tool
- "CONTINUE": To run as many times as you want, to get more comparison links generated

If at any time you make comparisons not suggested by the tool, please run INIT/RESET again to synchronize the tool with your current Tournesol account

In [None]:
# INIT/RESET ORDO (Replay this cell everytime any comparison other than suggested by this notebook has been made)
# Download the current state of the account
ALL_COMPARED, LOW_CMPS = get_already_compared()
RATE_LATER = get_rate_later(ALL_COMPARED)

# Fresh comparison cache and fresh generator of suggestions
DCC = DistComparisonChecker()
comparison = getComparisons(ALL_COMPARED, LOW_CMPS, RATE_LATER, DCC)
comparisons = []
print('Initialized !\n')

# First comparison: the generator starts with half a comparison URL
comparisons.append(next(comparison))
print(
	'To begin, make the following comparison with any already compared video of your choice:',
	comparisons[0],
	'once comparison done, run next cell',
	sep='\n',
)

In [None]:
# CONTINUE ORDO (Replay this cell everytime to get next comparison)
N=10 # How many more to show at once ?

# Print previous
for previous in comparisons[-2*N:]:
	print(previous)
print('-----     NEW     -----')
# Print new
for _ in range(N):
	# The generator ends after its last line: stop quietly instead of failing with StopIteration
	suggestion = next(comparison, None)
	if suggestion is None:
		print('Nothing more to suggest: run the INIT/RESET cell to start over')
		break
	comparisons.append(suggestion)
	print(suggestion)

#### TODO:
Every so often (e.g. once every 10 comparisons), go to Phase 3 (see challenge.py): display 10 comparisons, then go back to Phase 1 or 2, whichever phase was active before switching to Phase 3.