# Graphs

Compute miscellaneous graphs and statistics from the Tournesol public dataset

## 1. Init or update Tournesol dataset (to do once a week)

- Download zip from https://api.tournesol.app/exports/all/
- Extract zip 

## 2. Init or update Youtube cache (if tournesol dataset was updated)

Run script `py src/rndstats.py (-t <TOURNESOL_DATASET_PATH>) (-c <YTDATA_CACHE_PATH>) (-u <USER>) --fetch`

## 3. Check and update Notebook variables

See cell [2] below

In [None]:
# Imports
import os
import sys
import math
import time
import numpy as np
import networkx as nx
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib.ticker as mtick
from dateutil import parser as dateparser
from datetime import datetime
from dateutil.relativedelta import relativedelta
from matplotlib.dates import MonthLocator, DayLocator, ConciseDateFormatter

# Make sure the notebook runs from the Tournesol-Stats directory:
# when the working directory is a `src` directory (or anything below one), move to the parent of the deepest `src`
_pwd = os.path.realpath('.').split(os.sep)
if 'src' in _pwd:
	_src_index = len(_pwd) - 1 - _pwd[::-1].index('src') # Position of the last 'src' path component
	del _pwd[_src_index:] # Drop 'src' and everything below it
	os.chdir(os.sep.join(_pwd))
print(os.path.realpath('.'))

# Local project requirements
sys.path.append('src/py')
from dao.youtube_api import YoutubeAPI
from model.tournesol_dataset.comparisons import ComparisonFile, ComparisonLine
from model.tournesol_dataset.collectivecriteriascores import CollectiveCriteriaScoresFile, CCSLine
from model.tournesol_dataset.individualcriteriascores import IndividualCriteriaScoresFile

from model.solidago import TournesolInputFromPublicDataset

In [None]:
# Load datasets

# File based dataset
YTDATA_CACHE_PATH='./data/YTData_cache.json.gz'
TOURNESOL_DATASET_PATH='./data/2025/tournesol_dataset_2025-02-24.zip'

# Constants
YTDATA = YoutubeAPI()
try:
	YTDATA.load(YTDATA_CACHE_PATH)
except FileNotFoundError:
	# Best effort: the notebook keeps working without the cache file, but say so instead of
	# failing silently (cells below fall back to raw video ids when a video is not in YTDATA)
	print(f'Youtube cache not found at {YTDATA_CACHE_PATH}: continuing without cached Youtube data')

# The three views below are all built from the same Tournesol dataset export
COMPARISONS = ComparisonFile(TOURNESOL_DATASET_PATH)
COLLECTIVE_SCORES = CollectiveCriteriaScoresFile(TOURNESOL_DATASET_PATH)
INDIVIDUAL_SCORES = IndividualCriteriaScoresFile(TOURNESOL_DATASET_PATH)

PUBLIC_DATASET = TournesolInputFromPublicDataset.download()

In [None]:
## Top videos
def top_videos(top_n: int = 10):
	"""Print the videos having the highest collective 'largely_recommended' score.

	A video found in the Youtube cache is printed using its cached entry, any other video using its id.

	:param top_n: Maximum number of videos to print (fewer are printed when fewer videos have a score)
	"""
	scores = COLLECTIVE_SCORES.get_vids_scores(criterion='largely_recommended')
	# Slicing (instead of indexing 0..9) keeps this working when fewer than `top_n` videos are scored
	best = sorted(scores.keys(), key=lambda vid: scores[vid], reverse=True)[:top_n]

	for rank, vid in enumerate(best, start=1):
		txt = YTDATA.videos[vid] if vid in YTDATA.videos else vid
		print(f'Rank {rank} (score={scores[vid]}): {txt}')

top_videos()

In [None]:
## Top creators
def top_creators():
	"""Print every creator having at least 2 scored videos, best ranked first.

	Creators are ranked by the trimmed mean of the collective 'largely_recommended' scores of their videos
	(see `trimmed_mean` below). Videos missing from the Youtube cache, or cached without a channel, are all
	grouped under '#UNKNOWN CREATOR#'.
	"""
	vid_scores = COLLECTIVE_SCORES.get_vids_scores(criterion='largely_recommended')

	# creators[creator][vid] = collective score of the video
	creators: dict[str,dict[str,float]] = dict()
	for vid in vid_scores:
		creator = '#UNKNOWN CREATOR#'
		if vid in YTDATA.videos and YTDATA.videos[vid].channel:
			creator = str(YTDATA.videos[vid].channel)
		creators.setdefault(creator, dict())[vid] = vid_scores[vid]

	def trimmed_mean(values) -> float:
		# Average of the values once sorted and cut to the [20%, 80%] index range (never empty for 1+ values)
		ordered = sorted(values)
		return np.average(ordered[math.floor(len(ordered)*.2):math.ceil(len(ordered)*.8)])

	# Creators having a single video are not ranked, so their score is not even computed
	creators_scores: dict[str, float] = {
		creator: trimmed_mean(videos.values())
		for creator, videos in creators.items()
		if len(videos) > 1
	}
	scores_ordered = sorted(creators_scores, key=lambda k: creators_scores[k], reverse=True)

	for rank, creator in enumerate(scores_ordered, start=1):
		vals = list(creators[creator].values())
		c_cnt = len(vals)
		c_min = min(vals)
		c_max = max(vals)
		c_avg = sum(vals)/len(vals)
		c_med = np.median(vals)
		c_std = np.std([float(v) for v in vals])

		print(f'{rank:3d}. (avg={c_avg:0.1f}/med={c_med:0.1f} {c_cnt:3d}v): {creator}     (min={c_min:0.2f} max={c_max:0.2f} stdv={c_std:0.2f})')

top_creators()

In [None]:
## Users proximity
# Computes a similarity for every pair of users having enough scored videos, stores it symmetrically
# in `user_prox`, then prints the TOP_USER_PROX most similar pairs.
USER_PROX_MIN_VID = 50 # Only users having more than this number of scored videos are compared
USER_PROX_KEEP = 3 # Number of best matches of each user considered for `top_user_prox` (NOTE: `user_prox[username]` itself is never truncated, see below)
TOP_USER_PROX = 50 # Number of pairs kept in `top_user_prox`

top_user_prox: list[tuple[float,str,str]] = list() # (similarity, user1, user2), highest similarity first
user_prox:dict[str,dict[str,float]] = dict() # [user1][user2] = similarity
indiv_scores = INDIVIDUAL_SCORES.get_scores() # (score, uncertainty) = indiv_scores[user][video][criterion]

users = [u for u in indiv_scores if len(indiv_scores[u]) > USER_PROX_MIN_VID]

# Every unordered pair (u1, u2) is visited exactly once: u2 is always taken after u1 in `users`
for i,u1 in enumerate(users[:-1]):
	if not u1 in indiv_scores: # Never true: `users` is built from the keys of `indiv_scores`
		continue
	u1_scores = indiv_scores[u1] # (score, uncertainty) = u1_scores[video][criterion]
	u1_prox:dict[str,float] = user_prox.setdefault(u1, dict()) # proximity = u1_prox[u2]

	for u2 in users[i+1:]:
		if not u2 in indiv_scores: # Never true, same reason as above
			continue
		u2_scores = indiv_scores[u2] # (score, uncertainty) = u2_scores[video][criterion]

		vids_in_common = set(u1_scores.keys())
		vids_in_common.intersection_update(u2_scores.keys())

		# Compute proximity between u1 and u2
		user_sim = 0.0
		for vid in vids_in_common:
			vid_sim = 0.0
			common_criteria = set(u1_scores[vid].keys())
			common_criteria.intersection_update(u2_scores[vid].keys())
			for crit in common_criteria:
				# Scores are shifted by 100 and scaled by 1/100 (presumably from [-100, 100] to [0, 2] - TODO confirm score range)
				u1s = (u1_scores[vid][crit][0] + 100)/100
				u2s = (u2_scores[vid][crit][0] + 100)/100
				# 1 when both scores are equal, negative when scaled scores differ by more than 1
				vid_sim += 1 - (u1s - u2s)**2
			# Normalise by the number of criteria scored by at least one of the two users for this video
			vid_sim /= (len(u1_scores[vid]) + len(u2_scores[vid]) - len(common_criteria))
			user_sim += vid_sim

		# Normalise by the number of videos scored by at least one of the two users
		user_sim /= len(u1_scores) + len(u2_scores) - len(vids_in_common)

		# Assign proximities
		if user_sim > 0:
			u1_prox[u2] = user_sim
			user_prox.setdefault(u2, dict())[u1] = user_sim

	if u1_prox:
		top_keys = sorted(u1_prox.keys(), key=u1_prox.get, reverse=True)
		# This rebinds the local name only: the dict stored in `user_prox[u1]` keeps all its entries
		u1_prox = {u2: u1_prox[u2] for u2 in top_keys[:USER_PROX_KEEP]}
		
		# `u1 < u2` (string comparison): a pair can only be added while processing its smallest username, hence at most once
		top_user_prox.extend((u1_prox[u2],u1,u2) for u2 in u1_prox if u1 < u2)
		top_user_prox = sorted(top_user_prox, reverse=True)[:TOP_USER_PROX]

		# if len(u1_scores) > 200:
		# 	print(f"{u1} ({len(u1_scores)}v):  \t", end='')
		# 	for u2 in u1_prox:
		# 		print(f"{u2} ({u1_prox[u2]:0.2%})  \t", end='')
		# 	print(flush=True)

print()
print()

# Print the most similar pairs of users, best first
for (i, (sim,u1,u2)) in enumerate(top_user_prox, start=1):
	print(f"{i:2d}. {sim:6.2%} - {u1} & {u2}")


In [None]:
# Estimate uncompared video notes for specific user according to users similarities

USER_RECOM = 'NatNgs' # User to recommend videos to
RECOM_CRITERION = 'largely_recommended' # Criterion the estimations are based on
MIN_W = 0.05 # Videos whose total weight (sum of proximities) is not above this value are discarded
IGNORE_VIDS: set[str] = {'GWyrwG_QGeI', 'utWMGi8HTjY', 'XhRbt3R41hs'} # Videos never recommended

def recommend_videos(recom_nb: int) -> None:
	"""Print the `recom_nb` best videos USER_RECOM has no score for, according to the scores of similar users.

	The estimated score of a video is the average of the scores given by other users, weighted by their
	proximity to USER_RECOM, as read from the module-level `user_prox` and `indiv_scores`.
	Raises KeyError if USER_RECOM has no entry in `user_prox`.

	:param recom_nb: Maximum number of videos to print
	"""
	# Proximities to other users
	recom_user_prox = user_prox[USER_RECOM] # proximity = recom_user_prox[u2]
	print('recom_user_prox:', len(recom_user_prox))

	# Scores of others
	# indiv_scores (score, uncertainty) = indiv_scores[user][video][criterion]
	print('indiv_scores:', len(indiv_scores))

	vid_scores: dict[str,tuple] = dict() # (maxestim_sum, minestim_sum, total_weights, nb_users) = vid_scores[vid] while accumulating
	for u in recom_user_prox:
		w = recom_user_prox[u]
		# Videos scored by `u`, minus ignored ones and those USER_RECOM already has a score for
		unseen = set(indiv_scores[u].keys())
		unseen.difference_update(IGNORE_VIDS)
		unseen.difference_update(indiv_scores[USER_RECOM].keys())
		#unseen.intersection_update(indiv_scores[USER_RECOM].keys())
		for v in unseen:
			if not RECOM_CRITERION in indiv_scores[u][v]:
				continue
			##
			# if not v in vid_scores:
			# 	(score, uncertainty) = indiv_scores[USER_RECOM][v].get(RECOM_CRITERION,0.0)
			# 	minestim = max(-100, score - uncertainty)
			# 	maxestim = min(+100, score + uncertainty)
			# 	vid_scores[v] = (maxestim, minestim, 1, 0)
			##

			(max_sum, min_sum, weights, n) = vid_scores.get(v,(0.0, 0.0, 0.0, 0))
			(score, uncertainty) = indiv_scores[u][v][RECOM_CRITERION]

			# Score interval of user `u` for this video, clamped to [-100, +100]
			minestim = max(-100, score - uncertainty)
			maxestim = min(+100, score + uncertainty)

			vid_scores[v] = (max_sum + w*maxestim, min_sum + w*minestim, weights+w, n+1)

	# Second pass: drop weakly supported videos and turn the sums into weighted averages
	for v in list(vid_scores.keys()):
		(max_sum, min_sum, weights, n) = vid_scores[v]
		if n < 2 or weights <= MIN_W:
			vid_scores.pop(v)
			continue
		# From here the tuple is (average of both bounds, rounded upper bound, rounded lower bound, total_weights, nb_users)
		vid_scores[v] = ((max_sum+min_sum)/weights/2, round(max_sum/weights), round(min_sum/weights), weights, n)

	# Tuples are compared element by element: the average estimation is the main sorting key
	vid_recom = sorted(vid_scores.keys(), key=vid_scores.get, reverse=True)

	print(f'Recommending for {USER_RECOM}:')
	for v in vid_recom[:recom_nb]:
		(avg, max_sum, min_sum, weight, n) = vid_scores[v] # max_sum/min_sum now hold the averaged bounds
		print(f"- {v}: {min_sum:+4.0f} ~ {max_sum:+4.0f}🌻 (w={weight:0.2f} n={n})")

recommend_videos(10)

In [None]:
## User progression graph

USERS={'NatNgs'} #emmanuel.chambost lpfaucon white

def user_progress_graph():
	"""Plot, for every user of `USERS`, the number of distinct videos compared against the cumulated comparisons count.

	Only 'largely_recommended' comparisons are used. One point is plotted per date of activity, dates being
	processed in chronological order. Users of `USERS` without any such comparison are reported and skipped.
	"""
	per_date:dict[str,dict[str,dict[str,int]]] = dict() # per_date[user][date] = {vid: <nb comparisons>}

	def fetch_user_data(line: ComparisonLine):
		if line.criterion != 'largely_recommended' or line.user not in USERS:
			return

		cmps = per_date.setdefault(line.user, dict()).setdefault(line.date, dict())
		cmps[line.vid1] = cmps.get(line.vid1, 0) + 1
		cmps[line.vid2] = cmps.get(line.vid2, 0) + 1
	COMPARISONS.foreach(fetch_user_data)

	fig, ax = plt.subplots()
	fig.set_size_inches(12, 6)
	ax.grid(visible=True, which='major', axis='both', color='gray')
	ax.grid(visible=True, which='minor', axis='both', color='#eee')
	ax.set_axisbelow(True)

	def user_curve(USER: str):
		"""Draw the curve of one user and return the highest (x, y) reached."""
		yy_set:set[str] = set() # Distinct videos compared so far
		xx_sum=0 # NOTE: sums per-video counts, so every comparison is counted twice (once per video)
		xx = []
		yy = []
		# Dates are assumed to be ISO-formatted strings (other cells compare them with ISO strings),
		# so sorting them gives the chronological order whatever the order of the dataset lines
		for d in sorted(per_date[USER]):
			dt = per_date[USER][d]
			yy_set.update(dt.keys())
			xx_sum += sum(dt.values())
			xx.append(xx_sum)
			yy.append(len(yy_set))
			# Write the date next to the first point, then every time at least 10 new videos appeared at once
			if len(xx) == 1 or yy[-1] >= yy[-2] + 10:
				ax.text(xx[-1], yy[-1], s=f'{d}  ', fontsize=6, verticalalignment='center', horizontalalignment='right')

		ax.scatter(xx, yy, marker='+', s=8)

		ax.plot(xx, yy, color='#888', linewidth=1)
		ax.plot([2,xx[0]], [1,yy[0]], color='#888', linewidth=1) # Link the origin of the graph to the first point
		ax.text(xx[-1], yy[-1], s=f' {USER}')
		return (max(xx), max(yy))

	mx = -1
	my = -1
	for user in USERS:
		if user not in per_date:
			print(f'No largely_recommended comparison found for user {user}: skipped')
			continue
		max_x, max_y = user_curve(user)
		mx = max(mx, max_x)
		my = max(my, max_y)

	if mx > 0 and my > 0: # Keep default limits when nothing was plotted
		ax.set_ylim([1, math.ceil(my)*1.1])
		ax.set_xlim([1, math.ceil(mx)*1.1])

	# ax.set_xscale('log')
	# ax.xaxis.set_major_formatter(mtick.ScalarFormatter())

	# ax.set_yscale('log')
	# ax.yaxis.set_major_formatter(mtick.ScalarFormatter())

	# plt.title('How many comparisons every Tournesol users have done')
	ax.set_xlabel('Total number of comparisons (largely_recommended)')
	ax.set_ylabel('Total number of video compared')
user_progress_graph()


In [None]:
## Global progression graph
def global_progress_graph(highlight_user: str = 'NatNgs'):
	"""Plot one curve per user: average number of comparisons per video against number of videos compared.

	Only 'largely_recommended' comparisons are plotted. Curves are coloured according to recent activity
	(any criterion): `highlight_user` in red, users active at the latest date of the dataset in green,
	users active during one of the 4 latest dates in blue, every other user in gray.

	:param highlight_user: User whose curve is drawn in red and labelled with its name
	"""
	actives: dict[str,set[str]] = dict() # date:{user, ...}, for dates after `cut` only

	cut = (datetime.now() - relativedelta(years=1, weeks=4)).isoformat()
	def fetch_users_data(line: ComparisonLine):
		if line.date < cut:
			return

		actives.setdefault(line.date, set()).add(line.user)
	COMPARISONS.foreach(fetch_users_data)

	# Only the latest dates matter for colouring; computed once, and safe with fewer than 4 (or no) recent dates
	l_dates: list[str] = sorted(actives)
	active_last_date: set[str] = actives[l_dates[-1]] if l_dates else set()
	active_last_month: set[str] = set().union(*(actives[d] for d in l_dates[-4:]))


	per_date:dict[str,dict[str,dict[str,int]]] = dict() # per_date[user][date] = {vid: <nb comparisons>}

	def fetch_user_data(line: ComparisonLine):
		if line.criterion != 'largely_recommended':
			return

		cmps = per_date.setdefault(line.user, dict()).setdefault(line.date, dict())
		cmps[line.vid1] = cmps.get(line.vid1, 0) + 1
		cmps[line.vid2] = cmps.get(line.vid2, 0) + 1
	COMPARISONS.foreach(fetch_user_data)

	fig, ax = plt.subplots()
	fig.set_size_inches(8, 6)
	ax.grid(visible=True, which='major', axis='both', color='gray')
	ax.grid(visible=True, which='minor', axis='both', color='#eee')
	ax.set_axisbelow(True)

	def user_curve(user: str):
		"""Draw the curve of one user and return the highest (x, y) reached."""
		xx_set:set[str] = set() # Distinct videos compared so far
		yy_sum=0 # Sum of per-video comparisons counts so far
		xx = []
		yy = []
		# Dates are ISO-like strings (compared with `cut` above): sorting them gives the chronological order
		for d in sorted(per_date[user]):
			dt = per_date[user][d]
			xx_set.update(dt.keys())
			yy_sum += sum(dt.values())
			xx.append(len(xx_set))
			yy.append(yy_sum / xx[-1])

		color = '#888'
		alpha = 0.1
		zindex = 1
		if user == highlight_user:
			color = '#F00'
			alpha = 0.5
			zindex = 400
			ax.text(xx[-1], yy[-1], s=user, color=color, horizontalalignment='right', verticalalignment='bottom')
		elif user in active_last_date:
			color = '#0a4'
			alpha = 0.2
			zindex = 300
		elif user in active_last_month:
			color = '#04a'
			alpha = 0.1
			zindex = 200

		ax.plot(xx, yy, color=color, alpha=alpha, linewidth=1, zorder=zindex)
		ax.plot([2,xx[0]], [1,yy[0]], color=color, alpha=alpha/2, linewidth=1, zorder=zindex) # Link the origin of the graph to the first point
		return (max(xx), max(yy))

	mx = -1
	my = -1
	for user in per_date:
		max_x, max_y = user_curve(user)
		mx = max(mx, max_x)
		my = max(my, max_y)

	ax.yaxis.set_ticks(np.arange(start=0, stop=my+1, step=5))
	ax.set_ylim([1, math.ceil(my)])
	ax.set_xlim([2, math.ceil(mx)])

	ax.set_xscale('log')
	ax.xaxis.set_major_formatter(mtick.ScalarFormatter())

	# plt.title('How many comparisons every Tournesol users have done')
	ax.set_ylabel('Average number of comparisons per video')
	ax.set_xlabel('Total number of video compared')

global_progress_graph()

In [None]:
## User against global scores

USER='NatNgs' # User whose individual scores are plotted
CRITERION='largely_recommended' # Criterion used for scores and comparisons

def user_vs_global():
	"""Plot the individual scores of USER (y axis) against collective scores (x axis) for every video USER has a score for.

	Videos are drawn as black crosses. Every comparison made by USER between two of these videos is drawn as a
	line linking both videos, coloured according to whether the direction of the comparison agrees with the
	individual and the collective scores (see comments in `add_comparisons`).
	"""
	uscores = INDIVIDUAL_SCORES.get_scores(criterion=CRITERION, users=[USER])[USER] # `(score, uncertainty) = out[video][criterion]`
	vids = sorted(uscores.keys())
	gscores = COLLECTIVE_SCORES.get_scores(criterion=CRITERION, vids=vids) # `(score, uncertainty) = out[video][criterion]`

	fig, ax = plt.subplots()
	fig.set_size_inches(8, 6)

	ax.grid(visible=True, which='major', axis='both', color='gray')
	ax.grid(visible=True, which='minor', axis='both', color='#eee')
	ax.set_axisbelow(True)

	# Draw both axes as black lines, below everything else
	ax.plot([-100,100], [0,0], color='#000', zorder=-1)
	ax.plot([0,0], [-100,100], color='#000', zorder=-1)

	xminmax = [0,0] # [lowest, highest] collective score met; a list so the nested function can update it
	def add_comparisons(line:ComparisonLine):
		# Only comparisons (from any user) between two videos scored by USER are considered
		# NOTE(review): assumes `gscores` has an entry for every video of `uscores` - confirm against get_scores
		if line.criterion != CRITERION or line.vid1 not in uscores or line.vid2 not in uscores:
			return
		if line.user == USER:
			color = None
			zorder = None
			# A score gap above 10 points in one direction combined with a comparison score beyond 1 in the other direction is a mismatch
			if ((uscores[line.vid2][CRITERION][0] - uscores[line.vid1][CRITERION][0] > 10 and line.score < -1) \
				or (uscores[line.vid1][CRITERION][0] - uscores[line.vid2][CRITERION][0] > 10 and line.score > 1)):
				# 'Red' links: Individual score does not match with comparison direction (the one compared as to be recommended has a lower individual score)
				color = '#A008'
				zorder = 4
				# print(f"- {line.vid1} # {-line.score:+0d} # {line.vid2}")
				# print(f"  {YTDATA.videos.get(line.vid1,line.vid1)} (me:{uscores[line.vid1][CRITERION][0]:0.2f}, glob:{gscores[line.vid1][CRITERION][0]:0.2f})")
				# print(f"  {YTDATA.videos.get(line.vid2,line.vid2)} (me:{uscores[line.vid2][CRITERION][0]:0.2f}, glob:{uscores[line.vid2][CRITERION][0]:0.2f})\n")
			elif (gscores[line.vid2][CRITERION][0] - gscores[line.vid1][CRITERION][0] > 10 and line.score < -1) \
				or (gscores[line.vid1][CRITERION][0] - gscores[line.vid2][CRITERION][0] > 10 and line.score > 1):
				# 'Blue' links (individual score corresponds to comparison direction, but not collective score)
				color = '#0284'
				zorder = 3
			else:
				# 'Gray' links (individual score AND collective score corresponds to comparison direction)
				color = '#8881' # '#0821'
				zorder = 2

			ax.plot(
				[gscores[line.vid1][CRITERION][0], gscores[line.vid2][CRITERION][0]], # Global scores
				[uscores[line.vid1][CRITERION][0], uscores[line.vid2][CRITERION][0]], # User scores
				color=color,
				linewidth=1,
				zorder=zorder
			)
		# Executed for the comparisons of every user (not only USER): tracks the range of collective scores for the x axis
		xminmax[0] = min(xminmax[0], gscores[line.vid1][CRITERION][0], gscores[line.vid2][CRITERION][0])
		xminmax[1] = max(xminmax[1], gscores[line.vid1][CRITERION][0], gscores[line.vid2][CRITERION][0])
		# The string below is a disabled piece of code (links for the comparisons of other users), kept as a no-op statement
		"""else:
			if (gscores[line.vid2][CRITERION][0] > gscores[line.vid1][CRITERION][0] and line.score <= 0) \
				or (gscores[line.vid2][CRITERION][0] < gscores[line.vid1][CRITERION][0] and line.score >= 0):
				ax.plot(
					[gscores[line.vid1][CRITERION][0], gscores[line.vid2][CRITERION][0]], # Global scores
					[uscores[line.vid1][CRITERION][0], uscores[line.vid2][CRITERION][0]], # User scores
					color='#8041',
					linewidth=1
				)"""
	COMPARISONS.foreach(add_comparisons)

	# plt.title('How many comparisons every Tournesol users have done')
	# x axis: limited to the range of collective scores met, with ticks every 25 (major) and 5 (minor) within this range
	ax.set_xlim(xmin=xminmax[0]-1, xmax=xminmax[1]+1)
	ax.xaxis.zorder = 1
	ax.xaxis.set_ticks([t for t in [-100, -75, -50, -25, 0, 25, 50, 75, 100] if t >= xminmax[0] and t <= xminmax[1]])
	ax.xaxis.set_ticks([t for t in range(-100,101,5) if t >= xminmax[0] and t <= xminmax[1]], minor=True)
	ax.set_xlabel('Collective tournesol recommendation')

	# y axis: always the full [-100, 100] range
	ax.set_ylim([-100, 100])
	ax.yaxis.zorder = 1
	ax.yaxis.set_ticks([-100, -75, -50, -25, 0, 25, 50, 75, 100])
	ax.yaxis.set_ticks(list(range(-100,101,5)), minor=True)
	ax.set_ylabel('Individual recommendation for user "' + USER + '"')

	# Plot videos as black dots
	ax.scatter(
		[gscores[v][CRITERION][0] for v in vids], # Global scores
		[uscores[v][CRITERION][0] for v in vids], # User scores
		c='#000', marker='+', s=8,
		zorder=10
	)
user_vs_global()

In [None]:
## Users comparisons count graph
plt.rc('axes', unicode_minus=False)
def cmp_count_graph():
	"""Plot, for every user, the share of their videos having been compared 1, 2, 3... times.

	Only 'largely_recommended' comparisons are counted. Users having less than 50 compared videos are not plotted.
	"""
	users_vieos: dict[str,dict[str,int]] = dict() # data[user][vid] = nbComparisons
	def fetch_user_data(line: ComparisonLine):
		if line.criterion != 'largely_recommended':
			return

		cmps = users_vieos.setdefault(line.user, dict())
		cmps[line.vid1] = cmps.get(line.vid1, 0) + 1
		cmps[line.vid2] = cmps.get(line.vid2, 0) + 1
	COMPARISONS.foreach(fetch_user_data)
	# print(users_vieos['NatNgs'])

	# Aggregating
	users_nbcomp:dict[str,dict[int,int]] = dict() # data[user][nbcomp] = nb videos compared <nbcomp> times
	for u in users_vieos:
		users_nbcomp[u] = {0:0} # Forces every curve to start at x=0 with no video
		for cnt in users_vieos[u].values():
			users_nbcomp[u][cnt] = users_nbcomp[u].get(cnt,0) + 1
	users_vieos.clear()
	users_vieos = None # flush
	# print(users_nbcomp['NatNgs'])


	# Preparing plot
	fig, ax = plt.subplots()
	fig.set_size_inches(18, 6)
	ax.grid(visible=True, which='major', axis='both', color='gray')
	ax.grid(visible=True, which='minor', axis='both', color='#eee')
	ax.set_axisbelow(True)

	def user_curve(user: str):
		# Draws the curve of one user; returns the highest (x, y) plotted, or (-1,-1) when the user is skipped
		xx = []
		yy = []
		maxy = 0 # Index of the highest point of the curve
		summ = sum(users_nbcomp[user].values()) # Total number of videos compared by the user
		if summ < 50: # Skip users having less than 50 compared videos
			return (-1,-1)
		
		for i in range(min(users_nbcomp[user].keys()), max(users_nbcomp[user].keys())+1):
			xx.append(i)
			y = users_nbcomp[user].get(i,0) / summ # Share of the user's videos compared exactly i times
			yy.append(y)
			if y > yy[maxy]:
				maxy = len(yy)-1

		color = '#04a'
		alpha = 0.1
		if user in ['NatNgs', 'Champi', 'white']:
			# Highlighted users: fully opaque, color=None leaves the choice of the color to matplotlib
			color = None
			alpha = 1
			# print(user, ', '.join([f'{cmp}: {cnt}' for cmp,cnt in sorted(users_nbcomp[user].items())]))

		# Name written above the peak of the curve for highlighted users, users having more than 200 videos, or a video compared more than 75 times
		if alpha > 0.1 or summ > 200 or max(xx) > 75:
			ax.text(xx[maxy], max(yy), s=user, color=color, alpha=alpha, horizontalalignment='center', verticalalignment='bottom')
		ax.plot(xx, yy, color=color, alpha=alpha, linewidth=1)
		return (max(xx), max(yy))

	# NOTE: mx and my are computed but not used afterwards (axes limits are fixed below)
	mx = -1
	my = -1
	for user in users_nbcomp.keys():
		max_x, max_y = user_curve(user)
		if max_x > mx:
			mx = max_x
		if max_y > my:
			my = max_y

	ax.set_xlim(xmin=0,xmax=40)

	ax.set_ylim(ymin=0, ymax=1)
	ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1))

	# plt.title('How many comparisons every Tournesol users have done')
	ax.set_xlabel('Nombre de comparaisons par video')
	ax.set_ylabel('Videos ayant ce nombre de comparaisons')

cmp_count_graph()

In [None]:
def user_histogram(USER: str):
	"""Plot the distribution of the individual 'largely_recommended' scores of a user.

	For every position from -100 to 100 (step 0.5), the curve gives the number of videos whose score
	lies within 2.5 points of this position (sliding window, bounds included).

	:param USER: Name of the user whose individual scores are plotted
	"""
	from bisect import bisect_left, bisect_right

	CRITERION='largely_recommended'
	uscores = INDIVIDUAL_SCORES.get_scores(criterion=CRITERION, users=[USER])[USER] # `(score, uncertainty) = out[video][criterion]`
	xx = sorted(uscores[v][CRITERION][0] for v in uscores)

	fig, ax = plt.subplots()
	fig.set_size_inches(8, 6)

	ax.grid(visible=True, which='major', axis='both', color='gray')
	ax.grid(visible=True, which='minor', axis='both', color='#eee')
	ax.set_axisbelow(True)

	ax.set_xlim(xmin=-100, xmax=100)

	etl = 5 # Width of the sliding window (in score points); also the step between 2 positions, in tenths of points
	xx2 = [x/10.0 for x in range(-1000, 1001, etl)]
	# `xx` is sorted: the number of scores within [x-etl/2, x+etl/2] is the distance between 2 binary searches
	yy2 = [bisect_right(xx, x+etl/2) - bisect_left(xx, x-etl/2) for x in xx2]
	ax.set_ylim(ymin=0, ymax=max(max(yy2), 1)) # At least 1 to avoid a degenerate axis when the user has no score
	ax.fill(xx2, yy2)

	# counts, bins = np.histogram(xx, bins=100)
	# ax.hist(bins[:-1], bins, weights=counts)

	# plt.title('How many comparisons every Tournesol users have done')
	ax.set_xlabel('largely_recommended by "' + USER + '"')
	ax.set_ylabel('Number of videos')

user_histogram('NatNgs')


In [None]:
def allusers_histogram():
	"""Plot the superimposed histograms of the comparison values (-10 to +10) used by every user.

	Only 'largely_recommended' comparisons are used. Users having less than 10 comparisons, or having always
	used the same value, are left out. One translucent histogram is drawn per user, so that dark areas show
	how many comparisons of each value users typically have (log scale).
	"""
	CRITERION='largely_recommended'

	votes: dict[str,list[int]] = dict() # votes[user] = [comparison score, ...]
	overall = list() # Every comparison score, whoever the user (only used by the disabled collective histogram)
	def parseComparisons(line: ComparisonLine):
		if line.criterion == CRITERION:
			votes.setdefault(line.user, list()).append(line.score)
			overall.append(line.score)
	COMPARISONS.foreach(parseComparisons)

	# Remove users with less than 10 comparisons, and users having always used the same value
	votes = {u: scores for u, scores in votes.items() if len(scores) >= 10 and min(scores) != max(scores)}

	# Prepare plot
	fig, ax = plt.subplots()
	fig.set_size_inches(8, 6)


	# ax.set_ylim(ymin=0, ymax=1)
	# ax.yaxis.set_major_formatter(mtick.PercentFormatter(xmax=1, decimals=0))
	ax.set_yscale('log')
	ax.yaxis.set_major_formatter(mtick.ScalarFormatter())

	ax.set_axisbelow(True)
	ax.set_xlim(xmin=-10.5, xmax=10.5)
	ax.xaxis.set_ticks(range(-10,11))
	ax.tick_params(axis='x', length=0)

	# Plot histograms
	# Opacity must stay within [0, 1] (matplotlib rejects other values), and `votes` may be empty
	alpha = min(1.0, max(2/len(votes), 0.01)) if votes else 1.0
	bins = [i/10.0 for i in range(-105,106,10)] # 22 edges (-10.5, -9.5, ... 10.5): one column per integer value
	for u in votes:
		# Counts MUST be computed on the very edges they are drawn with: an integer `bins` would make numpy
		# split the [min, max] range of each user in 21 parts, shifting the columns of users not using the full scale
		counts = np.histogram(votes[u], bins=bins, density=False)[0]
		ax.hist(bins[:-1], bins, weights=counts, color='black', alpha=alpha)

	# Plot white lines to separate columns
	#for i in bins:
	#	ax.axvline(i, color='white')
	
	# Plot collective histogram
	# ax.hist(bins[:-1], bins, weights=np.histogram(overall, bins=bins, density=True)[0], color='blue', histtype='step')

	# plt.title('How many comparisons every Tournesol users have done')
	ax.set_xlabel('largely_recommended comparisons values')
	ax.set_ylabel('How much used (All users having 10+ comparisons)')

allusers_histogram()

In [None]:
# Videos having comparisons from 2 users from channel having videos with high score
def recom_2contrib():
	"""Print up to 250 videos compared by exactly 2 users, from channels also having videos compared by 3+ users.

	Only 'largely_recommended' comparisons and scores are used, and only videos whose channel is known by the
	Youtube cache. Videos are ranked by: (sum of the collective scores of the channel's videos compared by 3+
	users) * (collective score of the video) / (number of videos of the channel compared by exactly 2 users).
	"""
	data:dict[str,dict[str,set[str]]] = dict() # data[channel][vid] = {users}
	def extractcmps(line: ComparisonLine):
		if line.criterion != 'largely_recommended':
			return
		if not line.vid1 in YTDATA.videos or YTDATA.videos[line.vid1].channel is None \
			or not line.vid2 in YTDATA.videos or YTDATA.videos[line.vid2].channel is None:
			return

		data.setdefault(YTDATA.videos.get(line.vid1).channel.id, dict()).setdefault(line.vid1, set()).add(line.user)
		data.setdefault(YTDATA.videos.get(line.vid2).channel.id, dict()).setdefault(line.vid2, set()).add(line.user)
	COMPARISONS.foreach(extractcmps)

	to_recom:dict[str,set[str]] = dict() # [channel] = {videos compared by exactly 2 users}
	is_recom:dict[str,set[str]] = dict() # [channel] = {videos compared by 3 users or more}
	for c in data:
		for vid in data[c]:
			if len(data[c][vid]) < 3:
				if len(data[c][vid]) == 2:
					to_recom.setdefault(c,set()).add(vid)
			else:
				is_recom.setdefault(c,set()).add(vid)
		# Only keep channels having both kinds of videos: both dicts end up with the very same channels
		if not c in to_recom and c in is_recom:
			is_recom.pop(c)
		elif not c in is_recom and c in to_recom:
			to_recom.pop(c)

	vscores: dict[str,float] = {v:0 for c in to_recom for v in to_recom[c]} # Collective score of every candidate video
	recom_val: dict[str,list[float]] = {c:[] for c in is_recom} # Collective scores of the 3+ users videos of every channel
	# Built once here: building this set for every line of the collective scores file made the scan quadratic
	recom_vids: set[str] = {v for c in is_recom for v in is_recom[c]}
	def extractscores(line: CCSLine):
		if line.criterion != 'largely_recommended':
			return
		if line.video in vscores:
			vscores[line.video] = line.score
			return
		if not line.video in recom_vids:
			return
		if not line.video in YTDATA.videos or YTDATA.videos.get(line.video).channel is None:
			return
		channel = YTDATA.videos.get(line.video).channel.id
		if not channel in recom_val:
			return
		recom_val[channel].append(line.score)
	COLLECTIVE_SCORES.foreach(extractscores)

	vfinal = {v:sum(recom_val[c])*vscores[v]/len(to_recom[c]) for c in to_recom for v in to_recom[c]}
	for v in sorted(vfinal, key=vfinal.get, reverse=True)[:250]:
		print(f'{vfinal[v]:10.2f} - [{v}]', YTDATA.videos.get(v))

print('Suggestion of videos having only 2 public Contributors')
recom_2contrib()

In [None]:
def best_contribs():
	"""Print statistics of up to 16 remarkable contributors, picked 4 by 4 using successive rankings.

	Candidates are users whose latest comparison date is above '2023-12' and having compared more than 100
	videos. Successive rankings are: number of 'largely_recommended' comparisons, number of comparisons
	(all criteria), number of videos, then (comparisons / recommended comparisons * videos).
	A user picked by a ranking is not eligible to the following ones.
	"""
	data:dict[str,dict] = dict() # data[user] = {'cmps': nb comparisons lines, 'vids': videos, 'rcms': nb largely_recommended lines, 'last': latest date}
	def extractcmps(line: ComparisonLine):
		udata = data.setdefault(line.user, {'cmps': 0, 'vids': set(), 'rcms': 0, 'last': line.date})
		udata['cmps'] += 1
		if line.criterion == 'largely_recommended':
			udata['rcms'] += 1
		udata['vids'].add(line.vid1)
		udata['vids'].add(line.vid2)
		if line.date > udata['last']:
			udata['last'] = line.date
	COMPARISONS.foreach(extractcmps)

	# Only the number of videos is needed from now on
	for u in data:
		data[u]['vids'] = len(data[u]['vids'])

	reduced = [user for user in data if data[user]['last'] > '2023-12' and data[user]['vids'] > 100]

	final: list[str] = []
	def pick_best(key, count: int = 4):
		# Append to `final` the `count` best not-yet-picked candidates according to `key`
		remaining = [u for u in reduced if u not in final]
		final.extend(sorted(remaining, key=key, reverse=True)[:count])

	pick_best(lambda u: data[u]['rcms'])
	pick_best(lambda u: data[u]['cmps'])
	pick_best(lambda u: data[u]['vids'])
	# max(..., 1): a user without any largely_recommended comparison must not raise a ZeroDivisionError
	pick_best(lambda u: data[u]['cmps'] / max(data[u]['rcms'], 1) * data[u]['vids'])

	for u in final:
		print(f"{u}: {data[u]['vids']} videos / {data[u]['rcms']} recom. / {data[u]['cmps']} comp.")
best_contribs()

In [None]:
## Total comparisons count over time
def comps_over_time():
	"""Plot the cumulated number of public 'largely_recommended' comparisons over (about) the last 13 months.

	Comparisons are grouped by the pair of languages of the 2 compared videos. Groups above 1000 comparisons
	(as well as the 'Total' of all groups) get their own curve, smaller groups are summed in an 'others' curve.
	Nothing is plotted if the dataset has no comparison within the plotted period.
	"""

	data:dict[str,dict[str,int]] = {'Total':{},} # data[lang pair][date] = <nb comparisons>
	def fetch_data(line: ComparisonLine):
		if line.criterion != 'largely_recommended':
			return

		# NOTE(review): `get('??', 'defaultLng')` is a project API, presumably (default value, field name) - confirm
		lng1 = '??' if not line.vid1 in YTDATA.videos else YTDATA.videos[line.vid1].get('??', 'defaultLng').title()
		lng2 = '??' if not line.vid2 in YTDATA.videos else YTDATA.videos[line.vid2].get('??', 'defaultLng').title()

		# When only one language is known, assume both videos share it
		if lng1 == '??' and lng2 != '??':
			lng1 = lng2
		elif lng2 == '??' and lng1 != '??':
			lng2 = lng1

		# Languages are ordered so that 'A-B' and 'B-A' fall into the same group
		subdata = data.setdefault(f"{lng1}-{lng2}" if lng1 < lng2 else f"{lng2}-{lng1}", dict())
		subdata[line.date] = subdata.get(line.date, 0) + 1
		data['Total'][line.date] = data['Total'].get(line.date, 0) + 1

	COMPARISONS.foreach(fetch_data)

	tt = sorted(set(date for sub in data.values() for date in sub))

	# Only dates after `cut` are plotted (sums are still cumulated from the very first date)
	cut = (datetime.now() - relativedelta(years=1, weeks=4)).isoformat()
	xx = [dateparser.parse(t) for t in tt if t > cut]
	if not xx:
		print('No largely_recommended comparison found after', cut)
		return
	cutindex = len(tt) - len(xx) # `tt` is sorted: plotted dates are its last len(xx) items

	csum = {lng:np.cumsum([data[lng].get(t,0) for t in tt]) for lng in data}

	fig, ax = plt.subplots()
	fig.set_size_inches(8, 6)
	ax.grid(visible=True, which='major', axis='both', color='gray')
	ax.grid(visible=True, which='minor', axis='both', color='#eee')
	ax.set_axisbelow(True)

	cumcum = [0]*len(xx)
	othrs = cumcum
	# Smallest groups first: as long as groups are small, `cumcum` is the sum of all small groups so far
	for lng,pts in sorted(csum.items(), key=lambda l: l[1][-1]):
		yy = list(pts[cutindex:])
		cumcum = np.add(cumcum, yy)
		#if yy[-1] > 100:
		if yy[-1] > 1000:
			# Axes.plot handles datetime values by itself (Axes.plot_date is deprecated since Matplotlib 3.9)
			ax.plot(xx, yy, linestyle='-', linewidth=1, marker=None, label=lng)
			ax.text(xx[-1], yy[-1], f" {lng}", horizontalalignment='left', verticalalignment='center', size=math.log(yy[-1]))
		else:
			othrs = cumcum

	ax.plot(xx, othrs, linestyle='-', linewidth=1, marker=None, label='other')
	ax.text(xx[-1], othrs[-1], ' others', horizontalalignment='left', verticalalignment='center')

	#ax.set_yscale('log')
	#ax.set_ylim([100, csum['Total'][-1]*1.1])
	ax.set_ylim([0, csum['Total'][-1]*1.1])
	ax.set_yticklabels([f"{y:.0f}" if y < 1000 else f"{y/1000:.0f}k" for y in ax.get_yticks()])
	ax.set_xlim([min(xx), max(xx) + relativedelta(weeks=3)]) # Room on the right for curves labels
	ax.xaxis.set_major_locator(MonthLocator())
	ax.xaxis.set_minor_locator(DayLocator(interval=7))
	ax.xaxis.set_major_formatter(ConciseDateFormatter(MonthLocator()))

	# plt.title('How many comparisons every Tournesol users have done')
	ax.set_ylabel('Total number of public comparisons')

comps_over_time()