In [None]:
# Python dependencies -- REQUIREMENT: Python >=3.9, <3.12
%pip install --upgrade pip
%pip install --upgrade setuptools wheel networkx matplotlib scipy ipywidgets seaborn
%pip install "git+https://github.com/tournesol-app/tournesol.git@solidago-pipeline#egg=solidago&subdirectory=solidago"

# If anything was installed, restart the notebook kernel

In [None]:
# Imports
import math
import time
import itertools

import numpy as np
import pandas as pd
import seaborn as sns
import networkx as nx
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

from scipy.optimize import curve_fit
from scipy.stats import norm
from matplotlib.axes import Axes
from solidago.pipeline.inputs import TournesolInputFromPublicDataset

# Download the Tournesol public dataset. The cells below read its `comparisons`
# table and its `entity_id_to_video_id` mapping.
PUBLIC_DATASET = TournesolInputFromPublicDataset.download()

In [3]:
# Public username whose comparisons are analysed by every cell below
# (matched against the `public_username` column of the comparisons table).
USER_TO_PLOT = 'NatNgs'

In [None]:
# Load comparisons
# Keep only the selected user's 'largely_recommended' comparisons and translate both
# entity ids into video ids (one row per comparison, columns `vid_a` / `vid_b`).
_all_comparisons = PUBLIC_DATASET.comparisons
user_comparisons = (
	_all_comparisons.loc[
		(_all_comparisons.public_username == USER_TO_PLOT)
		& (_all_comparisons.criteria == 'largely_recommended'),
		['entity_a', 'entity_b'],
	]
	.merge(PUBLIC_DATASET.entity_id_to_video_id, left_on='entity_a', right_index=True)
	.drop('entity_a', axis=1).rename(columns={'video_id': 'vid_a'})
	.merge(PUBLIC_DATASET.entity_id_to_video_id, left_on='entity_b', right_index=True)
	.drop('entity_b', axis=1).rename(columns={'video_id': 'vid_b'})
)
# Materialised as a list so it can still be inspected after the graph has been built
# (a bare zip() would be exhausted by add_edges_from).
user_comparisons_list = list(zip(user_comparisons['vid_a'], user_comparisons['vid_b']))

# Undirected graph: one node per video, one edge per compared pair.
FULL_COMPARISONS_GRAPH = nx.Graph()
FULL_COMPARISONS_GRAPH.add_edges_from(user_comparisons_list)

# nx.connected_components returns a one-shot generator; keep a list so the name
# stays usable after max() has walked through it.
CONNECTED_COMPONENTS = list(nx.connected_components(FULL_COMPARISONS_GRAPH))
if not CONNECTED_COMPONENTS:
	raise ValueError(f"No public 'largely_recommended' comparisons found for user {USER_TO_PLOT!r}")
largest_group = max(CONNECTED_COMPONENTS, key=len)

# Subgraph restricted to the largest connected component.
MAX_CONNECTED_GRAPH = FULL_COMPARISONS_GRAPH.subgraph(largest_group)
print('Loaded', FULL_COMPARISONS_GRAPH, 'with largest group', MAX_CONNECTED_GRAPH)

-----

# User criteria statistics

In [None]:
# User cursors position
def user_histogram(ax: Axes, username: str, CRITERION: str, title: bool=False):
	"""Plot how often a user picked each comparison score for one criterion.

	Reads the module-level PUBLIC_DATASET.comparisons table, keeps the rows whose
	`public_username` equals `username` and whose `criteria` equals `CRITERION`,
	and draws on `ax`:
	  * one bar per integer score from -10 to +10 (multiples of 5 in a stronger blue),
	  * a yellow curve: the sum of two normal densities centred on -m and +m with a
	    shared standard deviation, least-squares fitted to the bar heights,
	  * small '<' / '>' marks above some of the bars that exceed the fitted curve.

	The curve and the marks are skipped when there is no vote to fit or when the
	least-squares fit does not converge.

	Args:
		ax: Matplotlib axes to draw on.
		username: Public username whose comparisons are plotted.
		CRITERION: Criterion name, e.g. 'largely_recommended'.
		title: True for the stand-alone layout (a tick on every score plus axis labels),
			False for the compact layout (a tick every 5, criterion name as title).
	"""
	comparisons = PUBLIC_DATASET.comparisons
	votes = comparisons.loc[
		(comparisons.public_username == username) & (comparisons.criteria == CRITERION),
		'score',
	]

	ax.set_axisbelow(True)

	ax.set_xlim(xmin=-10.5, xmax=10.5)
	ax.xaxis.set_ticks(range(-10,11,1 if title else 5))
	ax.tick_params(axis='x', length=0)

	# 21 unit-wide bins, each centred on one integer score: edges -10.5, -9.5, ..., 10.5.
	# The edges are passed explicitly: with `bins=21` numpy would spread the bins over the
	# observed min..max of `votes`, which mis-bins users who never used the extreme scores.
	bins = [i/10.0 for i in range(-105,106,10)]
	centers = list(range(-10, 11))
	counts, _ = np.histogram(votes, bins=bins)
	counts_highlights = [(c if score%5 == 0 else 0) for score,c in zip(centers, counts)]
	counts_others = [(c if score%5 != 0 else 0) for score,c in zip(centers, counts)]


	# Display bars
	ax.hist(centers, bins, weights=counts_others, align='mid', color='#0088AA')
	ax.hist(centers, bins, weights=counts_highlights, align='mid', color='#0022FF')
	for edge in bins: # Plot white lines to separate columns
		ax.axvline(edge, color='white')


	# Compute 2-bell-curves approximation
	tt = len(votes)
	def binary_bell(x:float, m:float, s:float):
		# Two normal densities at -|m| and +|m|, scaled so that together they integrate to `tt`.
		m = abs(m)
		return (norm(-m, s).pdf(x) + norm(+m, s).pdf(x))*tt/2

	approx = None
	if tt > 0:
		try:
			# Fit against the bin centres, i.e. the scores the counts actually belong to.
			popt, _ = curve_fit(f=binary_bell, xdata=centers, ydata=counts, p0=(5, 1))
			# The curve is sampled on the bin edges (22 points for 21 bars).
			approx = [binary_bell(x, *popt) for x in bins]
		except RuntimeError:
			# curve_fit raises RuntimeError when the least-squares minimisation fails.
			approx = None

	if approx is not None:
		ax.plot(bins, approx, 'y-')

		# Height of the curve over each bar: mean of its values at the bar's two edges.
		fitted = [(approx[i]+approx[i+1])/2 for i in range(len(counts))]
		excess = [counts[i] - fitted[i] for i in range(len(counts))]
		last = len(counts) - 1

		for i in range(len(counts)):
			if counts[i] <= fitted[i]:
				continue # Only bars rising above the curve can get a mark
			currdiff = excess[i]
			leftdiff = excess[i-1] if i > 0 else None
			rightdiff = excess[i+1] if i < last else None

			s=None
			# A missing (None) or exactly-zero neighbour excess passes each half of this test.
			if ((not leftdiff) or currdiff < leftdiff) and ((not rightdiff) or currdiff < rightdiff):
				pass
			elif (leftdiff is not None) and ((rightdiff is None) or rightdiff > leftdiff):
				s="<"
			elif rightdiff is not None:
				s=">"
			if s:
				ax.text(x=centers[i], y=counts[i], s=s, horizontalalignment='center', verticalalignment='bottom', fontdict=dict(size=6))


	# Display title
	if title:
		ax.set_xlabel(CRITERION + ' by "' + username + '"')
		ax.set_ylabel('Number of comparisons')
	else:
		ax.set_title(CRITERION)

	ax.set_ylim(bottom=0)


# Main criterion: one large, fully labelled histogram.
fig, ax = plt.subplots()
fig.set_size_inches(8, 6)
user_histogram(ax, USER_TO_PLOT, 'largely_recommended', title=True)

# Sub-criteria: a 3x3 grid of compact histograms, filled row by row.
fig, ax = plt.subplots(nrows=3, ncols=3)
sub_criteria_grid = (
	('importance', 'layman_friendly', 'diversity_inclusion'),
	('better_habits', 'engaging', 'entertaining_relaxing'),
	('pedagogy', 'reliability', 'backfire_risk'),
)
for grid_row, row_criteria in zip(ax, sub_criteria_grid):
	for cell_ax, cell_criterion in zip(grid_row, row_criteria):
		user_histogram(cell_ax, USER_TO_PLOT, cell_criterion)
fig.tight_layout()
fig.set_size_inches(8, 6)

In [None]:
# Correlations
# All of the user's comparisons, every criterion included.
df = PUBLIC_DATASET.comparisons[PUBLIC_DATASET.comparisons.public_username == USER_TO_PLOT][['entity_a', 'entity_b', 'criteria', 'score']]
criteria = df.criteria.unique()

# Display order of the heatmap rows/columns.
CRITERIA_ORDER = [
	"largely_recommended",
	"backfire_risk",
	"better_habits",
	"diversity_inclusion",
	"engaging",
	"entertaining_relaxing",
	"importance",
	"layman_friendly",
	"pedagogy",
	"reliability",
]

# One row per compared pair (entity_a, entity_b), one column per criterion; a criterion
# the user did not rate for that pair is left as NaN. 'first' keeps the first score
# found should a (pair, criterion) appear more than once.
scores_by_comparison = df.pivot_table(
	index=['entity_a', 'entity_b'],
	columns='criteria',
	values='score',
	aggfunc='first',
)

# Only criteria the user actually rated are kept (known ones first, in display order),
# so a never-used or unexpected criterion can no longer break the table construction.
ordered_criteria = (
	[c for c in CRITERIA_ORDER if c in scores_by_comparison.columns]
	+ [c for c in scores_by_comparison.columns if c not in CRITERIA_ORDER]
)

# Pairwise correlation between criteria; axis names are cleared so that seaborn
# does not add a 'criteria' label on both axes.
df_corr = scores_by_comparison[ordered_criteria].corr().rename_axis(index=None, columns=None)
sns.heatmap(
	df_corr,
	cmap="RdBu",
	linewidths=0.5,
	fmt=".0%",
	annot=True,
	annot_kws={"size": 8},
	vmin=-1,
	vmax=+1,
)

-----

# Distances

In [None]:
# For each node, compute the average distance to every other node
# Shortest paths longer than this many edges are not explored.
CONNECTIVITY_CUTOFF=32
def connectivity(grf:nx.Graph):
	"""Score every node by how far it sits, overall, from the rest of the graph.

	For a node n the score is N / sum(1/d(n, m)), where N is the number of nodes in
	`grf` and the sum runs over every other node m reached within CONNECTIVITY_CUTOFF
	edges, d being the shortest-path length. Nodes that are unreachable or farther than
	the cutoff add nothing to the sum, so they push the score up as if infinitely far.

	Args:
		grf: Graph to analyse.

	Returns:
		dict mapping each node to its score. A node with no reachable neighbour gets
		float('inf') instead of triggering a division by zero.
	"""
	node_count = grf.number_of_nodes()
	cnct:dict[str,float] = dict()
	for n1,tgt in nx.all_pairs_shortest_path_length(grf, cutoff=CONNECTIVITY_CUTOFF):
		# `tgt` also lists n1 itself at distance 0: skip it.
		closeness = sum(1/ln for n2,ln in tgt.items() if n2 != n1)
		cnct[n1] = node_count/closeness if closeness > 0 else float('inf')
	return cnct

def connectivity_plot(grf:nx.Graph):
	"""Show the distribution of the per-node scores returned by connectivity().

	Scores below CONNECTIVITY_CUTOFF are counted in unit-wide buckets (x-1, x] and drawn
	as a plotly bar chart. Nodes whose score reaches the cutoff are reported on stdout as
	disconnected and left out of the chart.

	Args:
		grf: Graph to analyse.

	Returns:
		The dict produced by connectivity(grf), for reuse by later cells.
	"""
	cnct = connectivity(grf)
	filtered = [x for x in cnct.values() if x < CONNECTIVITY_CUTOFF]

	# Exact complement of `filtered`: every node is either plotted or listed here.
	disc = sorted(k for k,v in cnct.items() if v >= CONNECTIVITY_CUTOFF)
	print('Disconnected videos (Ignored in the graph):', len(disc), disc)

	if not filtered:
		# Nothing to draw (empty graph, or every node beyond the cutoff).
		return cnct

	# Bucket upper bounds, then the number of scores falling in each (x-1, x].
	xx = list(range(int(min(filtered)), int(max(filtered)+2)))
	yy = [sum(1 for f in filtered if f > x-1 and f <= x) for x in xx]

	fig = px.histogram(
		x=xx, y=yy, nbins=len(xx),
		category_orders={'Videos': xx},
		labels={'x': 'Average distance (# of comparisons)', 'y': 'Videos'}, 
		text_auto=False
	)
	fig.update_layout(yaxis_title="Videos")
	# The hover text is generated with a 'sum of ' prefix: strip it.
	fig.for_each_trace(lambda t: t.update(hovertemplate=t.hovertemplate.replace('sum of ', '')))
	fig.show()
	return cnct

FCG_CONNECTIVITIES = connectivity_plot(FULL_COMPARISONS_GRAPH)

-----

# User comparisons Graph

In [None]:
# Display graph
def lens(x:float, y:float):
	"""Radially warp a 2D point: keep its direction, replace its distance d to the origin by sqrt(d).

	Args:
		x: Horizontal coordinate.
		y: Vertical coordinate.

	Returns:
		Tuple (x', y') of the warped coordinates. The origin maps to itself; without that
		guard the scale factor below would divide by zero.
	"""
	d = math.hypot(x, y)
	if d == 0:
		return (0.0, 0.0)
	r = math.sqrt(d)/d # Scale factor turning a radius d into sqrt(d)
	return (x*r, y*r)

def pos_to_graphlocs(graph:nx.Graph, pos:dict[str,list[int]]):
	"""Turn a node -> (x, y) layout into the flat coordinate lists used by the scatter traces.

	Every coordinate goes through lens() first. Edge lists hold, for each edge, both
	end points followed by None, so each edge is a separate segment. Edges with an end
	point missing from `pos` are skipped.

	Args:
		graph: Graph whose nodes and edges are converted.
		pos: Layout mapping each node to its 2D position.

	Returns:
		Tuple (edge_x, edge_y, node_x, node_y), node lists in graph iteration order.
	"""
	# Nodes: warp once per node, then split into the two axes.
	warped_nodes = [lens(*pos[vertex]) for vertex in graph]
	node_x = [wx for wx, _ in warped_nodes]
	node_y = [wy for _, wy in warped_nodes]

	# Edges: (start, end, None) triplets on each axis.
	edge_x = []
	edge_y = []
	for src, dst in graph.edges():
		if src not in pos or dst not in pos:
			continue
		sx, sy = lens(*pos[src])
		tx, ty = lens(*pos[dst])
		edge_x += [sx, tx, None]
		edge_y += [sy, ty, None]

	return (edge_x, edge_y, node_x, node_y)

def user_comparisons_graph(graph: nx.Graph, initial_pos, avg_dists:dict[str,float]):
	"""Build the interactive 2D plotly figure of a comparisons graph.

	Args:
		graph: Graph to draw.
		initial_pos: Layout mapping each node to its 2D position.
		avg_dists: Per-node value used for the marker colour and shown in the hover text;
			must contain every node of `graph`.

	Returns:
		A go.FigureWidget holding the edges trace (index 0) and the nodes trace (index 1).
	"""
	edge_x, edge_y, node_x, node_y = pos_to_graphlocs(graph, initial_pos)

	# Edges: thin grey segments, no hover.
	edge_trace = go.Scatter(
		x=edge_x,
		y=edge_y,
		mode='lines',
		hoverinfo='none',
		line=dict(width=0.5, color='#888'),
	)

	# Per-node colour value and hover label, in the graph's adjacency order.
	per_node = [(vertex, len(neighbours), avg_dists[vertex]) for vertex, neighbours in graph.adjacency()]
	node_color = [dist for _, _, dist in per_node]
	node_text = [
		f"{vertex}<br>{degree} public comparisons<br>Average distance: {dist:.1f}"
		for vertex, degree, dist in per_node
	]

	node_marker = dict(
		showscale=True,
		colorscale='Portland',
		reversescale=False,
		color=node_color,
		size=2,
		line=dict(width=0),
	)
	node_trace = go.Scatter(
		x=node_x,
		y=node_y,
		mode='markers',
		hoverinfo='text',
		marker=node_marker,
		text=node_text,
	)

	# Bare canvas: no grid, no zero line, no tick labels on either axis.
	bare_axis = dict(showgrid=False, zeroline=False, showticklabels=False)
	canvas = go.Layout(
		showlegend=False,
		hovermode='closest',
		margin=dict(b=0,l=0,r=0,t=0),
		xaxis=dict(bare_axis),
		yaxis=dict(bare_axis),
		height=720,
	)
	fig = go.FigureWidget(data=[edge_trace, node_trace], layout=canvas)

	# Fix aspect ratio
	fig.update_yaxes(scaleanchor="x", scaleratio=1)

	return fig

# Start from an equidistant spiral, then relax it with a few spring-layout iterations.
spiral_pos = nx.spiral_layout(MAX_CONNECTED_GRAPH, equidistant=True)
pos = nx.spring_layout(MAX_CONNECTED_GRAPH, pos=spiral_pos, iterations=10)
fig = user_comparisons_graph(MAX_CONNECTED_GRAPH, pos, avg_dists=FCG_CONNECTIVITIES)
fig # Display graph (should be the last line of the notebook cell)

In [None]:
# Improve graph above
TIME_TO_RUN = 120 # Seconds, the longer the prettier
STEP_TIME = 10 # Seconds

def improve_graph_pos(time_to_run:int, pos:dict[str,list[float]], callback=None, target_refresh_interval:int=None, graph:nx.Graph=None):
	"""Keep relaxing a 2D spring layout for roughly `time_to_run` seconds.

	The work is cut into batches of spring-layout iterations. After each batch the
	measured speed is used to size the next one, so that about
	`time_to_run / target_refresh_interval` batches fit in the time budget.

	Args:
		time_to_run: Time budget, in seconds.
		pos: Starting layout (node -> 2D position).
		callback: Optional function called with the new layout after every batch.
		target_refresh_interval: Wanted delay between two callbacks, in seconds.
			None or a non-positive value means "aim for 10 batches".
		graph: Graph to lay out. Defaults to the module-level MAX_CONNECTED_GRAPH.

	Returns:
		The last layout computed.
	"""
	if graph is None:
		graph = MAX_CONNECTED_GRAPH

	# perf_counter is monotonic and high resolution: a batch can never appear to take
	# zero or negative time, which would break the speed estimate below.
	start = time.perf_counter()

	if target_refresh_interval is not None and target_refresh_interval > 0:
		target_total_it_count = math.ceil(time_to_run/target_refresh_interval)
	else:
		target_total_it_count = 10
	iterations_count=10
	total_iterations=0
	timer_a = time.perf_counter()
	loops_count = 0
	while timer_a - start < time_to_run:
		loops_count += 1
		# Move nodes towards each other if connected, move them apart from each other if not connected
		pos = nx.spring_layout(graph, pos=pos, iterations=iterations_count, center=[0,0])
		total_iterations += iterations_count
		timer_b = time.perf_counter()
		speed = iterations_count / (timer_b-timer_a)
		expected_remaining_iterations = speed * (time_to_run - timer_b + start)
		if callback:
			callback(pos)
		print(f"Iterations: {total_iterations}/{total_iterations + expected_remaining_iterations:.0f} -- Time: {timer_b-start:.1f}/{time_to_run}s -- Speed: {speed:.1f}/s")
		# Spread what is left over the batches still to come (at least one).
		remaining_loops = target_total_it_count - loops_count if loops_count < target_total_it_count else 1
		next_iteration_count = int(math.ceil(expected_remaining_iterations / remaining_loops))
		if loops_count > target_total_it_count or (next_iteration_count > iterations_count*2 and loops_count > 1):
			# Spring Layout may stop iterating if found an equilibrium. Try to detect this event and stop before max_duration
			break
		# Prepare next iteration
		iterations_count = next_iteration_count
		timer_a = timer_b

	return pos

def onupdate(pos):
	"""Write a new 2D layout into the figure currently bound to the global `fig`.

	Trace 0 receives the edge coordinates and trace 1 the node coordinates, all
	inside one batch_update() block.
	"""
	with fig.batch_update():
		edge_x, edge_y, node_x, node_y = pos_to_graphlocs(MAX_CONNECTED_GRAPH, pos)
		edges_trace = fig.data[0]
		edges_trace['x'] = edge_x
		edges_trace['y'] = edge_y
		nodes_trace = fig.data[1]
		nodes_trace['x'] = node_x
		nodes_trace['y'] = node_y

# Refine the layout for TIME_TO_RUN seconds, refreshing the figure about every STEP_TIME seconds.
pos = improve_graph_pos(TIME_TO_RUN, pos, callback=onupdate, target_refresh_interval=STEP_TIME)

-----

# 3D Graph

In [None]:
# Display graph
def pos_to_graphlocs3d(graph:nx.Graph, pos3d:dict[str,list[int]]):
	"""Turn a node -> (x, y, z) layout into the flat coordinate lists used by the 3D scatter traces.

	Edge lists hold, for each edge, both end points followed by None, so each edge is a
	separate segment. Edges with an end point missing from `pos3d` are skipped.

	Args:
		graph: Graph whose nodes and edges are converted.
		pos3d: Layout mapping each node to its 3D position.

	Returns:
		Tuple (edge_x, edge_y, edge_z, node_x, node_y, node_z), node lists in graph
		iteration order.
	"""
	# Nodes: unpack each position (must have exactly three components), then split per axis.
	node_points = [(px3, py3, pz3) for px3, py3, pz3 in (pos3d[vertex] for vertex in graph)]
	node_x = [point[0] for point in node_points]
	node_y = [point[1] for point in node_points]
	node_z = [point[2] for point in node_points]

	# Edges: (start, end, None) triplets on each axis.
	edge_x, edge_y, edge_z = [], [], []
	for src, dst in graph.edges():
		if not (src in pos3d and dst in pos3d):
			continue
		sx, sy, sz = pos3d[src]
		tx, ty, tz = pos3d[dst]
		edge_x.extend((sx, tx, None))
		edge_y.extend((sy, ty, None))
		edge_z.extend((sz, tz, None))

	return (edge_x, edge_y, edge_z, node_x, node_y, node_z)

def user_comparisons_graph3d(graph: nx.Graph, initial_pos3d, colors:dict[str,float]):
	"""Build the interactive 3D plotly figure of a comparisons graph.

	Args:
		graph: Graph to draw.
		initial_pos3d: Layout mapping each node to its 3D position.
		colors: Per-node value driving both the marker colour and the marker size
			(size = 2 + 2*sqrt(value)); must contain every node of `graph`.

	Returns:
		A go.FigureWidget holding the edges trace (index 0) and the nodes trace (index 1).
	"""
	edge_x, edge_y, edge_z, node_x, node_y, node_z = pos_to_graphlocs3d(graph, initial_pos3d)

	# Edges: very thin grey segments, no hover.
	edge_trace = go.Scatter3d(
		x=edge_x,
		y=edge_y,
		z=edge_z,
		mode='lines',
		hoverinfo='none',
		line=dict(width=0.25, color='#888'),
	)

	# Per-node value and hover label, in the graph's adjacency order.
	node_values = []
	hover_labels = []
	for vertex, neighbours in graph.adjacency():
		node_values.append(colors[vertex])
		hover_labels.append(f"{vertex}<br>{len(neighbours)} public comparisons")

	marker_sizes = [2+math.sqrt(value)*2 for value in node_values]
	node_trace = go.Scatter3d(
		x=node_x,
		y=node_y,
		z=node_z,
		mode='markers',
		hoverinfo='text',
		marker=dict(
			colorscale='Portland',
			reversescale=True,
			color=node_values,
			size=marker_sizes,
			line=dict(width=0),
		),
		text=hover_labels,
	)

	canvas = go.Layout(
		showlegend=False,
		hovermode='closest',
		margin=dict(b=0,l=0,r=0,t=0),
		height=720,
	)
	fig = go.FigureWidget(data=[edge_trace, node_trace], layout=canvas)

	# Hide the three scene axes entirely.
	hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False, visible=False)
	fig.update_layout(scene=dict(
		xaxis=dict(hidden_axis),
		yaxis=dict(hidden_axis),
		zaxis=dict(hidden_axis),
	))

	# Fix aspect ratio
	fig.update_yaxes(scaleanchor="x", scaleratio=1)

	return fig

# Without an initial layout, spring_layout starts from random positions: pin the seed so
# that "Restart & Run All" always produces the same starting figure.
pos3d = nx.spring_layout(MAX_CONNECTED_GRAPH, iterations=10, dim=3, seed=42)
fig = user_comparisons_graph3d(MAX_CONNECTED_GRAPH, pos3d, colors=FCG_CONNECTIVITIES)
fig # Display graph (should be the last line of the notebook cell)

In [None]:
# Improve graph above
TIME_TO_RUN = 120 # Seconds, the longer the prettier
STEP_TIME = 10 # Seconds

def improve_graph_pos3d(time_to_run:int, pos3d:dict[str,list[float]], callback=None, target_refresh_interval:int=None, graph:nx.Graph=None):
	"""Keep relaxing a 3D spring layout for roughly `time_to_run` seconds.

	The work is cut into batches of spring-layout iterations. After each batch the
	measured speed is used to size the next one, so that about
	`time_to_run / target_refresh_interval` batches fit in the time budget.

	Args:
		time_to_run: Time budget, in seconds.
		pos3d: Starting layout (node -> 3D position).
		callback: Optional function called with the new layout after every batch.
		target_refresh_interval: Wanted delay between two callbacks, in seconds.
			None or a non-positive value means "aim for 10 batches".
		graph: Graph to lay out. Defaults to the module-level MAX_CONNECTED_GRAPH.

	Returns:
		The last layout computed.
	"""
	if graph is None:
		graph = MAX_CONNECTED_GRAPH

	# perf_counter is monotonic and high resolution: a batch can never appear to take
	# zero or negative time, which would break the speed estimate below.
	start = time.perf_counter()

	if target_refresh_interval is not None and target_refresh_interval > 0:
		target_total_it_count = math.ceil(time_to_run/target_refresh_interval)
	else:
		target_total_it_count = 10
	iterations_count=10
	total_iterations=0
	timer_a = time.perf_counter()
	loops_count = 0
	while timer_a - start < time_to_run:
		loops_count += 1
		# Move nodes towards each other if connected, move them apart from each other if not connected
		pos3d = nx.spring_layout(graph, pos=pos3d, iterations=iterations_count, center=[0,0,0], dim=3)
		total_iterations += iterations_count
		timer_b = time.perf_counter()
		speed = iterations_count / (timer_b-timer_a)
		expected_remaining_iterations = speed * (time_to_run - timer_b + start)
		if callback:
			callback(pos3d)
		print(f"Iterations: {total_iterations}/{total_iterations + expected_remaining_iterations:.0f} -- Time: {timer_b-start:.1f}/{time_to_run}s -- Speed: {speed:.1f}/s")
		# Spread what is left over the batches still to come (at least one).
		remaining_loops = target_total_it_count - loops_count if loops_count < target_total_it_count else 1
		next_iteration_count = int(math.ceil(expected_remaining_iterations / remaining_loops))
		if loops_count > target_total_it_count or (next_iteration_count > iterations_count*2 and loops_count > 1):
			# Spring Layout may stop iterating if found an equilibrium. Try to detect this event and stop before max_duration
			break
		# Prepare next iteration
		iterations_count = next_iteration_count
		timer_a = timer_b

	return pos3d

def onupdate3d(pos3d):
	"""Write a new 3D layout into the figure currently bound to the global `fig`.

	Trace 0 receives the edge coordinates and trace 1 the node coordinates, all
	inside one batch_update() block.
	"""
	with fig.batch_update():
		edge_x, edge_y, edge_z, node_x, node_y, node_z = pos_to_graphlocs3d(MAX_CONNECTED_GRAPH, pos3d)
		edges_trace = fig.data[0]
		edges_trace['x'] = edge_x
		edges_trace['y'] = edge_y
		edges_trace['z'] = edge_z
		nodes_trace = fig.data[1]
		nodes_trace['x'] = node_x
		nodes_trace['y'] = node_y
		nodes_trace['z'] = node_z

# Refine the layout for TIME_TO_RUN seconds, refreshing the figure about every STEP_TIME seconds.
pos3d = improve_graph_pos3d(TIME_TO_RUN, pos3d, callback=onupdate3d, target_refresh_interval=STEP_TIME)