In [None]:
# Python dependencies
%pip install --upgrade pip
%pip install --upgrade setuptools wheel networkx matplotlib scipy ipywidgets seaborn
%pip install "git+https://github.com/tournesol-app/tournesol.git@main#egg=solidago&subdirectory=solidago"

# If anything was installed, restart the notebook kernel

In [None]:
# Imports
import math
import time

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
from matplotlib.axes import Axes
from scipy.optimize import curve_fit
from scipy.stats import norm
from solidago.pipeline.inputs import TournesolDataset

# Result of TournesolDataset.download(); the cells below read its `comparisons`
# and `entity_id_to_video_id` attributes.
PUBLIC_DATASET = TournesolDataset.download()

In [None]:
# Username matched against the `public_username` column by every cell below
USER_TO_PLOT = 'Champi'

In [None]:
# Load comparisons
# Keep only this user's 'largely_recommended' comparisons. Both conditions are combined
# into one mask so the filter does not depend on re-aligning a full-length mask with an
# already filtered frame.
user_comparisons = (
	PUBLIC_DATASET.comparisons
	.loc[
		(PUBLIC_DATASET.comparisons.public_username == USER_TO_PLOT)
		& (PUBLIC_DATASET.comparisons.criterion == 'largely_recommended'),
		['entity_a', 'entity_b'],
	]
	# Replace both entity ids by the matching video ids
	.merge(PUBLIC_DATASET.entity_id_to_video_id, left_on='entity_a', right_index=True)
	.drop('entity_a', axis=1).rename(columns={'video_id': 'vid_a'})
	.merge(PUBLIC_DATASET.entity_id_to_video_id, left_on='entity_b', right_index=True)
	.drop('entity_b', axis=1).rename(columns={'video_id': 'vid_b'})
)
# Materialised as a list: a bare zip() is exhausted after its first use
user_comparisons_list = list(zip(user_comparisons['vid_a'], user_comparisons['vid_b']))

# Undirected graph: one node per video, one edge per compared pair
FULL_COMPARISONS_GRAPH = nx.Graph()
FULL_COMPARISONS_GRAPH.add_edges_from(user_comparisons_list)

# nx.connected_components() returns a generator; keep a list so CONNECTED_COMPONENTS
# is still usable after max() has iterated over it.
CONNECTED_COMPONENTS = list(nx.connected_components(FULL_COMPARISONS_GRAPH))
if not CONNECTED_COMPONENTS:
	raise ValueError(f"No 'largely_recommended' comparisons found for user {USER_TO_PLOT!r}")
largest_group = max(CONNECTED_COMPONENTS, key=len)

MAX_CONNECTED_GRAPH = FULL_COMPARISONS_GRAPH.subgraph(largest_group)
print('Loaded', FULL_COMPARISONS_GRAPH, 'with largest group', MAX_CONNECTED_GRAPH)

-----

# User criteria statistics

In [None]:
# User cursors position
def user_histogram(ax: Axes, username: str, CRITERION: str, title: bool=False):
	"""Draw the histogram of one user's comparison scores for one criterion.

	One bar is drawn per integer score from -10 to +10 (multiples of 5 use a
	stronger colour). When a fit can be found, a symmetric two-bell curve
	(two normal densities centred on -m and +m) is drawn over the bars.

	Args:
		ax: Matplotlib axes to draw on.
		username: Value looked up in the `public_username` column.
		CRITERION: Value looked up in the `criterion` column.
		title: If True, label both axes and put a tick on every score; if False,
			use the criterion as the axes title and put a tick every 5 scores.
	"""
	comparisons = PUBLIC_DATASET.comparisons
	votes = comparisons.loc[
		(comparisons.public_username == username) & (comparisons.criterion == CRITERION),
		'score',
	]

	ax.set_axisbelow(True)

	ax.set_xlim(xmin=-10.5, xmax=10.5)
	ax.xaxis.set_ticks(range(-10,11,1 if title else 5))
	ax.tick_params(axis='x', length=0)

	# 22 edges delimiting 21 unit-wide bins, each centred on an integer score
	bins = [i/10.0 for i in range(-105,106,10)]
	scores = list(range(-10, 11))
	# Count against the explicit edges. np.histogram(votes, bins=21) spreads its bins
	# over [votes.min(), votes.max()], which misplaces the counts whenever the user
	# never used -10 or +10.
	counts, _ = np.histogram(votes, bins=bins)
	counts_highlights = [(c if s%5 == 0 else 0) for s,c in zip(scores, counts)]
	counts_others = [(c if s%5 != 0 else 0) for s,c in zip(scores, counts)]

	# Display bars
	ax.hist(scores, bins, weights=counts_others, align='mid', color='#0088AA')
	ax.hist(scores, bins, weights=counts_highlights, align='mid', color='#0022FF')
	for edge in bins: # Plot white lines to separate columns
		ax.axvline(edge, color='white')

	# Compute 2-bell-curves approximation
	tt = len(votes)
	def binary_bell(x:float, m:float, s:float):
		# Two normal densities centred on -m and +m, scaled so their sum integrates to tt
		m = abs(m)
		return (norm(-m, s).pdf(x) + norm(+m, s).pdf(x))*tt/2

	if tt > 0:
		try:
			# Fit on the bin centres (the scores themselves), not on the left edges,
			# so that the curve is not shifted by half a bin relative to the bars
			popt, _ = curve_fit(f=binary_bell, xdata=scores, ydata=counts, p0=(5, 1))
		except RuntimeError:
			# Raised by curve_fit when the least-squares minimisation fails:
			# keep the bars and skip the curve instead of aborting the whole figure
			pass
		else:
			ax.plot(bins, [binary_bell(x, *popt) for x in bins], 'y-')

	# Display title
	if title:
		ax.set_xlabel(CRITERION + ' by "' + username + '"')
		ax.set_ylabel('Number of comparisons')
	else:
		ax.set_title(CRITERION)

	ax.set_ylim(bottom=0)


# Largely recommended
fig, ax = plt.subplots(figsize=(8, 6))
user_histogram(ax, USER_TO_PLOT, 'largely_recommended', True)

# Sub criteria: one small histogram per criterion, filled row by row on a 3x3 grid
sub_criteria = [
	'importance', 'layman_friendly', 'diversity_inclusion',
	'better_habits', 'engaging', 'entertaining_relaxing',
	'pedagogy', 'reliability', 'backfire_risk',
]
fig, ax = plt.subplots(3, 3, figsize=(8, 6))
for sub_ax, criterion in zip(ax.flat, sub_criteria):
	user_histogram(sub_ax, USER_TO_PLOT, criterion)
# tight_layout() adjusts the spacing once, for the current figure size and content,
# so it must run after the size is set and every subplot is drawn.
fig.tight_layout()

In [None]:
# Correlations
df = PUBLIC_DATASET.comparisons[PUBLIC_DATASET.comparisons.public_username == USER_TO_PLOT][['entity_a', 'entity_b', 'criterion', 'score']]
criteria = df.criterion.unique()

# One row per compared pair (entity_a, entity_b), one column per criterion actually
# present in the user's data, NaN where the pair has no score for that criterion.
# Deriving the columns from the data avoids the failures of a hard-coded criteria list
# (a criterion the user never rated, or one the list does not know).
scores_by_pair = (
	df.drop_duplicates(subset=['entity_a', 'entity_b', 'criterion'])  # keep the first score of each (pair, criterion)
	.set_index(['entity_a', 'entity_b', 'criterion'])['score']
	.unstack('criterion')
	.rename_axis(columns=None)  # no axis name, so the heatmap axes stay unlabelled
)

# 'largely_recommended' first, the other criteria in alphabetical order
ordered_criteria = sorted(scores_by_pair.columns, key=lambda c: (c != 'largely_recommended', c))

df_corr = scores_by_pair[ordered_criteria].corr()
heatmap_ax = sns.heatmap(
	df_corr,
	cmap="RdBu",
	linewidths=0.5,
	fmt=".0%",
	annot=True,
	annot_kws={"size": 8},
	vmin=-1,
	vmax=+1,
)
heatmap_ax.set_title(f'Correlations between criteria scores of "{USER_TO_PLOT}"');

-----

# Comparisons network depth graph

In [None]:
# Passed as the `cutoff` argument of the shortest-path searches in
# comparisons_network_depth(): nodes further apart than this are treated as disconnected.
CUTOFF = 16 # Nodes connected by more than that are considered disconnected

def comparisons_network_depth(debug_videos=('NW7Jv-xC3Sw', 'g_UR5FlOLKA', 'ce1-63wUlc0')):
	"""Scatter every video compared by USER_TO_PLOT by how many videos sit on each side of it.

	A directed graph is built from the user's 'largely_recommended' comparisons:
	  - score <= 0 adds the edge vid_a -> vid_b
	  - score >= 0 adds the edge vid_b -> vid_a
	  - (score == 0 therefore adds both edges)

	For every video, two sets are collected with a search depth limited to CUTOFF:
	the videos reachable from it ("descendants") and the videos it is reachable
	from ("ancestors"). A video found on both sides is kept only on the side with
	the shorter path; on a tie it counts for one half on each side.
	The figure shows the ancestors count on x and the descendants count on y.

	Args:
		debug_videos: Video ids whose detailed descendants/ties/ancestors lists are printed.
	"""
	comparisons = PUBLIC_DATASET.comparisons
	df = (comparisons
		.loc[
			(comparisons.public_username == USER_TO_PLOT) & (comparisons.criterion == 'largely_recommended'),
			['score', 'entity_a', 'entity_b'],
		]
		.merge(PUBLIC_DATASET.entity_id_to_video_id, left_on='entity_a', right_index=True)
		.drop('entity_a', axis=1).rename(columns={'video_id': 'vid_a'})
		.merge(PUBLIC_DATASET.entity_id_to_video_id, left_on='entity_b', right_index=True)
		.drop('entity_b', axis=1).rename(columns={'video_id': 'vid_b'})
	)

	dirgraph = nx.DiGraph()
	for row in df.itertuples():
		if row.score <= 0:
			dirgraph.add_edge(row.vid_a, row.vid_b)
		if row.score >= 0:
			dirgraph.add_edge(row.vid_b, row.vid_a)

	if dirgraph.number_of_nodes() == 0:
		# Nothing to plot, and max() on the empty lists below would raise
		print(f"No 'largely_recommended' comparisons found for {USER_TO_PLOT}")
		return

	xx = []  # per video: ancestors count (+ half of the ties)
	yy = []  # per video: descendants count (+ half of the ties)
	tt = []  # per video: its id, used as hover label
	for vid in dirgraph:
		# Compute for every node, how many other nodes can be accessed from and to them
		desc:dict[str,int] = dict(nx.single_source_shortest_path_length(dirgraph, vid, CUTOFF))
		ancs:dict[str,int] = dict(nx.single_target_shortest_path_length(dirgraph, vid, CUTOFF))
		# Both searches include the node itself at distance 0
		del desc[vid]
		del ancs[vid]

		# A node present on both sides stays only on its closest side; ties go to `both`
		both = {}
		for k in list(desc):
			if k in ancs:
				if desc[k] > ancs[k]:
					del desc[k]
				elif ancs[k] > desc[k]:
					del ancs[k]
				else:
					both[k] = ancs[k]
					del desc[k]
					del ancs[k]

		if vid in debug_videos:
			print(f"## {vid}\n\tBetter than:")
			for i in sorted(set(desc.values())):
				print(f"\t\tdist. {i:3d}: {', '.join([k for k in desc if desc[k] == i])}")
			print("\tEqual to:")
			for i in sorted(set(both.values())):
				print(f"\t\tdist. {i:3d}: {', '.join([k for k in both if both[k] == i])}")
			print("\tWorst than:")
			for i in sorted(set(ancs.values())):
				print(f"\t\tdist. {i:3d}: {', '.join([k for k in ancs if ancs[k] == i])}")
			print('')

		tt.append(vid)
		xx.append( len(ancs) + len(both)/2 )
		yy.append( len(desc) + len(both)/2 )

	# Scatter every node, x = number of ancestors, y = number of descendants
	fig = px.scatter(
		x=xx, y=yy, 
		hover_name=tt,
		labels={'x': 'Number of negative comparisons received', 'y': 'Number of positive comparisons received'},
	)

	# Add completion line every 10%: points where x + y equals that share of the nodes count
	for cmpl in range(10,101,10):
		val = cmpl/100*dirgraph.number_of_nodes()
		fig.add_shape(type='line', x0=0, y0=val, x1=val, y1=0, line=dict(color='lightgray', width=1))
		fig.add_annotation(text=f"{cmpl/100:.0%}", x=val/2, y=val/2, font=dict(color='gray'), textangle=45, yanchor='bottom', showarrow=False)

	# Force graph to be displayed such as x width is equal to y height
	fig.update_xaxes(scaleanchor="y", scaleratio=1)
	fig.update_yaxes(scaleanchor="x", scaleratio=1)
	fig.update_layout(width=600, height=600)

	# No overflow
	mmx = max(max(xx), max(yy))
	fig.update_xaxes(range=[0, mmx])
	fig.update_yaxes(range=[0, mmx])

	# Set figure title
	fig.update_layout(title_text=f"Direct and Indirect comparisons {USER_TO_PLOT}", title_x=0.5)

	fig.show()


# Build and display the scatter plot for USER_TO_PLOT
comparisons_network_depth()

-----

# 3D Graph

In [None]:
# Display graph
def pos_to_graphlocs3d(graph:nx.Graph, pos3d:dict[str,list[int]]):
	"""Flatten 3D node positions into per-axis coordinate lists.

	Args:
		graph: Object iterable over its nodes and exposing `edges()`.
		pos3d: Mapping node -> (x, y, z).

	Returns:
		(edge_x, edge_y, edge_z, node_x, node_y, node_z).
		Node lists hold one coordinate per node, in graph iteration order.
		Edge lists hold, for every edge whose two endpoints are in `pos3d`, the
		coordinates of the first endpoint, of the second endpoint, then None.
	"""
	# Nodes: one (x, y, z) triple per node, split per axis
	node_x, node_y, node_z = [], [], []
	for x, y, z in (pos3d[node] for node in graph):
		node_x.append(x)
		node_y.append(y)
		node_z.append(z)

	# Edges: start point, end point, then a None separator on each axis
	edge_x, edge_y, edge_z = [], [], []
	for src, dst in graph.edges():
		if src not in pos3d or dst not in pos3d:
			continue
		x0, y0, z0 = pos3d[src]
		x1, y1, z1 = pos3d[dst]
		edge_x += [x0, x1, None]
		edge_y += [y0, y1, None]
		edge_z += [z0, z1, None]

	return (edge_x, edge_y, edge_z, node_x, node_y, node_z)

def user_comparisons_graph3d(graph: nx.Graph, initial_pos3d, colors:dict[str,float]):
	"""Build an interactive 3D figure of `graph`.

	Args:
		graph: Graph to draw.
		initial_pos3d: Mapping node -> (x, y, z) position.
		colors: Mapping node -> number. It sets the marker colour (on the 'Portland'
			colour scale) and the marker size (2 + 2*sqrt(value)), so values must
			not be negative.

	Returns:
		A go.FigureWidget with two traces: the edges (data[0]) then the nodes (data[1]).
	"""
	(edge_x, edge_y, edge_z, node_x, node_y, node_z) = pos_to_graphlocs3d(graph, initial_pos3d)

	scatter_edges = go.Scatter3d(
		x=edge_x, y=edge_y, z=edge_z,
		line=dict(
			width=0.25,
			color='#888',
		),
		hoverinfo='none',
		mode='lines',
	)

	node_values = []
	node_text = []
	for node, adjacencies in graph.adjacency():
		node_values.append(colors[node])
		# The hover text shows the number of neighbours of the node within `graph`
		node_text.append(f"{node}<br>{len(adjacencies)} public comparisons")

	scatter_nodes = go.Scatter3d(
		x=node_x, y=node_y, z=node_z,
		mode='markers',
		hoverinfo='text',
		marker=dict(
			colorscale='Portland',
			reversescale=False,
			color=node_values,
			size=[2+math.sqrt(value)*2 for value in node_values],
			line=dict(width=0),
		),
		text=node_text,
	)

	fig = go.FigureWidget(data=[scatter_edges, scatter_nodes],
		layout=go.Layout(
			showlegend=False,
			hovermode='closest',
			margin=dict(b=0,l=0,r=0,t=0),
			height=720,
		),
	)

	hidden_axis = dict(showgrid=False, zeroline=False, showticklabels=False, visible=False)
	fig.update_layout(scene=dict(
		xaxis=hidden_axis,
		yaxis=hidden_axis,
		zaxis=hidden_axis,
		# Fix aspect ratio: a 3D scene is controlled by its aspect mode;
		# update_yaxes(scaleanchor=...) only targets 2D cartesian axes.
		aspectmode='data',
	))

	return fig

# NOTE(review): FCG_CONNECTIVITIES is not defined by any cell visible in this notebook.
# When it is missing, fall back to the degree of each node in the full comparisons graph
# so that the cell survives "Restart & Run All" — confirm this is the intended colouring.
if 'FCG_CONNECTIVITIES' not in globals():
	FCG_CONNECTIVITIES = dict(FULL_COMPARISONS_GRAPH.degree())

pos3d = nx.spring_layout(MAX_CONNECTED_GRAPH, iterations=10, dim=3)
fig = user_comparisons_graph3d(MAX_CONNECTED_GRAPH, pos3d, colors=FCG_CONNECTIVITIES)
fig # Display graph (should be the last line of the notebook cell)

In [None]:
# Improve graph above
TIME_TO_RUN = 120 # Seconds, passed as `time_to_run` to improve_graph_pos3d(); the longer the prettier
STEP_TIME = 10 # Seconds, passed as `target_refresh_interval` to improve_graph_pos3d()

def improve_graph_pos3d(time_to_run:int, pos3d:dict[str,list[float]], callback=None, target_refresh_interval:int=None):
	"""Refine the 3D spring layout of MAX_CONNECTED_GRAPH for about `time_to_run` seconds.

	The layout is improved in successive batches of spring_layout iterations. After
	each batch the measured speed is used to size the next one, so that the run is
	split into roughly `time_to_run / target_refresh_interval` batches.

	Args:
		time_to_run: Time budget, in seconds. With a budget <= 0 nothing is computed.
		pos3d: Initial mapping node -> (x, y, z).
		callback: Optional function called with the new positions after every batch.
		target_refresh_interval: Wanted duration of a batch, in seconds. When None or
			not positive, the run is split into 10 batches.

	Returns:
		The last computed positions (`pos3d` itself when no batch was run).
	"""
	start = time.perf_counter()

	# Number of batches aimed for over the whole run (None used to raise a TypeError here)
	if target_refresh_interval is not None and target_refresh_interval > 0:
		target_total_it_count = math.ceil(time_to_run/target_refresh_interval)
	else:
		target_total_it_count = 10
	iterations_count = 10
	total_iterations = 0
	timer_a = start
	loops_count = 0
	while timer_a - start < time_to_run:
		loops_count += 1
		# Move nodes towards each other if connected, move them apart from each other if not connected
		pos3d = nx.spring_layout(MAX_CONNECTED_GRAPH, pos=pos3d, iterations=iterations_count, center=[0,0,0], dim=3)
		total_iterations += iterations_count
		timer_b = time.perf_counter()
		# Lower bound on the elapsed time so a batch too fast to be measured cannot divide by zero
		speed = iterations_count / max(timer_b - timer_a, 1e-6)
		expected_remaining_iterations = speed * (time_to_run - timer_b + start)
		if callback:
			callback(pos3d)
		print(f"Iterations: {total_iterations}/{total_iterations + expected_remaining_iterations:.0f} -- Time: {timer_b-start:.1f}/{time_to_run}s -- Speed: {speed:.1f}/s")
		# Spread the remaining iterations over the remaining batches (at least one)
		remaining_loops = max(1, target_total_it_count - loops_count)
		next_iteration_count = int(math.ceil(expected_remaining_iterations / remaining_loops))
		if loops_count > target_total_it_count or (next_iteration_count > iterations_count*2 and loops_count > 1):
			# Spring Layout may stop iterating if found an equilibrium. Try to detect this event and stop before max_duration
			break
		# Prepare next iteration
		iterations_count = next_iteration_count
		timer_a = timer_b

	return pos3d

def onupdate3d(pos3d):
	"""Write the coordinates of a new 3D layout into the displayed figure.

	Trace 0 of `fig` receives the edge coordinates and trace 1 the node coordinates,
	all inside a single batch_update() context.
	"""
	with fig.batch_update():
		edge_x, edge_y, edge_z, node_x, node_y, node_z = pos_to_graphlocs3d(MAX_CONNECTED_GRAPH, pos3d)
		edges_trace = fig.data[0]
		edges_trace['x'] = edge_x
		edges_trace['y'] = edge_y
		edges_trace['z'] = edge_z
		nodes_trace = fig.data[1]
		nodes_trace['x'] = node_x
		nodes_trace['y'] = node_y
		nodes_trace['z'] = node_z

# Refine the layout: onupdate3d is called with the new positions after every batch,
# and the final positions are stored back into pos3d
pos3d = improve_graph_pos3d(TIME_TO_RUN, pos3d, callback=onupdate3d, target_refresh_interval=STEP_TIME)