In [None]:
# Python dependencies -- REQUIREMENT: Python >=3.9, <3.12
# `%pip` (rather than `!pip`) installs into the environment of the running kernel.
# NOTE(review): every package below is installed with `--upgrade` and without a version pin,
# so two runs of this notebook may not use the same library versions -- consider pinning.
%pip install --upgrade pip
%pip install --upgrade setuptools wheel networkx matplotlib scipy plotly nbformat
# solidago is installed from the `solidago` subdirectory of the `solidago-pipeline` branch
# of the tournesol git repository (see the URL below), not from a package index.
%pip install --upgrade "git+https://github.com/tournesol-app/tournesol.git@solidago-pipeline#egg=solidago&subdirectory=solidago"

# If anything was installed, restart the notebook kernel

In [9]:
# Imports
import math
import time

import numpy as np
import pandas as pd
import networkx as nx
import plotly.express as px
import plotly.offline as pyo
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as mtick

from matplotlib.axes import Axes
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from solidago.pipeline.inputs import TournesolInputFromPublicDataset

# Module-level dataset object shared by every analysis cell below, which read
# `PUBLIC_DATASET.comparisons`, `PUBLIC_DATASET.users` and `PUBLIC_DATASET.get_comparisons(...)`.
# NOTE(review): `download()` presumably fetches the data over the network on every
# Restart & Run All -- confirm, and consider caching the result locally.
PUBLIC_DATASET = TournesolInputFromPublicDataset.download()

In [4]:
def prepare_graph_pos(graph: nx.Graph, time_to_run:int, weight_key:str):
	"""Compute a spring layout of `graph`, spending roughly `time_to_run` seconds on it.

	The layout starts from a circular layout and is refined by repeated calls to
	`nx.spring_layout`, each call continuing from the positions of the previous one.
	After every call the measured speed (iterations per second) is used to size the
	next call so that the remaining time budget is spread over at most ~10 calls.

	Args:
		graph: Graph to lay out.
		time_to_run: Time budget, in seconds.
		weight_key: Name of the edge attribute passed as `weight` to `nx.spring_layout`.

	Returns:
		The positions returned by the last `nx.spring_layout` call.
	"""
	# perf_counter is monotonic and high-resolution; only differences between readings are used.
	start = time.perf_counter()

	# Initially put all nodes on a circle around the center
	pos = nx.circular_layout(graph)

	iterations_count = 10
	total_iterations = 0
	timer_a = time.perf_counter()
	loops_count = 0
	while timer_a - start < time_to_run:
		loops_count += 1
		# Move nodes towards each other if connected, move them apart from each other if not connected
		pos = nx.spring_layout(graph, pos=pos, weight=weight_key, iterations=iterations_count)
		total_iterations += iterations_count
		timer_b = time.perf_counter()
		# A very fast chunk can finish within one clock tick: clamp the elapsed time
		# so the speed estimate never divides by zero.
		elapsed = max(timer_b - timer_a, 1e-9)
		speed = iterations_count / elapsed
		expected_remaining_iterations = speed * (time_to_run - timer_b + start)
		print(f"Iterations: {total_iterations}/{total_iterations + expected_remaining_iterations:.0f} -- Time: {timer_b-start:.1f}/{time_to_run}s -- Speed: {speed:.1f}/s")
		# Spread the remaining iterations over the calls left (10 calls targeted in total)
		remaining_loops = 10 - loops_count if loops_count < 10 else 1
		next_iteration_count = int(math.ceil(expected_remaining_iterations / remaining_loops))
		if loops_count > 10 or (next_iteration_count > iterations_count*2 and loops_count > 1):
			# Spring Layout may stop iterating early if it found an equilibrium: the chunk then
			# looks abnormally fast and the next chunk size jumps. Detect this and stop before
			# the time budget is exhausted.
			break
		# Prepare next iteration
		iterations_count = next_iteration_count
		timer_a = timer_b

	return pos

-----

# Active Users

In [None]:
## User count over time
def _rolling_active_users(weekly_actives: dict[str, set[str]], ordered_dates: list[str], window: int) -> dict[str, set[str]]:
	"""For every date, union the active users of that date and of the `window - 1` dates before it.

	The window is clamped at the start of `ordered_dates`, so the earliest dates get a
	shorter window instead of no entry at all.
	"""
	rolling: dict[str, set[str]] = {}
	for i, date in enumerate(ordered_dates):
		window_dates = ordered_dates[max(0, i - window + 1):i + 1]
		rolling[date] = set().union(*(weekly_actives[d] for d in window_dates))
	return rolling


def usercount_graph():
	"""Plot total, yearly/monthly/weekly active and new users over about the last year.

	All series are computed from `PUBLIC_DATASET.comparisons` (columns `week_date` and
	`public_username`) and drawn on a logarithmic y axis. A user's "creation" date is the
	week of their first comparison in the dataset.
	"""
	comparisons = PUBLIC_DATASET.comparisons
	actives:dict[str,set[str]] = {k:set(lst) for k,lst in comparisons.groupby('week_date')['public_username'].unique().to_dict().items()} # date:{user, ...}
	account_creation:dict[str,str] = comparisons.groupby('public_username')['week_date'].min().to_dict() # user: date

	# Rolling windows are counted in dataset weeks, i.e. weeks having at least one comparison.
	l_dates: list[str] = sorted(actives.keys())
	activesmonth = _rolling_active_users(actives, l_dates, 4) # date:{user, ...}
	activesyear = _rolling_active_users(actives, l_dates, 52) # date:{user, ...}

	# week_date values are compared as ISO-formatted strings
	one_year_ago = (datetime.now() - relativedelta(years=1, weeks=5)).isoformat()

	dates = [d for d in l_dates if d > one_year_ago]

	# The first date only serves as the baseline for the "new users" difference,
	# which is why every plotted series starts at dates[1].
	l_total: list[int] = [sum(1 for created in account_creation.values() if created <= d) for d in dates]
	l_actives_52: list[int] = [len(activesyear[d]) for d in dates[1:]]
	l_actives_4: list[int] = [len(activesmonth[d]) for d in dates[1:]]
	l_actives: list[int] = [len(actives[d]) for d in dates[1:]]
	l_news: list[int] = [l_total[i] - l_total[i-1] for i in range(1,len(l_total))]
	l_datetimes: list[datetime] = [datetime.fromisoformat(k) for k in dates[1:]]
	l_total.pop(0)


	fig, ax = plt.subplots()
	fig.set_size_inches(14, 6)
	ax.set_yscale('log')
	# Upper limit: next power of ten above the latest total
	ax.set_ylim(ymin=1, ymax=10**math.ceil(np.log10(l_total[-1])))
	ax.set_xlim(xmin=l_datetimes[1], xmax=l_datetimes[-1])
	ax.yaxis.set_ticks_position('right')
	ax.yaxis.set_major_formatter(mtick.ScalarFormatter())
	ax.yaxis.set_minor_formatter(mtick.ScalarFormatter())
	ax.yaxis.set_tick_params('minor', labelsize=7, labelcolor='gray')

	# Major x ticks on month starts, minor x ticks on every plotted week
	myFmt = mdates.DateFormatter('%Y-%m')
	ax.xaxis.set_ticks(pd.date_range(one_year_ago, datetime.now(), freq='MS'))
	ax.xaxis.set_major_formatter(myFmt)
	ax.xaxis.set_ticks(l_datetimes, minor=True)
	ax.xaxis.set_tick_params('minor', color='gray')
	ax.grid(visible=True, which='major', axis='y', color='gray')
	ax.grid(visible=True, which='minor', axis='y', color='lightgray')
	ax.grid(visible=True, which='major', axis='x', color='gray')
	ax.grid(visible=True, which='minor', axis='x', color='lightgray', linestyle=':')
	ax.plot(l_datetimes, l_total, '|--', color='blue', label='Total users (min. 1cmp total) # Public dataset only #')
	ax.plot(l_datetimes, l_actives_52, ':', color='darkred', label='Yearly active users (min. 1cmp in the last 52 weeks)')
	ax.plot(l_datetimes, l_actives_4, '|:', color='red', label='Monthly active users (min. 1cmp in the last 4 weeks)')
	ax.plot(l_datetimes, l_actives, '|--', color='orange', label='Weekly active users (min. 1cmp in the week)')
	ax.plot(l_datetimes, l_news, '|-', color='green', label='New users (first cmp ever)')
	ax.legend()
usercount_graph()

In [None]:
## Weekly active users, grouped by the season of their first comparison
def pretty_season(season_as_datetime: datetime) -> str:
	"""Format a season start date as e.g. ``2021-01 (Jan to Mar)``.

	The label shows the year-month and abbreviated month name of `season_as_datetime`,
	followed by the abbreviated name of the month two months later.
	"""
	opening = season_as_datetime.strftime("%Y-%m (%b")
	closing_month = season_as_datetime + relativedelta(months=2)
	closing = closing_month.strftime("%b)")
	return " to ".join((opening, closing))

def first_last_users_comaprisons():
	"""Plot, per week, the number of active users stacked by the season of their first comparison.

	Users are read from `PUBLIC_DATASET.comparisons` (`public_username`, `week_date`).
	Each user is assigned to the 3-month season containing their first active week; users whose
	first and last active weeks are the same are put in a separate
	'= last comparison date' category, drawn in grey.
	"""
	# Keep only needed data, remove duplicates
	df = PUBLIC_DATASET.comparisons.drop_duplicates(subset=["public_username", "week_date"])[["public_username", "week_date"]].reset_index(drop=True)
	df["week_date"] = pd.to_datetime(df["week_date"]).astype("datetime64[ns]")  # Convert dates to sortable dates
	weeks = pd.date_range(start=df.week_date.min(), end=df.week_date.max(), freq="W-MON").to_list()  # List of all weeks

	# Categories: one season (3-month period) for every quarter between min(week_date) and max(week_date).
	# "3MS" anchors seasons on month STARTS (Jan 1, Apr 1, Jul 1, Oct 1) so they match the
	# "Jan to Mar"-style labels of pretty_season; "3M" would anchor them on month ends.
	seasons = pd.date_range(
		start=df.week_date.min().replace(month=1, day=1),
		end=df.week_date.max(),
		freq="3MS",
	).to_list()

	# One row per user with the first and last week they made a comparison
	users_seasons = df.groupby("public_username", as_index=False).agg(
		first_week=("week_date", "min"),
		last_week=("week_date", "max"),
	)

	# Season of a user = latest season starting on or before their first week
	users_seasons["season"] = users_seasons.first_week.apply(
		lambda first_week: pretty_season(max((s for s in seasons if s <= first_week), default=seasons[0]))
	)
	# Users seen during one week only get their own category
	users_seasons.loc[users_seasons.first_week.eq(users_seasons.last_week), "season"] = "single week"
	seasons_users = users_seasons.groupby("season")['public_username'].aggregate(list).to_dict()

	# For every season, count per week the distinct active users belonging to that season
	sub_dfs = []
	for s, season_usernames in seasons_users.items():
		season_df = df.loc[df.public_username.isin(season_usernames)].groupby('week_date').public_username.nunique()
		sub_dfs.append(('= last comparison date' if s == 'single week' else s, season_df))

	# Merge previous computed series into one, by week_date (weeks without users count as 0)
	dtf = pd.DataFrame({"week_date": weeks})
	for name, subdf in sub_dfs:
		dtf = pd.merge(dtf, subdf.to_frame(name=name), on="week_date", how='left').fillna(0)

	# Plot
	fig = px.bar(
		dtf,
		x="week_date",
		y=[name for name, _ in sub_dfs],
		labels={"value": "Users", "week_date": "Week", "variable": "First comparison date"},
		color_discrete_sequence=px.colors.sample_colorscale("turbo", samplepoints=len(sub_dfs)),
		color_discrete_map={'= last comparison date': 'grey'},
	)
	fig.update_layout(
		legend={'traceorder': 'reversed'},
		margin=dict(l=5, r=5, t=30, b=5)
	)
	pyo.iplot(fig)

first_last_users_comaprisons()

-----

# All users Videos vs Comparisons Scatter

In [None]:
## Users comparisons graph
plt.rc('axes', unicode_minus=False)
def users_cmp_graph():
	"""Scatter every user's comparison volume against their average comparisons per video.

	Uses the 'largely_recommended' comparisons of `PUBLIC_DATASET`. The x axis is logarithmic,
	marker size is the number of users sharing the same (x, y) point, and a logarithmic
	trend `y = a*log(x) + b` fitted on all plotted users is drawn on top.
	"""
	recom = PUBLIC_DATASET.get_comparisons(criteria='largely_recommended')
	# Each comparison appears twice: once from the point of view of each of its two videos
	mirrored = pd.concat([recom.rename(columns={'entity_a': 'vid', 'entity_b': 'comparedwith'}), recom.rename(columns={'entity_a': 'comparedwith', 'entity_b': 'vid'})], ignore_index=True)

	# NOTE(review): this counts comparison rows per user, not distinct videos, although the
	# column is named 'videos' and the x axis is labelled as a number of videos -- confirm intent.
	videos_per_user = recom.groupby('user_id')[['entity_a']].count().rename(columns={'entity_a': 'videos'})
	cmps_per_video_per_user = mirrored[['user_id', 'vid', 'comparedwith']].groupby(['user_id', 'vid']).count().groupby('user_id').mean().rename(columns={'comparedwith': 'averagecmps'})

	data = (videos_per_user.join(cmps_per_video_per_user, on='user_id')
	                       .join(PUBLIC_DATASET.users, on='user_id')
	                       .rename(columns={'videos': 'x', 'averagecmps': 'y'})
	                       [['public_username', 'x', 'y']]
	)
	data = data[data.x > 1]
	sizes = data.groupby(['x', 'y']).count().rename(columns={'public_username': 's'})
	# Least-squares fit of y = slope*log(x) + intercept
	slope, intercept = np.polyfit(np.log(data.x), data.y, 1)

	fig, ax = plt.subplots()
	fig.set_size_inches(8, 6)

	ax.set_xscale('log')
	ax.yaxis.set_major_formatter(mtick.ScalarFormatter())
	ax.xaxis.set_major_formatter(mtick.ScalarFormatter())

	ymax = data['y'].max()
	xmax = data['x'].max()
	ax.set_xlim(xmin=1, xmax=10**math.ceil(math.log10( xmax )))
	# About 10 major ticks; at least 1 so identical y values cannot yield a zero step
	y_ticks_spacing = max(1, math.ceil(( ymax - data['y'].min() )/10))
	ax.yaxis.set_ticks(np.arange(0, (1+round(ymax/y_ticks_spacing))*y_ticks_spacing, y_ticks_spacing))
	ax.yaxis.set_ticks(np.arange(0, (1+round(ymax)), 1), minor=True)
	ax.set_axisbelow(True)

	sizes.reset_index().plot.scatter(x='x', y='y', c='blue', marker='.', s='s', label='Users', ax=ax)

	# Label only the outliers (and one hard-coded username) to keep the plot readable
	to_label = data[(data.y > 12) | (data.x > 5000) | (data.public_username == 'NatNgs')]
	for row in to_label.itertuples(index=False):
		ax.annotate(row.public_username, (row.x, row.y), fontsize=5, color="#300")

	###########################################

	# The fit is linear in log(x), hence a straight line on the log-scaled x axis:
	# two points taken on the fitted curve are enough to draw it.
	poly_xx = [2, xmax]
	poly_yy = [slope*math.log(x)+intercept for x in poly_xx]
	ax.plot(poly_xx, poly_yy, color='#000', linewidth=1, label=f"Log. trend (y={slope:0.2f}*log(x){intercept:+0.2f})")
	ax.set_ylim(ymin=0, ymax=math.floor(ymax)+1)


	# plt.title('How many comparisons every Tournesol users have done')
	ax.legend(loc='upper left')
	ax.set_ylabel('Average number of comparisons per video')
	ax.set_xlabel('Total number of video compared (Log. scale)')
	ax.grid(visible=True, which='major', axis='both', color='#888')
	ax.grid(visible=True, which='minor', axis='both', color='#eee')

users_cmp_graph()

-----

# Active, Inactive & Returning users

In [None]:
## New, active, returning and inactive users as of the latest week
def _print_users_by_previous_week(previous_weeks: dict[str, str]) -> None:
	"""Print, most recent week first, how many users share each 'previous comparison' week."""
	by_week: dict[str, list[str]] = {}
	for user, week in previous_weeks.items():
		by_week.setdefault(week, []).append(user)
	for w in sorted(by_week, reverse=True):
		usrs = sorted(by_week[w])
		# The username is shown only when a single user falls in that week
		print('\tPrevious comparison on', w, ':', len(usrs), '' if len(usrs) > 1 else ('(' + usrs[0] + ')'))


def active_inactive_returning():
	"""Print counts of new, active, returning and inactive users for the latest week of the dataset.

	Based on the last two distinct `week_date` values of each `public_username` in
	`PUBLIC_DATASET.comparisons`:
	- new: the only active week is the latest week;
	- active: active during both the latest week and the week before it;
	- returning: active during the latest week, previous activity older than the week before;
	- inactive: not active during the latest week.
	"""
	# user: [previous week, last week] (a single element when the user has only one active week)
	last_activities:dict[str,list[str]] = PUBLIC_DATASET.comparisons.groupby('public_username')['week_date'].aggregate(lambda a: sorted(set(a))[-2:]).to_dict()

	maxactiv = max(map(max, last_activities.values()))
	print(maxactiv)
	# Week before the latest one, in the same YYYY-MM-DD string format as week_date
	week_1 = (datetime.fromisoformat(maxactiv) - timedelta(days = 7)).isoformat()[:10]

	newusers = [u for u,dates in last_activities.items() if dates[0] == maxactiv]
	active_users = [u for u,dates in last_activities.items() if dates == [week_1, maxactiv]]
	returning_users = {u:dates[0] for u,dates in last_activities.items() if dates[-1] == maxactiv and dates[0] < week_1}
	inactive_users = {u:dates[-1] for u,dates in last_activities.items() if dates[-1] < maxactiv}

	print('New members :', len(newusers), f"(No public comparison prior to {maxactiv})")
	print('Actives members :', len(active_users), f"(Having also done comparisons in week {week_1})")
	print('Returning members :', len(returning_users))
	_print_users_by_previous_week(returning_users)

	print('Inactive members :', len(inactive_users))
	_print_users_by_previous_week(inactive_users)


active_inactive_returning()

In [None]:
# User cursors position
def user_histogram(ax: Axes, CRITERION: str, title: bool=False):
	"""Draw on `ax` the trust-weighted histogram of comparison scores for one criterion.

	Every (user, score) pair is weighted by the number of such comparisons multiplied by the
	user's `trust_score`. Scores are binned into 21 unit-wide bins centred on the integers
	-10..10; bars at multiples of 5 are drawn in a stronger colour.

	Args:
		ax: Matplotlib axes to draw on.
		CRITERION: Value of the `criteria` column to keep; also used as the axes title.
		title: When True, x ticks are placed on every integer instead of every 5.
	"""
	users = PUBLIC_DATASET.users[['public_username', 'trust_score']].set_index('public_username').rename(columns={'trust_score':'weight'})
	votes = (PUBLIC_DATASET.comparisons
		.loc[PUBLIC_DATASET.comparisons.criteria == CRITERION]
		.groupby(['public_username', 'score'], as_index=False)
		.count()
		[['public_username', 'score', 'criteria']]
		.rename(columns={'criteria':'count'})
		.join(users, on='public_username')
	)
	# NOTE(review): a user without a trust_score would give a NaN weight here -- confirm this cannot happen.
	votes['pct'] = votes['count']*votes['weight']

	ax.set_axisbelow(True)

	ax.set_xlim(xmin=-10.5, xmax=10.5)
	ax.xaxis.set_ticks(range(-10,11,1 if title else 5))
	ax.tick_params(axis='x', length=0)

	# Fixed edges -10.5, -9.5, ..., 10.5: the counts are computed on the very same bins that
	# are drawn below, whatever the range of scores actually present for this criterion.
	bins = [i/10.0 for i in range(-105,106,10)]
	counts, _ = np.histogram(votes['score'], weights=votes['pct'], bins=bins)
	# enumerate(..., -10) pairs each count with the integer score its bin is centred on
	counts_highlights = [(b if i%5 == 0 else 0) for i,b in enumerate(counts, -10)]
	counts_others = [(b if i%5 != 0 else 0) for i,b in enumerate(counts, -10)]

	# The counts are already binned: hist() is only used to draw them, one sample per bin
	ax.hist(bins[:-1], bins, weights=counts_others, align='mid', color='#0088AA')
	ax.hist(bins[:-1], bins, weights=counts_highlights, align='mid', color='#0022FF')
	for i in bins: # Plot white lines to separate columns
		ax.axvline(i, color='white')

	ax.set_title(CRITERION)
	ax.set_ylim(bottom=0)


# Largely recommended
fig, ax = plt.subplots(figsize=(8, 6))
user_histogram(ax, 'largely_recommended')
ax.set_ylabel('Number of comparisons')

# Sub criteria, laid out row by row on a 3x3 grid
SUB_CRITERIA = [
	'reliability', 'pedagogy', 'importance',
	'layman_friendly', 'entertaining_relaxing', 'engaging',
	'diversity_inclusion', 'better_habits', 'backfire_risk',
]
# The figure size is set before tight_layout() so the layout is computed for the final size
fig, ax = plt.subplots(3, 3, figsize=(8, 6))
for sub_ax, criterion in zip(ax.flat, SUB_CRITERIA):
	user_histogram(sub_ax, criterion)
fig.tight_layout()