diff --git a/backend/requirements.txt b/backend/requirements.txt
index b882d5947c..dd11aa18e0 100644
--- a/backend/requirements.txt
+++ b/backend/requirements.txt
@@ -5,6 +5,7 @@ numpy==1.22.4
 psycopg2-binary==2.9.5
 pydantic==1.9.1
 python-dotenv==0.21.0
+scipy==1.8.1
 SQLAlchemy==1.4.41
 sqlmodel==0.0.8
 starlette==0.22.0
diff --git a/scripts/postprocessing/infogain_selector.py b/scripts/postprocessing/infogain_selector.py
new file mode 100644
index 0000000000..51f60fa7d1
--- /dev/null
+++ b/scripts/postprocessing/infogain_selector.py
@@ -0,0 +1,98 @@
+# -*- coding: utf-8 -*-
+import numpy as np
+from scipy.integrate import nquad
+from scipy.special import gammaln, psi
+from scipy.stats import dirichlet
+
+
+def make_range(*x):
+    """
+    Constructs the integration range for the next simplex coordinate,
+    given the already-fixed entries x: (0, 1 - (x_1 + ... + x_k)).
+    """
+    return (0, max(0, 1 - sum(x)))
+
+
+def relative_entropy(p, q):
+    """
+    Relative entropy (KL divergence, in bits) of the two given Dirichlet
+    densities.
+    """
+
+    def tmp(*x):
+        """
+        First appends the last, fully determined simplex entry
+        (x_N = 1 - (x_1 + ... + x_(N-1))) to the input, then computes the
+        relative entropy of posterior and prior at that point.
+        """
+        x_new = np.append(x, 1 - sum(x))
+        return p(x_new) * np.log2(p(x_new) / q(x_new))
+
+    return tmp
+
+
+def naive_monte_carlo_integral(fun, dim, samples=10_000_000):
+    # consecutive differences of sorted uniform variates (with 0 and 1
+    # appended) are uniformly distributed points on the simplex
+    s = np.random.rand(dim - 1, samples)
+    s = np.sort(np.concatenate((np.zeros((1, samples)), s, np.ones((1, samples)))), 0)
+    pos = np.diff(s, axis=0)
+    res = fun(pos)
+    return np.mean(res)
+
+
+def analytic_solution(a_post, a_prior):
+    """
+    Analytic solution (in nats) to the KL divergence between two Dirichlet
+    distributions. Proof is in the Notion design doc. Divide by ln(2) to
+    compare against the log2-based numerical integration above.
+    """
+    post_sum = np.sum(a_post)
+    prior_sum = np.sum(a_prior)
+    # note the (a_post - a_prior) term enters with a positive sign
+    info = (
+        gammaln(post_sum)
+        - gammaln(prior_sum)
+        - np.sum(gammaln(a_post))
+        + np.sum(gammaln(a_prior))
+        + np.sum((a_post - a_prior) * (psi(a_post) - psi(post_sum)))
+    )
+
+    return info
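+
+
+# A minimal sanity check of the closed form above (illustrative only, not
+# part of the selector API): the KL divergence of a Dirichlet with itself
+# must be exactly zero, and it becomes strictly positive once the posterior
+# moves away from the prior.
+def _check_analytic_solution():
+    alpha = np.array([1.0, 2.0, 3.0])
+    assert np.isclose(analytic_solution(alpha, alpha), 0.0)
+    assert analytic_solution(alpha + np.array([5.0, 0.0, 0.0]), alpha) > 0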
+
+
+def infogain(a_post, a_prior):
+    # TODO: maybe remove this function entirely
+    raise NotImplementedError(
+        """For the love of god don't use this:
+        it's insanely poorly conditioned, the worst numerical code I have ever written,
+        and it's slow as molasses. Use the analytic solution instead."""
+    )
+    # unreachable; kept only to document the numerical approach
+    args = len(a_prior)
+    p = dirichlet(a_post).pdf
+    q = dirichlet(a_prior).pdf
+    (info, _) = nquad(relative_entropy(p, q), [make_range for _ in range(args - 1)], opts={"epsabs": 1e-8})
+    # info = naive_monte_carlo_integral(relative_entropy(p, q), len(a_post))
+    return info
+
+
+def uniform_expected_infogain(a_prior):
+    """
+    Expected information gain of one additional vote: each outcome i is
+    weighted by its predictive probability (the prior's mean) and scored by
+    the KL divergence of the resulting posterior against the prior.
+    """
+    mean_weight = dirichlet.mean(a_prior)
+    results = []
+    for i, w in enumerate(mean_weight):
+        a_post = a_prior.copy()
+        a_post[i] = a_post[i] + 1
+        results.append(w * analytic_solution(a_post, a_prior))
+    return np.sum(results)
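+
+
+# Sketch of how the selector could drive question archiving (a hypothetical
+# helper, not wired into the backend; the name `should_archive` and the
+# default threshold are assumptions for illustration): archive once one more
+# vote is no longer expected to yield at least `min_bits` bits of information.
+def should_archive(a_prior, min_bits=0.01):
+    # analytic_solution (and hence the expected infogain) is in nats,
+    # so convert to bits before comparing against the threshold
+    return uniform_expected_infogain(a_prior) / np.log(2) < min_bits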
+
+
+if __name__ == "__main__":
+    a_prior = np.array([1, 1, 9, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+    a_post = np.array([1, 1, 20, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
+
+    print("algebraic", analytic_solution(a_post, a_prior))
+    # print("raw", infogain(a_post, a_prior))
+    print("large infogain", uniform_expected_infogain(a_prior))
+    print("post infogain", uniform_expected_infogain(a_post))
+    # a_prior = np.array([1, 1, 1000])
+    # print("small infogain", uniform_expected_infogain(a_prior))
diff --git a/scripts/postprocessing/scoring.py b/scripts/postprocessing/scoring.py
new file mode 100644
index 0000000000..3c145b285b
--- /dev/null
+++ b/scripts/postprocessing/scoring.py
@@ -0,0 +1,183 @@
+# -*- coding: utf-8 -*-
+from dataclasses import dataclass, replace
+from typing import Any
+
+import numpy as np
+import numpy.typing as npt
+from scipy.stats import kendalltau
+
+
+@dataclass
+class Voter:
+    """
+    Represents a single voter.
+    Tabulates the number of good votes, total votes, and points.
+    We only put well-behaved people on the scoreboard and filter out the
+    badly behaved ones.
+    """
+
+    uid: Any
+    num_votes: int
+    num_good_votes: int
+    num_prompts: int
+    num_good_prompts: int
+    num_rankings: int
+    num_good_rankings: int
+
+    #####################
+    voting_points: int
+    prompt_points: int
+    ranking_points: float
+
+    def voter_quality(self):
+        # voters with no activity yet count as quality 0
+        return self.num_good_votes / self.num_votes if self.num_votes else 0.0
+
+    def rank_quality(self):
+        return self.num_good_rankings / self.num_rankings if self.num_rankings else 0.0
+
+    def prompt_quality(self):
+        return self.num_good_prompts / self.num_prompts if self.num_prompts else 0.0
+
+    def is_well_behaved(self, threshold_vote, threshold_prompt, threshold_rank):
+        return (
+            self.voter_quality() > threshold_vote
+            and self.prompt_quality() > threshold_prompt
+            and self.rank_quality() > threshold_rank
+        )
+
+    def total_points(self, voting_weight, prompt_weight, ranking_weight):
+        return (
+            voting_weight * self.voting_points
+            + prompt_weight * self.prompt_points
+            + ranking_weight * self.ranking_points
+        )
+
+
+def score_update_votes(new_vote: int, consensus: npt.ArrayLike, voter_data: Voter) -> Voter:
+    """
+    Returns the new "quality score" and points for a voter after that voter
+    cast a vote on a question.
+
+    This function is only to be run when archiving a question, i.e. when the
+    question has had sufficiently many votes or we can't gain more than "K"
+    bits of information.
+
+    The consensus is the array of all votes cast by all voters for that
+    question; we update the voter data using this new information.
+
+    Parameters:
+        new_vote (int): the index of the vote cast by the voter
+        consensus (ArrayLike): all votes cast for this question
+        voter_data (Voter): the Voter that cast the new_vote
+
+    Returns:
+        updated_voter (Voter): the new "quality score" and points for the voter
+    """
+    # the double argsort produces the rank of each vote count, e.g. for
+    # [100, 300, 200] it returns [0, 2, 1], since 100 is the lowest,
+    # 300 the highest and 200 the middle value
+    consensus_ranking = np.argsort(np.argsort(consensus))
+    new_points = consensus_ranking[new_vote] + voter_data.voting_points
+
+    # ranks are 0-indexed, so a vote counts as good when its rank lands in
+    # the upper half of the consensus, i.e. strictly above (len - 1) / 2
+    new_good_votes = int(consensus_ranking[new_vote] > (len(consensus) - 1) / 2) + voter_data.num_good_votes
+    new_num_votes = voter_data.num_votes + 1
+    return replace(voter_data, num_votes=new_num_votes, num_good_votes=new_good_votes, voting_points=new_points)
+
+
+def score_update_prompts(consensus: npt.ArrayLike, voter_data: Voter) -> Voter:
+    """
+    Returns the gain of points for a given prompt's votes.
+
+    This function is only to be run when archiving a question, i.e. when the
+    question has had sufficiently many votes or we can't gain more than "K"
+    bits of information.
+
+    Parameters:
+        consensus (ArrayLike): all votes cast for this question
+        voter_data (Voter): the Voter that wrote the prompt
+
+    Returns:
+        updated_voter (Voter): the new "quality score" and points for the voter
+    """
+    # assigns each vote option a positional weight that increases from the
+    # low end of the scale to the high end, e.g. [-1, 0, 1, 2] for four
+    # options; the prompt then scores the weighted sum of its vote counts,
+    # so a positive delta means the votes lean towards the "good" end
+    consensus_ranking = np.arange(len(consensus)) - len(consensus) // 2 + 1
+    delta_votes = np.sum(consensus_ranking * consensus)
+    new_points = delta_votes + voter_data.prompt_points
+
+    new_good_prompts = int(delta_votes > 0) + voter_data.num_good_prompts
+    new_num_prompts = voter_data.num_prompts + 1
+    return replace(
+        voter_data,
+        num_prompts=new_num_prompts,
+        num_good_prompts=new_good_prompts,
+        prompt_points=new_points,
+    )
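+
+
+# Worked example of the two update rules above (illustrative only): checks
+# the double-argsort rank trick and the positional prompt weights on a
+# three-option consensus.
+def _check_score_updates():
+    consensus = np.array([100, 300, 200])
+    # double argsort: [100, 300, 200] -> ranks [0, 2, 1]
+    assert list(np.argsort(np.argsort(consensus))) == [0, 2, 1]
+    # positional weights for three options: arange(3) - 3 // 2 + 1 = [0, 1, 2]
+    weights = np.arange(3) - 3 // 2 + 1
+    assert list(weights) == [0, 1, 2]
+    # weighted vote count: 0 * 100 + 1 * 300 + 2 * 200 = 700
+    assert np.sum(weights * consensus) == 700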
+
+
+def score_update_ranking(user_ranking: npt.ArrayLike, consensus_ranking: npt.ArrayLike, voter_data: Voter) -> Voter:
+    """
+    Returns the gain of points for a given ranking's votes.
+
+    This function is only to be run when archiving a question, i.e. when the
+    question has had sufficiently many votes or we can't gain more than "K"
+    bits of information.
+
+    We compare the two rankings with Kendall's tau, an affine transform of
+    the bubble-sort (Kendall tau) distance. We use it over the Spearman
+    correlation since:
+    "[Kendall's tau] approaches a normal distribution more rapidly than rho,
+    as N, the sample size, increases; and tau is also more tractable
+    mathematically, particularly when ties are present"
+    Gilpin, A. R. (1993). Table for conversion of Kendall's tau to
+    Spearman's rho within the context of measures of magnitude of effect
+    for meta-analysis.
+
+    Further, in "Research Design and Statistical Analysis" (second edition,
+    2003) the authors note that, at least from a significance-test point of
+    view, the two will yield the same p-values.
+
+    Parameters:
+        user_ranking (ArrayLike): ranking produced by the user
+        consensus_ranking (ArrayLike): consensus ranking produced by the voting algorithm
+        voter_data (Voter): the Voter that produced the user_ranking
+
+    Returns:
+        updated_voter (Voter): the new "quality score" and points for the voter
+    """
+    # kendalltau returns the correlation tau in [-1, 1], not a distance
+    tau, _p_value = kendalltau(user_ranking, consensus_ranking)
+    # normalize tau into the [0, 1] range: 1 means the user ranking matches
+    # the consensus exactly, 0 means it is fully reversed
+    similarity = (1 + tau) / 2
+    new_points = similarity + voter_data.ranking_points
+    new_good_rankings = int(similarity > 0.5) + voter_data.num_good_rankings
+    new_num_rankings = voter_data.num_rankings + 1
+    return replace(
+        voter_data,
+        num_rankings=new_num_rankings,
+        num_good_rankings=new_good_rankings,
+        ranking_points=new_points,
+    )
+
+
+if __name__ == "__main__":
+    demo_voter = Voter(
+        "abc",
+        num_votes=10,
+        num_good_votes=2,
+        num_prompts=10,
+        num_good_prompts=2,
+        num_rankings=10,
+        num_good_rankings=2,
+        voting_points=6,
+        prompt_points=0,
+        ranking_points=0,
+    )
+    new_vote = 3
+    consensus = np.array([200, 300, 100, 500])
+    print(demo_voter)
+    print("best vote ", score_update_votes(new_vote, consensus, demo_voter))
+    new_vote = 2
+    print("worst vote ", score_update_votes(new_vote, consensus, demo_voter))
+    new_vote = 1
+    print("medium vote ", score_update_votes(new_vote, consensus, demo_voter))
+    print("prompt writer", score_update_prompts(consensus, demo_voter))
+    print("best rank ", score_update_ranking(np.array([0, 2, 1]), np.array([0, 2, 1]), demo_voter))
+    print("medium rank ", score_update_ranking(np.array([2, 0, 1]), np.array([0, 2, 1]), demo_voter))
+    print("worst rank ", score_update_ranking(np.array([1, 0, 2]), np.array([0, 2, 1]), demo_voter))
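+
+    # Illustrative extra checks (not part of the original demo): identical
+    # rankings normalize to the maximum similarity of 1.0, fully reversed
+    # rankings to 0.0.
+    tau, _ = kendalltau(np.array([0, 1, 2]), np.array([0, 1, 2]))
+    print("identical rank similarity", (1 + tau) / 2)
+    tau, _ = kendalltau(np.array([0, 1, 2]), np.array([2, 1, 0]))
+    print("reversed rank similarity ", (1 + tau) / 2)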