From 31f81a0c9749a9d61ab6d028e3c0e6dc5124e2c2 Mon Sep 17 00:00:00 2001
From: lsabor
Date: Sat, 24 Aug 2024 16:58:56 -0700
Subject: [PATCH] make scoring much more efficient by only calculating cp
 history once

---
 scoring/score_math.py | 267 ++++++++++++++++++++++--------------------
 scoring/utils.py      |  58 ++++++-----
 2 files changed, 169 insertions(+), 156 deletions(-)

diff --git a/scoring/score_math.py b/scoring/score_math.py
index ad314acc98..6645287d01 100644
--- a/scoring/score_math.py
+++ b/scoring/score_math.py
@@ -143,7 +143,7 @@ def evaluate_forecasts_baseline_spot_forecast(
             else:
                 baseline = (1 - 0.05 * open_bounds_count) / (len(pmf) - 2)
             forecast_score = 100 * np.log(pmf[resolution_bucket] / baseline) / 2
-            forecast_scores.append(ForecastScore(forecast_score))
+            forecast_scores.append(ForecastScore(forecast_score, 1.0))
         else:
             forecast_scores.append(ForecastScore(0))
     return forecast_scores
@@ -157,9 +157,10 @@ def evaluate_forecasts_peer_accuracy(
     actual_close_time: float,
     forecast_horizon_end: float,
     question_type: str,
+    geometric_means: list[AggregationEntry] | None = None,
 ) -> list[ForecastScore]:
     base_forecasts = base_forecasts or forecasts
-    geometric_mean_forecasts = get_geometric_means(base_forecasts)
+    geometric_mean_forecasts = geometric_means or get_geometric_means(base_forecasts)
     for gm in geometric_mean_forecasts:
         gm.timestamp = max(gm.timestamp, forecast_horizon_start)
     total_duration = forecast_horizon_end - forecast_horizon_start
@@ -214,9 +215,10 @@ def evaluate_forecasts_peer_spot_forecast(
     resolution_bucket: int,
     spot_forecast_timestamp: float,
     question_type: str,
+    geometric_means: list[AggregationEntry] | None = None,
 ) -> list[ForecastScore]:
     base_forecasts = base_forecasts or forecasts
-    geometric_mean_forecasts = get_geometric_means(base_forecasts)
+    geometric_mean_forecasts = geometric_means or get_geometric_means(base_forecasts)
     g = None
     for gm in geometric_mean_forecasts[::-1]:
         if gm.timestamp < spot_forecast_timestamp:
@@ -240,7 +242,7 @@
             )
             if question_type in ["numeric", "date"]:
                 forecast_score /= 2
-            forecast_scores.append(ForecastScore(forecast_score))
+            forecast_scores.append(ForecastScore(forecast_score, 1.0))
         else:
             forecast_scores.append(ForecastScore(0))
     return forecast_scores
@@ -305,7 +307,7 @@
 def evaluate_question(
     question: Question,
     resolution_bucket: int,
-    score_type: Score.ScoreTypes,
+    score_types: list[Score.ScoreTypes],
     spot_forecast_timestamp: float | None = None,
 ) -> list[Score]:
     forecast_horizon_start = question.open_time.timestamp()
@@ -318,133 +320,142 @@ def evaluate_question(
         minimize=False,
         aggregation_method=AggregationMethod.RECENCY_WEIGHTED,
     )
+    geometric_means: list[AggregationEntry] = []
 
-    score_types = Score.ScoreTypes
-    match score_type:
-        case score_types.BASELINE:
-            open_bounds_count = bool(question.open_upper_bound) + bool(
-                question.open_lower_bound
-            )
-            user_scores = evaluate_forecasts_baseline_accuracy(
-                user_forecasts,
-                resolution_bucket,
-                forecast_horizon_start,
-                actual_close_time,
-                forecast_horizon_end,
-                question.type,
-                open_bounds_count,
-            )
-            community_scores = evaluate_forecasts_baseline_accuracy(
-                community_forecasts,
-                resolution_bucket,
-                forecast_horizon_start,
-                actual_close_time,
-                forecast_horizon_end,
-                question.type,
-                open_bounds_count,
-            )
-        case score_types.SPOT_BASELINE:
-            open_bounds_count = bool(question.open_upper_bound) + bool(
-                question.open_lower_bound
-            )
-            user_scores = evaluate_forecasts_baseline_spot_forecast(
-                user_forecasts,
-                resolution_bucket,
-                spot_forecast_timestamp,
-                question.type,
-                open_bounds_count,
-            )
-            community_scores = evaluate_forecasts_baseline_spot_forecast(
-                community_forecasts,
-                resolution_bucket,
-                spot_forecast_timestamp,
-                question.type,
-                open_bounds_count,
-            )
-        case score_types.PEER:
-            user_scores = evaluate_forecasts_peer_accuracy(
-                user_forecasts,
-                user_forecasts,
-                resolution_bucket,
-                forecast_horizon_start,
-                actual_close_time,
-                forecast_horizon_end,
-                question.type,
-            )
-            community_scores = evaluate_forecasts_peer_accuracy(
-                community_forecasts,
-                user_forecasts,
-                resolution_bucket,
-                forecast_horizon_start,
-                actual_close_time,
-                forecast_horizon_end,
-                question.type,
-            )
-        case score_types.SPOT_PEER:
-            user_scores = evaluate_forecasts_peer_spot_forecast(
-                user_forecasts,
-                user_forecasts,
-                resolution_bucket,
-                spot_forecast_timestamp,
-                question.type,
-            )
-            community_scores = evaluate_forecasts_peer_spot_forecast(
-                community_forecasts,
-                user_forecasts,
-                resolution_bucket,
-                spot_forecast_timestamp,
-                question.type,
-            )
-        case score_types.RELATIVE_LEGACY:
-            user_scores = evaluate_forecasts_legacy_relative(
-                user_forecasts,
-                community_forecasts,
-                resolution_bucket,
-                forecast_horizon_start,
-                actual_close_time,
-                forecast_horizon_end,
-            )
-            community_scores = evaluate_forecasts_legacy_relative(
-                community_forecasts,
-                community_forecasts,
-                resolution_bucket,
-                forecast_horizon_start,
-                actual_close_time,
-                forecast_horizon_end,
-            )
-        case other:
-            raise NotImplementedError(f"Score type {other} not implemented")
+    ScoreTypes = Score.ScoreTypes
+    if ScoreTypes.PEER in score_types:
+        geometric_means = get_geometric_means(user_forecasts)
 
     scores: list[Score] = []
-    users = {forecast.author for forecast in user_forecasts}
-    for user in users:
-        user_score = 0
-        user_coverage = 0
-        for forecast, score in zip(user_forecasts, user_scores):
-            if forecast.author == user:
-                user_score += score.score
-                user_coverage += score.coverage
-        if user_coverage > 0:
-            scores.append(
-                Score(
-                    user=user,
-                    score=user_score,
-                    coverage=user_coverage,
-                    score_type=score_type,
-                )
-            )
-    community_score = 0
-    community_coverage = 0
-    for score in community_scores:
-        community_score += score.score
-        community_coverage += score.coverage
-    scores.append(
-        Score(
-            user=None,
-            aggregation_method=AggregationMethod.RECENCY_WEIGHTED,
-            score=community_score,
-            coverage=community_coverage,
-            score_type=score_type,
-        )
-    )
+    for score_type in score_types:
+        match score_type:
+            case ScoreTypes.BASELINE:
+                open_bounds_count = bool(question.open_upper_bound) + bool(
+                    question.open_lower_bound
+                )
+                user_scores = evaluate_forecasts_baseline_accuracy(
+                    user_forecasts,
+                    resolution_bucket,
+                    forecast_horizon_start,
+                    actual_close_time,
+                    forecast_horizon_end,
+                    question.type,
+                    open_bounds_count,
+                )
+                community_scores = evaluate_forecasts_baseline_accuracy(
+                    community_forecasts,
+                    resolution_bucket,
+                    forecast_horizon_start,
+                    actual_close_time,
+                    forecast_horizon_end,
+                    question.type,
+                    open_bounds_count,
+                )
+            case ScoreTypes.SPOT_BASELINE:
+                open_bounds_count = bool(question.open_upper_bound) + bool(
+                    question.open_lower_bound
+                )
+                user_scores = evaluate_forecasts_baseline_spot_forecast(
+                    user_forecasts,
+                    resolution_bucket,
+                    spot_forecast_timestamp,
+                    question.type,
+                    open_bounds_count,
+                )
+                community_scores = evaluate_forecasts_baseline_spot_forecast(
+                    community_forecasts,
+                    resolution_bucket,
+                    spot_forecast_timestamp,
+                    question.type,
+                    open_bounds_count,
+                )
+            case ScoreTypes.PEER:
+                user_scores = evaluate_forecasts_peer_accuracy(
+                    user_forecasts,
+                    user_forecasts,
+                    resolution_bucket,
+                    forecast_horizon_start,
+                    actual_close_time,
+                    forecast_horizon_end,
+                    question.type,
+                    geometric_means=geometric_means,
+                )
+                community_scores = evaluate_forecasts_peer_accuracy(
+                    community_forecasts,
+                    user_forecasts,
+                    resolution_bucket,
+                    forecast_horizon_start,
+                    actual_close_time,
+                    forecast_horizon_end,
+                    question.type,
+                    geometric_means=geometric_means,
+                )
+            case ScoreTypes.SPOT_PEER:
+                user_scores = evaluate_forecasts_peer_spot_forecast(
+                    user_forecasts,
+                    user_forecasts,
+                    resolution_bucket,
+                    spot_forecast_timestamp,
+                    question.type,
+                    geometric_means=geometric_means,
+                )
+                community_scores = evaluate_forecasts_peer_spot_forecast(
+                    community_forecasts,
+                    user_forecasts,
+                    resolution_bucket,
+                    spot_forecast_timestamp,
+                    question.type,
+                    geometric_means=geometric_means,
+                )
+            case ScoreTypes.RELATIVE_LEGACY:
+                user_scores = evaluate_forecasts_legacy_relative(
+                    user_forecasts,
+                    community_forecasts,
+                    resolution_bucket,
+                    forecast_horizon_start,
+                    actual_close_time,
+                    forecast_horizon_end,
+                )
+                community_scores = evaluate_forecasts_legacy_relative(
+                    community_forecasts,
+                    community_forecasts,
+                    resolution_bucket,
+                    forecast_horizon_start,
+                    actual_close_time,
+                    forecast_horizon_end,
+                )
+            case other:
+                raise NotImplementedError(f"Score type {other} not implemented")
+
+        users = {forecast.author for forecast in user_forecasts}
+        for user in users:
+            user_score = 0
+            user_coverage = 0
+            for forecast, score in zip(user_forecasts, user_scores):
+                if forecast.author == user:
+                    user_score += score.score
+                    user_coverage += score.coverage
+            if user_coverage > 0:
+                scores.append(
+                    Score(
+                        user=user,
+                        score=user_score,
+                        coverage=user_coverage,
+                        score_type=score_type,
+                    )
+                )
+        community_score = 0
+        community_coverage = 0
+        for score in community_scores:
+            community_score += score.score
+            community_coverage += score.coverage
+        scores.append(
+            Score(
+                user=None,
+                aggregation_method=AggregationMethod.RECENCY_WEIGHTED,
+                score=community_score,
+                coverage=community_coverage,
+                score_type=score_type,
+            )
+        )
     return scores
diff --git a/scoring/utils.py b/scoring/utils.py
index faf9495dad..5d0ce251c2 100644
--- a/scoring/utils.py
+++ b/scoring/utils.py
@@ -22,39 +22,41 @@
 def score_question(
     question: Question,
     resolution: str,
-    spot_forecast_time: datetime | None = None,
+    spot_forecast_time: float | None = None,
     score_types: list[str] | None = None,
 ):
     resolution_bucket = string_location_to_bucket_index(resolution, question)
+    spot_forecast_time = spot_forecast_time or question.cp_reveal_time.timestamp()
     score_types = score_types or Score.ScoreTypes.choices
-    for score_type in score_types:
-        seen = set()
-        previous_scores = list(
-            Score.objects.filter(question=question, score_type=score_type)
-        )
-        new_scores = evaluate_question(
-            question, resolution_bucket, score_type, spot_forecast_time
-        )
-        for new_score in new_scores:
-            is_new = True
-            for previous_score in previous_scores:
-                if (previous_score.user == new_score.user) and (
-                    previous_score.aggregation_method == new_score.aggregation_method
-                ):
-                    is_new = False
-                    previous_score.score = new_score.score
-                    previous_score.coverage = new_score.coverage
-                    previous_score.edited_at = question.resolution_set_time
-                    previous_score.save()
-                    seen.add(previous_score)
-                    break
-            if is_new:
-                new_score.question = question
-                new_score.edited_at = question.resolution_set_time
-                new_score.save()
-        for previous_score in previous_scores:
-            if previous_score not in seen:
-                previous_score.delete()
+    seen = set()
+    previous_scores = list(
+        Score.objects.filter(question=question, score_type__in=score_types)
+    )
+    new_scores = evaluate_question(
+        question, resolution_bucket, score_types, spot_forecast_time
+    )
+    for new_score in new_scores:
+        is_new = True
+        for previous_score in previous_scores:
+            if (
+                (previous_score.user == new_score.user)
+                and (previous_score.aggregation_method == new_score.aggregation_method)
+                and (previous_score.score_type == new_score.score_type)
+            ):
+                is_new = False
+                previous_score.score = new_score.score
+                previous_score.coverage = new_score.coverage
+                previous_score.edited_at = question.resolution_set_time
+                previous_score.save()
+                seen.add(previous_score)
+                break
+        if is_new:
+            new_score.question = question
+            new_score.edited_at = question.resolution_set_time
+            new_score.save()
+    for previous_score in previous_scores:
+        if previous_score not in seen:
+            previous_score.delete()
 
 
 def generate_scoring_leaderboard_entries(
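
Editor's note: the pattern this patch applies is "hoist the expensive aggregate out of the per-score-type loop." Below is a minimal, self-contained sketch of that caching pattern. It is illustrative only, not the repository's actual API: AggregationEntry is reduced to a single pmf snapshot, and get_geometric_means and peer_score are hypothetical one-step stand-ins for the real history-based functions.

import numpy as np
from dataclasses import dataclass


@dataclass
class AggregationEntry:
    timestamp: float
    pmf: np.ndarray  # probability mass over outcome buckets


def get_geometric_means(forecasts: list[list[float]]) -> AggregationEntry:
    # The expensive step: elementwise geometric mean of every forecaster's
    # pmf. The real code builds a whole time-history of these entries.
    gm = np.exp(np.log(np.array(forecasts)).mean(axis=0))
    return AggregationEntry(timestamp=0.0, pmf=gm)


def peer_score(
    pmf: list[float],
    resolution_bucket: int,
    forecasts: list[list[float]],
    geometric_means: AggregationEntry | None = None,
) -> float:
    # Reuse the caller's precomputed aggregate when given; fall back to
    # recomputing it locally (the pre-patch behavior).
    gm = geometric_means or get_geometric_means(forecasts)
    n = len(forecasts)
    # Log score against the geometric mean of all forecasts; the n/(n-1)
    # factor makes this equal to scoring against everyone else's mean.
    return 100 * (n / (n - 1)) * np.log(
        pmf[resolution_bucket] / gm.pmf[resolution_bucket]
    )


forecasts = [[0.7, 0.3], [0.6, 0.4], [0.9, 0.1]]
gms = get_geometric_means(forecasts)  # computed once per question...
for pmf in forecasts:  # ...then shared by every score that needs it
    print(round(peer_score(pmf, 0, forecasts, geometric_means=gms), 2))

In the patch itself the same idea appears twice: evaluate_question computes get_geometric_means(user_forecasts) once when a PEER score is requested and threads it into both peer evaluators, and score_question now calls evaluate_question a single time with the full list of score types instead of once per type.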