267 changes: 139 additions & 128 deletions scoring/score_math.py
@@ -143,7 +143,7 @@ def evaluate_forecasts_baseline_spot_forecast(
else:
baseline = (1 - 0.05 * open_bounds_count) / (len(pmf) - 2)
forecast_score = 100 * np.log(pmf[resolution_bucket] / baseline) / 2
forecast_scores.append(ForecastScore(forecast_score))
forecast_scores.append(ForecastScore(forecast_score, 1.0))
else:
forecast_scores.append(ForecastScore(0))
return forecast_scores
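
Both spot hunks change `ForecastScore(forecast_score)` to `ForecastScore(forecast_score, 1.0)`, so spot scores now carry an explicit coverage of 1.0. The `ForecastScore` definition is not part of this diff; the following is a minimal sketch of what the second positional argument plausibly is, given that `evaluate_question` below filters users on `user_coverage > 0`:

```python
from dataclasses import dataclass

# Assumed shape only -- the real ForecastScore is defined elsewhere in the
# repo. The point of the change: if coverage defaulted to zero, spot scores
# would be dropped by the `if user_coverage > 0` check in evaluate_question.
@dataclass
class ForecastScore:
    score: float
    coverage: float = 0.0

spot_score = ForecastScore(12.5, 1.0)  # spot forecasts now count as full coverage
```
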
@@ -157,9 +157,10 @@ def evaluate_forecasts_peer_accuracy(
actual_close_time: float,
forecast_horizon_end: float,
question_type: str,
geometric_means: list[AggregationEntry] | None = None,
) -> list[ForecastScore]:
base_forecasts = base_forecasts or forecasts
geometric_mean_forecasts = get_geometric_means(base_forecasts)
geometric_mean_forecasts = geometric_means or get_geometric_means(base_forecasts)
for gm in geometric_mean_forecasts:
gm.timestamp = max(gm.timestamp, forecast_horizon_start)
total_duration = forecast_horizon_end - forecast_horizon_start
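
The new optional `geometric_means` parameter lets a caller that needs several peer-style scores compute the recency-weighted geometric means once and pass them in; `geometric_means or get_geometric_means(base_forecasts)` falls back to the old behavior when nothing (or an empty list) is supplied. A sketch of the intended reuse, assuming the module path `scoring.score_math` from the file header; argument order follows the signatures and call sites in this diff:

```python
# Hypothetical caller: compute the geometric means once and share them
# across both peer score types instead of recomputing per call.
from scoring.score_math import (  # module path assumed from the file path
    evaluate_forecasts_peer_accuracy,
    evaluate_forecasts_peer_spot_forecast,
    get_geometric_means,
)

def score_peer_types(user_forecasts, bucket, start, close, end, q_type, spot_ts):
    gms = get_geometric_means(user_forecasts)  # computed exactly once
    peer = evaluate_forecasts_peer_accuracy(
        user_forecasts, user_forecasts, bucket,
        start, close, end, q_type,
        geometric_means=gms,
    )
    spot_peer = evaluate_forecasts_peer_spot_forecast(
        user_forecasts, user_forecasts, bucket,
        spot_ts, q_type,
        geometric_means=gms,
    )
    return peer, spot_peer
```
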
@@ -214,9 +215,10 @@ def evaluate_forecasts_peer_spot_forecast(
resolution_bucket: int,
spot_forecast_timestamp: float,
question_type: str,
geometric_means: list[AggregationEntry] | None = None,
) -> list[ForecastScore]:
base_forecasts = base_forecasts or forecasts
geometric_mean_forecasts = get_geometric_means(base_forecasts)
geometric_mean_forecasts = geometric_means or get_geometric_means(base_forecasts)
g = None
for gm in geometric_mean_forecasts[::-1]:
if gm.timestamp < spot_forecast_timestamp:
@@ -240,7 +242,7 @@ def evaluate_forecasts_peer_spot_forecast(
)
if question_type in ["numeric", "date"]:
forecast_score /= 2
forecast_scores.append(ForecastScore(forecast_score))
forecast_scores.append(ForecastScore(forecast_score, 1.0))
else:
forecast_scores.append(ForecastScore(0))
return forecast_scores
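
The spot variant gains the same `geometric_means` override. For context, the loop above it picks `g` as the most recent aggregate strictly before `spot_forecast_timestamp` by walking the list in reverse; the same lookup in isolation, with an assumed minimal entry type standing in for `AggregationEntry`:

```python
from dataclasses import dataclass, field

@dataclass
class Entry:  # stand-in for AggregationEntry; real fields are assumed
    timestamp: float
    forecast_values: list[float] = field(default_factory=list)

def latest_before(entries: list[Entry], ts: float) -> Entry | None:
    """Most recent entry with timestamp strictly before ts (entries ascending)."""
    for e in reversed(entries):
        if e.timestamp < ts:
            return e
    return None
```
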
@@ -305,7 +307,7 @@ def evaluate_forecasts_legacy_relative(
def evaluate_question(
question: Question,
resolution_bucket: int,
score_type: Score.ScoreTypes,
score_types: list[Score.ScoreTypes],
spot_forecast_timestamp: float | None = None,
) -> list[Score]:
forecast_horizon_start = question.open_time.timestamp()
@@ -318,133 +320,142 @@ def evaluate_question(
minimize=False,
aggregation_method=AggregationMethod.RECENCY_WEIGHTED,
)
geometric_means: list[AggregationEntry] = []

score_types = Score.ScoreTypes
match score_type:
case score_types.BASELINE:
open_bounds_count = bool(question.open_upper_bound) + bool(
question.open_lower_bound
)
user_scores = evaluate_forecasts_baseline_accuracy(
user_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
question.type,
open_bounds_count,
)
community_scores = evaluate_forecasts_baseline_accuracy(
community_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
question.type,
open_bounds_count,
)
case score_types.SPOT_BASELINE:
open_bounds_count = bool(question.open_upper_bound) + bool(
question.open_lower_bound
)
user_scores = evaluate_forecasts_baseline_spot_forecast(
user_forecasts,
resolution_bucket,
spot_forecast_timestamp,
question.type,
open_bounds_count,
)
community_scores = evaluate_forecasts_baseline_spot_forecast(
community_forecasts,
resolution_bucket,
spot_forecast_timestamp,
question.type,
open_bounds_count,
)
case score_types.PEER:
user_scores = evaluate_forecasts_peer_accuracy(
user_forecasts,
user_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
question.type,
)
community_scores = evaluate_forecasts_peer_accuracy(
community_forecasts,
user_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
question.type,
)
case score_types.SPOT_PEER:
user_scores = evaluate_forecasts_peer_spot_forecast(
user_forecasts,
user_forecasts,
resolution_bucket,
spot_forecast_timestamp,
question.type,
)
community_scores = evaluate_forecasts_peer_spot_forecast(
community_forecasts,
user_forecasts,
resolution_bucket,
spot_forecast_timestamp,
question.type,
)
case score_types.RELATIVE_LEGACY:
user_scores = evaluate_forecasts_legacy_relative(
user_forecasts,
community_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
)
community_scores = evaluate_forecasts_legacy_relative(
community_forecasts,
community_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
)
case other:
raise NotImplementedError(f"Score type {other} not implemented")
ScoreTypes = Score.ScoreTypes
if ScoreTypes.PEER in score_types:
geometric_means = get_geometric_means(user_forecasts)

scores: list[Score] = []
users = {forecast.author for forecast in user_forecasts}
for user in users:
user_score = 0
user_coverage = 0
for forecast, score in zip(user_forecasts, user_scores):
if forecast.author == user:
user_score += score.score
user_coverage += score.coverage
if user_coverage > 0:
scores.append(
Score(
user=user,
score=user_score,
coverage=user_coverage,
score_type=score_type,
for score_type in score_types:
match score_type:
case ScoreTypes.BASELINE:
open_bounds_count = bool(question.open_upper_bound) + bool(
question.open_lower_bound
)
user_scores = evaluate_forecasts_baseline_accuracy(
user_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
question.type,
open_bounds_count,
)
community_scores = evaluate_forecasts_baseline_accuracy(
community_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
question.type,
open_bounds_count,
)
case ScoreTypes.SPOT_BASELINE:
open_bounds_count = bool(question.open_upper_bound) + bool(
question.open_lower_bound
)
user_scores = evaluate_forecasts_baseline_spot_forecast(
user_forecasts,
resolution_bucket,
spot_forecast_timestamp,
question.type,
open_bounds_count,
)
community_scores = evaluate_forecasts_baseline_spot_forecast(
community_forecasts,
resolution_bucket,
spot_forecast_timestamp,
question.type,
open_bounds_count,
)
case ScoreTypes.PEER:
user_scores = evaluate_forecasts_peer_accuracy(
user_forecasts,
user_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
question.type,
geometric_means=geometric_means,
)
community_scores = evaluate_forecasts_peer_accuracy(
community_forecasts,
user_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
question.type,
geometric_means=geometric_means,
)
case ScoreTypes.SPOT_PEER:
user_scores = evaluate_forecasts_peer_spot_forecast(
user_forecasts,
user_forecasts,
resolution_bucket,
spot_forecast_timestamp,
question.type,
geometric_means=geometric_means,
)
community_scores = evaluate_forecasts_peer_spot_forecast(
community_forecasts,
user_forecasts,
resolution_bucket,
spot_forecast_timestamp,
question.type,
geometric_means=geometric_means,
)
case ScoreTypes.RELATIVE_LEGACY:
user_scores = evaluate_forecasts_legacy_relative(
user_forecasts,
community_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
)
community_scores = evaluate_forecasts_legacy_relative(
community_forecasts,
community_forecasts,
resolution_bucket,
forecast_horizon_start,
actual_close_time,
forecast_horizon_end,
)
case other:
raise NotImplementedError(f"Score type {other} not implemented")

users = {forecast.author for forecast in user_forecasts}
for user in users:
user_score = 0
user_coverage = 0
for forecast, score in zip(user_forecasts, user_scores):
if forecast.author == user:
user_score += score.score
user_coverage += score.coverage
if user_coverage > 0:
scores.append(
Score(
user=user,
score=user_score,
coverage=user_coverage,
score_type=score_type,
)
)
community_score = 0
community_coverage = 0
for score in community_scores:
community_score += score.score
community_coverage += score.coverage
scores.append(
Score(
user=None,
aggregation_method=AggregationMethod.RECENCY_WEIGHTED,
score=community_score,
coverage=community_coverage,
score_type=score_type,
)
community_score = 0
community_coverage = 0
for score in community_scores:
community_score += score.score
community_coverage += score.coverage
scores.append(
Score(
user=None,
aggregation_method=AggregationMethod.RECENCY_WEIGHTED,
score=community_score,
coverage=community_coverage,
score_type=score_type,
)
)
return scores
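
Net effect of the refactor: `evaluate_question` now takes a list of score types, precomputes the geometric means once when a peer score is requested, and emits user and community `Score` rows for every type in a single pass. A hedged call-site sketch (enum members from the diff; `cp_reveal_ts` is a placeholder timestamp):

```python
# Hypothetical call site -- one call now yields Score rows for all types.
scores = evaluate_question(
    question,
    resolution_bucket,
    [Score.ScoreTypes.BASELINE, Score.ScoreTypes.PEER, Score.ScoreTypes.SPOT_PEER],
    spot_forecast_timestamp=cp_reveal_ts,  # e.g. question.cp_reveal_time.timestamp()
)
```
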
58 changes: 30 additions & 28 deletions scoring/utils.py
@@ -22,39 +22,41 @@
def score_question(
question: Question,
resolution: str,
spot_forecast_time: datetime | None = None,
spot_forecast_time: float | None = None,
score_types: list[str] | None = None,
):
resolution_bucket = string_location_to_bucket_index(resolution, question)
spot_forecast_time = spot_forecast_time or question.cp_reveal_time.timestamp()
score_types = score_types or Score.ScoreTypes.choices
for score_type in score_types:
seen = set()
previous_scores = list(
Score.objects.filter(question=question, score_type=score_type)
)
new_scores = evaluate_question(
question, resolution_bucket, score_type, spot_forecast_time
)
for new_score in new_scores:
is_new = True
for previous_score in previous_scores:
if (previous_score.user == new_score.user) and (
previous_score.aggregation_method == new_score.aggregation_method
):
is_new = False
previous_score.score = new_score.score
previous_score.coverage = new_score.coverage
previous_score.edited_at = question.resolution_set_time
previous_score.save()
seen.add(previous_score)
break
if is_new:
new_score.question = question
new_score.edited_at = question.resolution_set_time
new_score.save()
seen = set()
previous_scores = list(
Score.objects.filter(question=question, score_type__in=score_types)
)
new_scores = evaluate_question(
question, resolution_bucket, score_types, spot_forecast_time
)
for new_score in new_scores:
is_new = True
for previous_score in previous_scores:
if (
(previous_score.user == new_score.user)
and (previous_score.aggregation_method == new_score.aggregation_method)
and (previous_score.score_type == new_score.score_type)
):
is_new = False
previous_score.score = new_score.score
previous_score.coverage = new_score.coverage
previous_score.edited_at = question.resolution_set_time
previous_score.save()
seen.add(previous_score)
break
if is_new:
new_score.question = question
new_score.edited_at = question.resolution_set_time
new_score.save()
for previous_score in previous_scores:
if previous_score not in seen:
previous_score.delete()
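
The rewritten `score_question` replaces the per-type loop with one reconciliation pass over all requested score types: update a previous `Score` when `(user, aggregation_method, score_type)` matches a new one, insert otherwise, and finally delete previous rows that were never matched. The same upsert-then-prune shape as a self-contained sketch (plain dicts stand in for the Django model):

```python
# Generic upsert-then-prune reconciliation; the key mirrors the match
# condition above. Rows are plain dicts here, not Score model instances.
def reconcile(previous: dict[tuple, dict], new_rows: list[dict],
              key_fields: tuple[str, ...]) -> None:
    seen: set[tuple] = set()
    for row in new_rows:
        k = tuple(row[f] for f in key_fields)
        seen.add(k)
        if k in previous:
            previous[k].update(row)  # matched: update the stored row in place
        else:
            previous[k] = row        # unmatched: insert as a new row
    for k in list(previous):
        if k not in seen:
            del previous[k]          # stale: delete rows with no new counterpart

# usage: reconcile(db_rows, new_rows, ("user", "aggregation_method", "score_type"))
```
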


def generate_scoring_leaderboard_entries(