# Habermas Machine data and preprocessing

This Colab demonstrates how to load the data from Google Cloud Storage and walks through the basic preprocessing steps that were applied to the data used in the paper:

Tessler, M. H., Bakker, M. A., Jarrett, D., Sheahan, H., Chadwick, M. J., Koster, R., Evans, G., Campbell-Gillingham, J., Collins, T., Parkes, D. C., Botvinick, M., and Summerfield, C. "AI can help humans find common ground in democratic deliberation." *Science*. (2024).

# Setup and Importing Packages


In [None]:
# Clone github repo locally.
!git clone https://github.com/google-deepmind/habermas_machine

# Adjust path.
import sys
sys.path.insert(0,'/content/habermas_machine')

In [None]:
# Imports
import ast
import io
import requests

import pandas as pd

from IPython.display import clear_output

# Local imports
from analysis import live_loading, serialise, types

# Load helper keys used with dataframes.
# DFKeys is bound to the object itself (no call); its attributes are read
# directly in later cells, e.g. DFKeys.COMPARISON_VERSION.
DFKeys = serialise.SerialisedComparisonKeys
# DFGroupedKeys is the result of calling GroupedSerialisedComparisonKeys().
# NOTE(review): the call/no-call asymmetry with DFKeys is presumably
# intentional -- confirm against the `serialise` module.
DFGroupedKeys = serialise.GroupedSerialisedComparisonKeys()

In [None]:
#@title Load all comparison data from Google Cloud Storage.
comparison_data_location = (
    'https://storage.googleapis.com/habermas_machine/datasets/hm_all_candidate_comparisons.parquet'
)
# Without a timeout `requests` can wait indefinitely on a stalled connection.
# The value bounds connect/read waits, not the total download time.
response = requests.get(comparison_data_location, timeout=60)
# Fail here with a clear HTTP error instead of handing an error page to the
# parquet reader, which would surface as a confusing parse failure.
response.raise_for_status()
with io.BytesIO(response.content) as f:
  df_all = pd.read_parquet(f)
clear_output()

df_all.shape # Shape of full comparison data frame.

In [None]:
#@title Explore data

# Distinct participant ids over the whole, not yet preprocessed, frame.
n_participant_sessions = df_all[DFKeys.COMPARISON_PARTICIPANT_ID].nunique()
print("Number of participant sessions (before pre-processing):",
      n_participant_sessions)

# Keep one row per (collection version, participant id) pair, then count the
# remaining rows for every collection version.
version_participant_pairs = df_all[
    [DFKeys.COMPARISON_VERSION, DFKeys.COMPARISON_PARTICIPANT_ID]
].drop_duplicates()
sessions_per_collection = version_participant_pairs.groupby(
    DFKeys.COMPARISON_VERSION
).count()
print(
    "Number of participant sessions (before pre-processing) of each collection:",
    sessions_per_collection
)

# Example preprocessing

In [None]:
#@title Select dataset

dataset_name = 'training' # @param ["training", "cohort1_ablation_iid_v1", "cohort2_ablation_iid_v2", "cohort3_ablation_ood_v1", 'cohort4_critique_exclusion', 'cohort5_opinion_exposure', 'cohort6_human_mediator', 'virtual_citizens_assembly']

# Set dataset and parameters based on dataset_name.
# Every branch defines the same four names, which the pre-processing cell reads:
#   df: rows of df_all belonging to the selected collection version(s).
#   min_size_parameters: GroupMinSizeParameters member, or None to skip the
#     min-group-size filter.
#   remove_groups_with_repeat_participants: whether to drop such groups.
#   valid_candidate_provenances: tuple of accepted candidate provenances.
if dataset_name == 'training':
  df = df_all[
      df_all[DFKeys.COMPARISON_VERSION].isin([
          'TRAINING_DATA_V1',
          'TRAINING_DATA_V2',
          'TRAINING_DATA_V3',
          'TRAINING_DATA_V4',
          'TRAINING_DATA_V5',
      ])
  ]
  # Backwards incompatibility issue with training data.
  df = df.drop(columns=[
      DFKeys.CANDIDATES_ALL_REWARD_DATA_WELFARE_OR_RANK,
      DFKeys.CANDIDATES_REWARD_DATA_WELFARE_OR_RANK,
  ])
  min_size_parameters = None
  remove_groups_with_repeat_participants = False
  valid_candidate_provenances = (types.ResponseProvenance.MODEL_MEDIATOR,)
elif dataset_name == 'cohort1_ablation_iid_v1':
  df = df_all[df_all[DFKeys.COMPARISON_VERSION] == 'EVAL_COHORT1_ABLATION_IID_V1']
  min_size_parameters = live_loading.GroupMinSizeParameters.ITERATION_EVAL_ABLATION_IID_V1
  remove_groups_with_repeat_participants = True
  valid_candidate_provenances = (types.ResponseProvenance.MODEL_MEDIATOR,)
elif dataset_name == 'cohort2_ablation_iid_v2':
  df = df_all[df_all[DFKeys.COMPARISON_VERSION] == 'EVAL_COHORT2_ABLATION_IID_V2']
  min_size_parameters = live_loading.GroupMinSizeParameters.ITERATION_EVAL_ABLATION_IID_V2
  remove_groups_with_repeat_participants = True
  valid_candidate_provenances = (types.ResponseProvenance.MODEL_MEDIATOR,)
elif dataset_name == 'cohort3_ablation_ood_v1':
  df = df_all[df_all[DFKeys.COMPARISON_VERSION] == 'EVAL_COHORT3_ABLATION_OOD_V1']
  min_size_parameters = live_loading.GroupMinSizeParameters.ITERATION_EVAL_ABLATION_OOD_V1
  remove_groups_with_repeat_participants = True
  valid_candidate_provenances = (types.ResponseProvenance.MODEL_MEDIATOR,)
elif dataset_name == 'cohort4_critique_exclusion':
  df = df_all[df_all[DFKeys.COMPARISON_VERSION] == 'EVAL_COHORT4_CRITIQUE_EXCLUSION']
  min_size_parameters = live_loading.GroupMinSizeParameters.ITERATION_EVAL_CRITIQUE_EXCLUSION
  remove_groups_with_repeat_participants = True
  valid_candidate_provenances = (types.ResponseProvenance.MODEL_MEDIATOR,)
elif dataset_name == 'cohort5_opinion_exposure':
  df = df_all[df_all[DFKeys.COMPARISON_VERSION] == 'EVAL_COHORT5_OPINION_EXPOSURE']
  min_size_parameters = live_loading.GroupMinSizeParameters.ITERATION_EVAL_OPINION_EXPOSURE
  remove_groups_with_repeat_participants = False
  valid_candidate_provenances = (
      types.ResponseProvenance.HUMAN_CITIZEN, # Candidates are other opinions.
  )
elif dataset_name == 'cohort6_human_mediator':
  df = df_all[df_all[DFKeys.COMPARISON_VERSION] == 'EVAL_COHORT6_HUMAN_MEDIATOR']
  min_size_parameters = live_loading.GroupMinSizeParameters.ITERATION_EVAL_HUMAN_MEDIATOR
  remove_groups_with_repeat_participants = False
  # Candidates can be either model or human statements.
  valid_candidate_provenances = (
      types.ResponseProvenance.MODEL_MEDIATOR,
      types.ResponseProvenance.HUMAN_MEDIATOR,
  )
elif dataset_name == 'virtual_citizens_assembly':
  df = df_all[df_all[DFKeys.COMPARISON_VERSION].isin([
      'EVAL_VIRTUAL_CITIZENS_ASSEMBLY_WEEK3',
      'EVAL_VIRTUAL_CITIZENS_ASSEMBLY_WEEK4',
      'EVAL_VIRTUAL_CITIZENS_ASSEMBLY_WEEK5'
  ])]
  min_size_parameters = None
  remove_groups_with_repeat_participants = False
  valid_candidate_provenances = (types.ResponseProvenance.MODEL_MEDIATOR,)
else:
  # Without this branch an unrecognised name would leave `df` and the
  # parameters undefined, or silently stale from a previous run of this cell.
  raise ValueError(f'Unknown dataset_name: {dataset_name!r}')

In [None]:
#@title Example pre-processing

def _report_shape(label, frame):
  """Prints `label` followed by the shape of `frame`."""
  print(label, frame.shape)


print('processing', dataset_name)
_report_shape('original df shape', df)
grouped_columns_to_check = [
    DFGroupedKeys.OTHER_OPINIONS,
    DFGroupedKeys.CANDIDATES,
]
live_loading.check_consistent_tuple_lengths_in_grouped_columns(
    df, groups_columns=grouped_columns_to_check)

# Step 1: unnest the nested columns (e.g., ratings of statements).
df_unnested = live_loading.unnest_nested_columns(df)
_report_shape('unnested df shape', df_unnested)

# Step 2: keep only rows whose OWN_OPINION provenance is HUMAN_CITIZEN
# (this drops e.g. MOCKs).
own_opinion_provenances = (types.ResponseProvenance.HUMAN_CITIZEN,)
df_unnested = live_loading.filter_on_response_provenances(
    df_unnested,
    provenance_column=DFKeys.OWN_OPINION_PROVENANCE,
    valid_provenances=own_opinion_provenances,
)
_report_shape('filtered df shape after removing invalid opinions', df_unnested)

# Step 3: keep only rows whose CANDIDATES_PROVENANCE is one of the expected
# values: MODEL_MEDIATOR for most data sets, but HUMAN_CITIZEN for opinion
# exposure and additionally HUMAN_MEDIATOR for the human mediator comparison.
df_unnested = live_loading.filter_on_response_provenances(
    df_unnested,
    provenance_column=DFKeys.CANDIDATES_PROVENANCE,
    valid_provenances=valid_candidate_provenances,
)
_report_shape('filtered df shape after removing invalid candidates', df_unnested)

# Step 4: drop mock ratings.
df_unnested = live_loading.filter_out_mock_ratings(
    df_unnested,
    rating_type=live_loading.RatingTypes.AGREEMENT,
)
_report_shape('filtered df shape after removing mock ratings', df_unnested)

# Step 5: drop mock rankings.
df_unnested = live_loading.filter_out_mock_rankings(df_unnested)
_report_shape('filtered df shape after removing mock rankings', df_unnested)

# Step 6: add a column with the numerical equivalents for the Likerts.
df_unnested = live_loading.add_numerical_ratings(df_unnested)
_report_shape('added numerical ratings df shape', df_unnested)

# Step 7 (optional): drop groups with repeat participants. Whether this runs is
# decided by `remove_groups_with_repeat_participants`, which is set per dataset
# in the dataset-selection cell.
if remove_groups_with_repeat_participants:
  df_unnested = live_loading.filter_groups_with_repeat_participants(
      df_unnested, 'worker_id')
  _report_shape(
      'filtered df after removing groups with repeat participants', df_unnested)

# Step 8: renest the previously unnested columns.
df_nested = live_loading.nest_columns_as_tuples(df_unnested)
_report_shape('renested df shape', df_nested)

# Step 9 (human mediator data only): keep just the rounds in which both the
# human and the model generated a statement, i.e. two candidate provenances.
if dataset_name == 'cohort6_human_mediator':
  n_candidates_per_round = df_nested[DFKeys.CANDIDATES_PROVENANCE].apply(len)
  df_nested = df_nested[n_candidates_per_round == 2]
  _report_shape(
      'only keeping rounds where both human and model made statement',
      df_nested)

# Step 10 (optional): filter by number of groups of min size (pre-registration
# criteria). Note, this should be applied to only a single evaluation dataset
# and not multiple datasets at the same time.
if min_size_parameters is not None:
  df_nested = live_loading.filter_by_number_of_groups_of_min_size(
      df_nested, **min_size_parameters.value)
  _report_shape(
      'filtered df after setting number of groups of min size', df_nested)

n_groups = df_nested[DFKeys.LAUNCH_ID].nunique()
print('Number of groups in preprocessed dataframe:', n_groups)

```
Copyright 2024 DeepMind Technologies Limited.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
```

