In [4]:
import os

import jsonlines
from sklearn.metrics import f1_score, accuracy_score

### Baseline - 1 : Human Assessment

The dataset is heavily imbalanced: fewer than 5% of messages are annotated as lies. This first baseline reproduces the human baseline from the ACL 2020 Diplomacy paper and follows its methodology. Here, the sender’s intended truthfulness labels are treated as the ground truth and the receiver’s perceived truthfulness labels as the predictions; messages for which the receiver gave no valid annotation are excluded. The results obtained from our implementation replicate those reported in the paper.

In [None]:
class Preprocessing_class:
    """Load a JSON Lines dialog file and derive aligned sender/receiver labels.

    Every record read from the file is treated as a dialog dict holding
    parallel lists under the keys 'messages', 'sender_labels' and
    'receiver_labels'. Dialogs are flattened into one dict per message,
    messages without usable annotations are dropped, and the remaining
    annotations are encoded as integers (0 = truthful, 1 = lie).
    """

    def __init__(self, file_path):
        """Store the file location; nothing is read until `load_data` runs."""
        self.file_path = file_path
        # List of dialog records, filled by load_data().
        self.data = None
        # Flat list of per-message dicts, filled by aggregate_data().
        self.aggregated_messages = None

    def load_data(self):
        """Read all records of the JSON Lines file into `self.data` and return them."""
        with jsonlines.open(self.file_path, 'r') as reader:
            self.data = list(reader)
        return self.data

    @staticmethod
    def convert_to_bool(label):
        """Convert an annotation to a bool.

        Booleans are returned unchanged. Any other value is expected to be a
        string: it maps to True when it equals 'true' ignoring case and to
        False otherwise.

        Raises:
            AttributeError: if `label` is not a bool and has no `.lower()`.
        """
        if isinstance(label, bool):
            return label
        return label.lower() == 'true'

    @staticmethod
    def is_valid_label(label):
        """Return True when `label` is a usable truthfulness annotation.

        Valid labels are real booleans and the strings 'true' / 'false' in any
        letter case, i.e. exactly the values `convert_to_bool` can interpret
        faithfully. The type is checked explicitly because `1 == True` and
        `0 == False` in Python, so a plain membership test would also accept
        numbers that `convert_to_bool` cannot handle.
        """
        if isinstance(label, bool):
            return True
        return isinstance(label, str) and label.lower() in ('true', 'false')

    def process_single_dialog(self, dialog):
        """Flatten one dialog into a list of per-message dicts.

        Each returned dict has the keys 'message', 'sender_annotation' and
        'receiver_annotation'. Missing keys in `dialog` are treated as empty
        lists.

        Raises:
            IndexError: if a label list is shorter than the message list.
        """
        messages = dialog.get('messages', [])
        senders = dialog.get('sender_labels', [])
        receivers = dialog.get('receiver_labels', [])
        return [
            {'message': msg, 'sender_annotation': senders[i], 'receiver_annotation': receivers[i]}
            for i, msg in enumerate(messages)
        ]

    def _aggregate_dialogs(self):
        """Return the per-message dicts of every dialog, loading the file if needed."""
        if self.data is None:
            self.load_data()
        return [msg for dialog in self.data for msg in self.process_single_dialog(dialog)]

    def aggregate_data(self):
        """Flatten all dialogs, cache the result on the instance and return it."""
        self.aggregated_messages = self._aggregate_dialogs()
        return self.aggregated_messages

    def filter_valid_messages(self):
        """Return the messages whose sender AND receiver annotations are valid.

        Both annotations of every kept message are converted to bool. Checking
        the sender label as well prevents an unusable value from either
        raising inside `convert_to_bool` or being silently coerced to False.
        """
        if self.aggregated_messages is None:
            self.aggregate_data()
        return [
            {
                **msg,
                'sender_annotation': self.convert_to_bool(msg['sender_annotation']),
                'receiver_annotation': self.convert_to_bool(msg['receiver_annotation']),
            }
            for msg in self.aggregated_messages
            if self.is_valid_label(msg['sender_annotation'])
            and self.is_valid_label(msg['receiver_annotation'])
        ]

    def get_labels(self):
        """Return `(sender_labels, receiver_labels)` as aligned lists of ints.

        A truthful annotation is encoded as 0 and a lie as 1, so the lie is
        the class labelled 1.
        """
        valid_msgs = self.filter_valid_messages()
        sender_labels = [0 if msg['sender_annotation'] else 1 for msg in valid_msgs]
        receiver_labels = [0 if msg['receiver_annotation'] else 1 for msg in valid_msgs]
        return sender_labels, receiver_labels

In [None]:
class Human_model_class:
    """Human baseline: score the receiver's judgements against the sender's intent.

    Every metric is called with `sender_labels` as its first argument and
    `receiver_labels` as its second.
    """

    def __init__(self, sender_labels, receiver_labels):
        """Store the two aligned label sequences."""
        self.sender_labels = sender_labels
        self.receiver_labels = receiver_labels

    def compute_metric(self, metric_func, **kwargs):
        """Call `metric_func(sender_labels, receiver_labels, **kwargs)` and return its result."""
        return metric_func(self.sender_labels, self.receiver_labels, **kwargs)

    # The three metrics below are regular methods rather than class-level
    # lambdas so that they have proper names in tracebacks and can be documented.

    def macro_f1(self):
        """Return the macro-averaged F1 score."""
        return self.compute_metric(f1_score, average='macro', zero_division=0)

    def lie_f1(self):
        """Return the binary F1 score with label 1 as the positive class."""
        return self.compute_metric(f1_score, pos_label=1, average='binary', zero_division=0)

    def accuracy(self):
        """Return the accuracy score."""
        return self.compute_metric(accuracy_score)

    def compute_metrics(self):
        """Return a dict mapping each metric's display name to its value."""
        metrics = {
            "Macro F1 Score": self.macro_f1(),
            "Lie F1 Score": self.lie_f1(),
            "Accuracy": self.accuracy(),
        }
        return metrics

    def label_distribution(self):
        """Return `(sender_dist, receiver_dist)`, each a dict of label -> occurrence count."""
        sender_dist = {label: self.sender_labels.count(label) for label in set(self.sender_labels)}
        receiver_dist = {label: self.receiver_labels.count(label) for label in set(self.receiver_labels)}
        return sender_dist, receiver_dist

    def print_metrics(self):
        """Print every metric, rounded to three decimals, one per line."""
        metrics = self.compute_metrics()
        formatted_metrics = {name: round(value, 3) for name, value in metrics.items()}
        print("Human Model Results : Baseline 1:")
        for name, value in formatted_metrics.items():
            # Names are left-aligned in a 15-character column so the colons line up.
            print(f"  {name:<15}: {value}")

In [7]:
# Directory holding the dataset splits. Set the DIPLOMACY_DATA_DIR environment
# variable to point somewhere else; the fallback is the directory this notebook
# was originally run against, so existing runs behave exactly as before.
DATA_DIR = os.environ.get(
    'DIPLOMACY_DATA_DIR',
    '/Users/varun/Desktop/College/sem6/NLP/Group Project/Data',
)
file_path = os.path.join(DATA_DIR, 'test.jsonl')

# Load the split, flatten dialogs into messages, then derive the integer labels.
dataset_preprocessing = Preprocessing_class(file_path)
dataset_preprocessing.load_data()
dataset_preprocessing.aggregate_data()
sender_labels, receiver_labels = dataset_preprocessing.get_labels()

# Baseline 1: score the receiver labels against the sender labels.
baseline_1 = Human_model_class(sender_labels, receiver_labels)
baseline_1.print_metrics()

Human Model Results : Baseline 1:
  Macro F1 Score : 0.581
  Lie F1 Score   : 0.226
  Accuracy       : 0.884
