# Loading and Reformatting

In [2]:
from typing import List

import pandas as pd
import torch
from torch import Tensor

# Base name shared by the input CSV and every artifact this notebook writes.
filename = 'gogi_chats'

# Raw message-level table read from the data directory.
messages_csv_path = f'data/{filename}.csv'
messages = pd.read_csv(messages_csv_path)

In [3]:
def mean_non_na(series: pd.Series) -> float:
    """Average the numeric entries of ``series``.

    Values are first converted with ``pd.to_numeric(..., errors='coerce')``,
    so anything that cannot be parsed as a number becomes NaN. NaN entries
    are then excluded before the mean is taken. When no numeric entry
    remains, the result is NaN.
    """
    numeric = pd.to_numeric(series, errors='coerce')
    present = numeric[numeric.notna()]
    return present.mean()

# Rating columns carried on the message rows; each is averaged per conversation.
eval_dimensions = ['friendliness', 'helpfulness', 'clearness', 'astuteness', 'tactfulness']
eval_agg_funcs = dict.fromkeys(eval_dimensions, mean_non_na)

# Group the message rows by conversation, keeping only the columns we aggregate.
messages_by_conversation = (
    messages[['conversation_id', 'message', *eval_dimensions]]
    .groupby('conversation_id')
)

# Per conversation: mean of each rating column, and the messages collected
# into a list in row order.
chats_aggregated = messages_by_conversation.agg({**eval_agg_funcs, 'message': list})

# Flatten the group key back into a column, rename the message list to `chat`,
# and drop conversations that lack a value for any rating column.
chats = (
    chats_aggregated
    .reset_index()
    .rename(columns={'message': 'chat'})
    .dropna(subset=eval_dimensions)
)
chats.head()

Unnamed: 0,conversation_id,friendliness,helpfulness,clearness,astuteness,tactfulness,chat
0,44481518456,3.5,3.5,4.5,1.5,4.5,[Hei ? Jeg la inn bestilling hos dere 27.12 på...
1,44481550385,2.333333,3.333333,4.333333,1.666667,2.0,[Hei! I romjula slo jeg til og kjøpte en del j...
2,44481553718,4.5,5.0,5.0,1.5,5.0,[Hei. Jeg trenger å bytte størrelse på denne m...
3,44481559213,4.5,4.5,5.0,1.0,4.5,"[﻿Hei, Det gjelder bestilling 1000319581 :) Se..."
4,44481564265,4.0,4.5,5.0,1.0,4.0,[Hei Jeg bestilte nettopp Kähler Hammershøi sa...


# Node Encodings

In [4]:
from transformers import BertTokenizer

# Tokenizer used by tokenize_chat below; loaded once from the
# "bert-base-uncased" pretrained checkpoint.
# NOTE(review): the sample chats displayed earlier in this notebook are
# Norwegian, while this checkpoint name suggests an English, lower-casing
# vocabulary — confirm this is the intended tokenizer for the data.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def tokenize_chat(chat: List[str]) -> List[Tensor]:
    """Tokenize each utterance of a chat into a 1-D tensor of token ids.

    Args:
        chat: The utterances of one conversation, in order.

    Returns:
        One tensor of ``input_ids`` per utterance, in the same order as
        ``chat``. Utterances are tokenized independently, so the tensors
        may have different lengths. Over-long utterances are truncated by
        the tokenizer (``truncation=True``).
    """
    return [
        # The tokenizer returns a (1, seq_len) batch for a single string.
        # squeeze(0) removes only that batch dimension; a bare squeeze()
        # would also collapse seq_len if it ever happened to be 1.
        # No padding is requested: with one sequence per call there is
        # nothing to pad against.
        tokenizer(utterance, truncation=True, return_tensors="pt")['input_ids'].squeeze(0)
        for utterance in chat
    ]

# Tokenize every conversation in row order: one list of per-utterance
# token-id tensors per chat.
node_encodings = [tokenize_chat(utterances) for utterances in chats['chat']]

# Persist the nested list next to the source data.
node_encodings_path = f'data/{filename}_node_encodings.pt'
torch.save(node_encodings, node_encodings_path)

  from .autonotebook import tqdm as notebook_tqdm


## Edges

In [5]:
def _build_chat_edges(n_utterances: int):
    """Build the directed edges of a fully connected utterance graph.

    Every ordered pair ``(ui, uj)`` with ``ui != uj`` becomes one edge,
    described by three boolean flags::

        [ui > uj, ui is a human turn, uj is a human turn]

    Even utterance indices are treated as the human turns.
    NOTE(review): this assumes the speakers strictly alternate and the
    human speaks first — confirm against the data.

    Args:
        n_utterances: Number of utterances (nodes) in the chat.

    Returns:
        A pair ``(edge_types, edge_index)``: a bool tensor of shape
        ``(n_edges, 3)`` and a long tensor of shape ``(2, n_edges)``.
        Both keep these 2-D shapes when the chat has fewer than two
        utterances and therefore no edges.
    """
    pairs = [
        (ui, uj)
        for ui in range(n_utterances)
        for uj in range(n_utterances)
        if ui != uj
    ]
    flags = [[ui > uj, ui % 2 == 0, uj % 2 == 0] for ui, uj in pairs]

    # Explicit shapes keep the tensors 2-D when `pairs` is empty
    # (reshape(-1, ...) is ambiguous for zero elements).
    edge_types = torch.tensor(flags, dtype=torch.bool).reshape(len(pairs), 3)
    # Build the indices directly as integers instead of going through float32.
    edge_index = torch.tensor(pairs, dtype=torch.long).reshape(len(pairs), 2).T
    return edge_types, edge_index


edges = []
edge_idxs = []
# Cover every chat (not only the first row) so the saved edges line up
# one-to-one with the node encodings and labels saved by this notebook.
for chat in chats['chat']:
    chat_edges, chat_edge_idxs = _build_chat_edges(len(chat))
    edges.append(chat_edges)
    edge_idxs.append(chat_edge_idxs)

torch.save(edges, f'data/{filename}_edges.pt')
torch.save(edge_idxs, f'data/{filename}_edge_idxs.pt')

## Labels

In [10]:
# One row per chat, one column per evaluation dimension, stored as float32.
label_matrix = chats[eval_dimensions].to_numpy()
labels = torch.as_tensor(label_matrix, dtype=torch.float32)
torch.save(labels, f'data/{filename}_labels.pt')