In [32]:
# Parameters
x_encoding = [
    {"encoding": "ordinal", "column": "prev_contribution"},
    {"encoding": "ordinal", "column": "prev_punishment"},
]
u_encoding = [
    {"encoding": "numeric", "column": "prev_common_good"}
]
y_encoding = [
    {"encoding": "ordinal", "column": "contribution"}
]
n_contributions = 21
n_punishments = 31
n_cross_val = 2
fraction_training = 1.0
data = "../../data/experiments/pilot1_player_round_slim.csv"
output_path = "../../data/training/dev"
labels = {}
model_args = {"n_units": 40}
optimizer_args = {"lr": 0.0001, "weight_decay": 1e-05}
train_args = {"epochs": 100, "batch_size": 40, "clamp_grad": 1, "eval_period": 10}
device = "cpu"


In [33]:
%load_ext autoreload
%autoreload 2

import os
import numpy as np
import pandas as pd
import torch as th
from aimanager.artificial_humans.cross_validation import split_xy, get_cross_validations, get_fraction_of_groups
from aimanager.generic.encoder import ordinal_to_int, onehot_to_int, joined_encoder, int_encode
from aimanager.artificial_humans.metrics import create_metrics, create_confusion_matrix
from aimanager.artificial_humans.synthesize_data import syn_con_pun
from aimanager.utils.array_to_df import add_labels, using_multiindex
from aimanager.utils.utils import make_dir
from aimanager.generic.mlp import MultiLayer

output_path = os.path.join(output_path, 'data')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [34]:
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
th_device = th.device(device)

def create_fully_connected(n_nodes):
    return th.tensor([[i,j]
        for i in range(n_nodes)
        for j in range(n_nodes)
    ])
n_rounds = 8
n_player = 4


df = pd.read_csv(data)

df['contribution'] = pd.Categorical(
    df['contribution'], categories=np.arange(n_contributions), ordered=True
)
df['punishment'] = pd.Categorical(
    df['punishment'], categories=np.arange(n_punishments), ordered=True
)

index = ['session', 'global_group_id', 'episode', 'participant_code', 'round_number']

eposide_data_count = df.groupby(by=index[0:3])['contribution'].transform('count')
max_eposide_data_count = eposide_data_count.max()
w = eposide_data_count != max_eposide_data_count
print(f'We found a maximum of {max_eposide_data_count} data points per episode.')
print(f'Remove incomplete episodes. {w.sum()} of {w.count()} data points are going to be removed.')
df = df[~w]

df = df.set_index(index).sort_index()
prev_df = df.groupby(index[:-1]).shift(1)
rename = {
    c: 'prev_' + c
    for c in prev_df.columns
}
prev_df = prev_df.rename(columns=rename)
df = df.join(prev_df)
df = df[~df['prev_contribution'].isnull()]


n_rounds = n_rounds - 1
n_episodes = 2
n_episodes_total = len(df) // (n_player*n_rounds)
n_groups = n_episodes_total // n_episodes

x = th.tensor(joined_encoder(df, x_encoding), dtype=th.float, device=th_device).reshape(n_episodes_total,n_rounds,n_player,-1)
u = th.tensor(joined_encoder(df, u_encoding), dtype=th.float, device=th_device).reshape(n_episodes_total,n_rounds,n_player,-1)[:,:,0,:]
y = th.tensor(joined_encoder(df, y_encoding), dtype=th.float, device=th_device).reshape(n_episodes_total,n_rounds,n_player,-1)
edge_attr = th.zeros(n_player*n_player,0)
edge_index = create_fully_connected(n_player)
group_idx = [
    g
    for g in range(n_groups)
    for e in range(n_episodes)
]

dataset = [
    Data(x=x[i],u=u[i],y=y[i], edge_attr=edge_attr, edge_index=edge_index, idx=i, group_idx=group_idx[i])
    for i in range(n_episodes_total)
]

We found a maximum of 32 data points per episode.
Remove incomplete episodes. 84 of 3092 data points are going to be removed.


In [35]:
for i,d in enumerate(iter(DataLoader(dataset, shuffle=True))):
    print(d.idx)
    if i > 10000:
        break
print(i)

tensor([83])
tensor([24])
tensor([65])
tensor([56])
tensor([50])
tensor([60])
tensor([20])
tensor([1])
tensor([40])
tensor([23])
tensor([90])
tensor([6])
tensor([9])
tensor([77])
tensor([25])
tensor([13])
tensor([18])
tensor([76])
tensor([62])
tensor([59])
tensor([57])
tensor([41])
tensor([51])
tensor([93])
tensor([30])
tensor([84])
tensor([71])
tensor([38])
tensor([37])
tensor([91])
tensor([33])
tensor([64])
tensor([46])
tensor([49])
tensor([32])
tensor([0])
tensor([44])
tensor([78])
tensor([69])
tensor([81])
tensor([89])
tensor([68])
tensor([73])
tensor([26])
tensor([43])
tensor([75])
tensor([72])
tensor([58])
tensor([53])
tensor([28])
tensor([79])
tensor([35])
tensor([80])
tensor([47])
tensor([70])
tensor([34])
tensor([63])
tensor([15])
tensor([2])
tensor([11])
tensor([22])
tensor([39])
tensor([54])
tensor([82])
tensor([4])
tensor([61])
tensor([10])
tensor([21])
tensor([86])
tensor([8])
tensor([12])
tensor([67])
tensor([5])
tensor([92])
tensor([14])
tensor([42])
tensor([17])
tensor(

In [36]:
import random

def get_cross_validations(dataset, n_splits):
    group_ids = list(set(d.group_idx for d in dataset))
    random.shuffle(group_ids)
    groups = [group_ids[i::n_splits] for i in range(n_splits)]

    for i in range(n_splits):
        test_groups = groups[i]
        test_dataset = [d for d in dataset if d.group_idx in test_groups]
        train_dataset = [d for d in dataset if not d.group_idx in test_groups]
        yield train_dataset, test_dataset

In [37]:
from torch.nn import Sequential as Seq, Linear as Lin, Tanh
import torch as th
from torch_scatter import scatter_mean
from torch_geometric.nn import MetaLayer

class EdgeModel(th.nn.Module):
    def __init__(self, x_features, edge_features, u_features, out_features, n_rounds):
        super().__init__()
        in_features = 2*x_features+edge_features+u_features
        self.edge_mlp = Seq(Lin(in_features=in_features, out_features=out_features), Tanh())

    def forward(self, src, dest, edge_attr, u, batch):
        # src, dest: [E, F_x], where E is the number of edges.
        # edge_attr: [E, F_e]
        # u: [B, F_u], where B is the number of graphs.
        # batch: [E] with max entry B - 1.
        out = th.cat([src, dest, edge_attr, u[batch]], 1)
        return self.edge_mlp(out)


class NodeModel(th.nn.Module):
    def __init__(self, x_features, edge_features, u_features, out_features, n_rounds):
        super().__init__()
        in_features = x_features+edge_features+u_features
        self.node_mlp = Seq(Lin(in_features=in_features, out_features=out_features), Tanh())

    def forward(self, x, edge_index, edge_attr, u, batch):
        # x: [N, F_x], where N is the number of nodes.
        # edge_index: [2, E] with max entry N - 1.
        # edge_attr: [E, F_e]
        # u: [B, F_u]
        # batch: [N] with max entry B - 1.
        row, col = edge_index
        out = scatter_mean(edge_attr, col, dim=0, dim_size=x.size(0))
        out = th.cat([x, out, u[batch]], dim=1)
        return self.node_mlp(out)

class GlobalModel(th.nn.Module):
    def __init__(self, x_features, edge_features, u_features, out_features, n_rounds):
        super().__init__()
        in_features = u_features+x_features
        self.global_mlp = Seq(Lin(in_features=in_features, out_features=out_features), Tanh())

    def forward(self, x, edge_index, edge_attr, u, batch):
        # x: [N, F_x], where N is the number of nodes.
        # edge_index: [2, E] with max entry N - 1.
        # edge_attr: [E, F_e]
        # u: [B, F_u]
        # batch: [N] with max entry B - 1.
        out = th.cat([u, scatter_mean(x, batch, dim=0)], dim=1)
        return self.global_mlp(out)

class GraphNetwork(th.nn.Module):
    def __init__(
            self, x_features, edge_features, u_features, n_rounds, n_units, out_features, y_encoding):
        super().__init__()
        self.out_features = out_features
        self.y_encoding = y_encoding
        self.op1 = MetaLayer(
            EdgeModel(
                x_features=x_features, edge_features=edge_features, 
                u_features=u_features, out_features=n_units, n_rounds=n_rounds), 
            NodeModel(
                x_features=x_features, edge_features=n_units, 
                u_features=u_features, out_features=n_units, n_rounds=n_rounds), 
            GlobalModel(
                x_features=n_units, edge_features=n_units, 
                u_features=u_features, out_features=n_units, n_rounds=n_rounds)
        )
        self.op2 = MetaLayer(
            EdgeModel(
                x_features=n_units, edge_features=n_units, 
                u_features=n_units, out_features=n_units, n_rounds=n_rounds), 
            NodeModel(
                x_features=n_units, edge_features=n_units, 
                u_features=n_units, out_features=out_features, n_rounds=n_rounds), 
            None
        )
    
    def forward(self, x, edge_index, edge_attr, u, batch):
        x, edge_attr, u = self.op1(x, edge_index, edge_attr, u, batch)
        x, edge_attr, u = self.op2(x, edge_index, edge_attr, u, batch)
        return x

    def _pred_and_proba(self, x, edge_index, edge_attr, u, batch):
        out = self(x, edge_index, edge_attr, u, batch)
        if self.encoding == 'ordinal':
            y_pred_proba = th.sigmoid(out).detach().cpu().numpy()
            y_pred = ordinal_to_int(y_pred_proba)
            y_pred_proba = np.concatenate([np.ones_like(y_pred_proba[:,[0]]), y_pred_proba[:,:]], axis=1)
        elif self.encoding == 'onehot':
            y_pred_proba = th.nn.functional.softmax(out, dim=-1).detach().cpu().numpy()
            y_pred = onehot_to_int(y_pred_proba)
        elif self.encoding == 'numeric':
            y_pred = th.sigmoid(out).detach().cpu().numpy()
            y_pred = np.around(y_pred*self.out_features, decimals=0).astype(np.int64)
            y_pred_proba = None
        return y_pred, y_pred_proba


In [38]:
def predict_dataset(model, dataset, batch_size):
    y_pred = []
    y_true = []
    y_pred_proba = []
    for batch in DataLoader(dataset, batch_size=batch_size):
        y_true.append(batch.y)
        _y_pred, _y_pred_proba = model._pred_and_proba(
            x=batch.x, edge_attr=batch.edge_attr, edge_index=batch.edge_index, u=batch.u)
        y_pred_proba.append(_y_pred_proba)
        y_pred.append(_y_pred)
    y_pred = th.cat(y_pred)
    y_true = th.cat(y_true)
    y_pred_proba = th.cat(y_pred_proba)
    return y_true, y_pred, y_pred_proba


class Evaluator:
    def __init__(self, batch_size):
        self.metrics = []
        self.confusion_matrix = []
        # self.synthetic_predicitions = []
        self.batch_size = batch_size

    def set_data(self, datasets):
        self.datasets = datasets

    def set_labels(self, **labels):
        self.labels = labels

    def eval_set(self, model, set_name):
        model.eval()
        y_true, y_pred, y_pred_proba = predict_dataset(model, self.datasets[set_name], self.batch_size)
        self.metrics += create_metrics(y_true, y_pred, set=set_name, **self.labels)
        self.confusion_matrix += create_confusion_matrix(y_true, y_pred, set=set_name, **self.labels)

    def add_loss(self, loss):
        self.metrics.append(dict(name='loss', value=loss, **self.labels))

    def save(self, output_path, labels):
        make_dir(output_path)
        self._save_metric(self.metrics, 'metrics.parquet', output_path, labels)
        self._save_metric(self.confusion_matrix, 'confusion_matrix.parquet', output_path, labels)

    @staticmethod
    def _save_metric(rec, filename, output_path, labels):
        df = pd.DataFrame(rec)
        df = add_labels(df, labels)
        df.to_parquet(os.path.join(output_path, filename))


class Loss(th.nn.Module):
    def __init__(self, encoding, n_contributions):
        super().__init__()
        self.encoding = encoding
        self.n_contributions = n_contributions
        if encoding == 'ordinal':
            self.loss_fn = th.nn.BCEWithLogitsLoss()
        elif encoding == 'onehot':
            self.loss_fn = th.nn.CrossEntropyLoss()
        elif encoding == 'numeric':
            self.loss_fn = th.nn.MSELoss()
        else:
            raise NotImplementedError(f'Unkown target encoding: {encoding}')

    def forward(self, input, target):
        if self.encoding == 'numeric':
            input = th.sigmoid(input)*self.n_contributions
        return self.loss_fn(input, target)

In [40]:
from torch_geometric.data import DataLoader

metrics = []
confusion_matrix = []
syn_pred = []
batch_size = train_args['batch_size']
x_features = dataset[0].x.shape[-1]
edge_features = dataset[0].edge_attr.shape[-1]
u_features = dataset[0].u.shape[-1]
out_features = dataset[0].y.shape[-1]
n_rounds = dataset[0].x.shape[-2]

ev = Evaluator(batch_size=batch_size)

for i, (train_dataset, test_dataset) in enumerate(get_cross_validations(dataset, n_cross_val)):
    model = GraphNetwork(
        x_features=x_features, edge_features=edge_features, 
        u_features=u_features, n_rounds=n_rounds, 
        out_features=out_features, y_encoding=y_encoding[0]['encoding'],
        **model_args).to(th_device)
    loss_fn = Loss(y_encoding[0]['encoding'], n_contributions)
    optimizer = th.optim.Adam(model.parameters(), **optimizer_args)

    sum_loss = 0
    n_steps = 0
    for e in range(n_episodes):
        loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
        for batch in loader:
            py = model(x=batch.x, edge_attr=batch.edge_attr, edge_index=batch.edge_index, u=batch.u)
            loss = loss_fn(py, batch.y)
            loss.backward()
            sum_loss += loss.item()
            n_steps +=1
            
            if e % train_args['eval_period'] == 0:
                avg_loss = sum_loss/n_steps
                print(f'CV {i} | Epoch {e} | Loss {avg_loss}')
                ev.add_loss(avg_loss)
                ev.eval_set(model, 'train')
                ev.eval_set(model, 'test')
                sum_loss = 0
                n_steps = 0

    # ev.eval_sync(model)

ev.save(output_path, labels)



TypeError: forward() missing 1 required positional argument: 'batch'