In [None]:
import json

import torch
from torch import nn
from torch import optim

import numpy as np
import pandas as pd
from umap import UMAP
from sklearn.manifold import TSNE

import matplotlib.pyplot as plt
# Apply the base ggplot look first, then overlay the colour-blind-friendly palette.
plt.style.use('ggplot')
plt.style.use('seaborn-v0_8-colorblind')
from matplotlib import cm
# `cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
# `plt.get_cmap` is the supported name-based lookup and returns the same colormap.
cmap = plt.get_cmap('tab10')

## Animals

In [None]:

# Read the attribute names; the context manager closes the file handle promptly
# instead of leaving it to the garbage collector.
with open('./dataset/animals5/attribute_names.json') as fp:
    attr_names = json.load(fp)
print('Attributes:')
display(attr_names)

# If the names arrive wrapped one level deep (each entry is itself a list),
# keep only the first element of each entry. The emptiness guard avoids an
# IndexError when the JSON file holds an empty list.
if attr_names and isinstance(attr_names[0], list):
    attr_names = [a[0] for a in attr_names]


# NOTE(review): torch.load unpickles the file -- only use it on trusted data.
attrs = torch.load('./dataset/animals5/attributes.th')
# One row per sample, one column per attribute name.
df = pd.DataFrame(attrs, columns=attr_names)

In [None]:
# Quick sanity check: (n_rows, n_columns) of the attribute table built above.
df.shape

In [None]:
%%time

x = df.to_numpy()
# UMAP is stochastic: pin the seed so the layout (which is later stored in the
# exported CSV) is reproducible after a kernel restart.
xy = UMAP(min_dist=0.5, random_state=42).fit_transform(x)
# xy = TSNE().fit_transform(x)
plt.scatter(xy[:,0], xy[:,1], s=2)


In [None]:
# np.save('xy.npy', xy)

In [None]:
# Attach the 2-D embedding coordinates and a per-row image filename
# (animal-0.jpg, animal-1.jpg, ...) to the attribute table.
df['x'], df['y'] = xy[:, 0], xy[:, 1]
df['image_filename'] = [f'animal-{i}.jpg' for i, _ in enumerate(df.index)]


In [None]:
# Write df (attributes plus the x / y / image_filename columns added above)
# to CSV in the working directory, without the index column.
df.to_csv('animals5.csv', index=False)

In [None]:
# One scatter panel per attribute, coloured by that attribute's value.
# The grid is sized from the number of attributes so that more than 15
# attributes no longer overflows a fixed 3x5 grid, and the figure height
# always matches the number of rows actually used.
n_cols = 5
n_rows = max(1, -(-len(attr_names) // n_cols))  # ceiling division
plt.figure(figsize=[4*n_cols, 4*n_rows], dpi=120)

for i, color_by in enumerate(attr_names):
    plt.subplot(n_rows, n_cols, i+1)
    plt.scatter(xy[:,0], xy[:,1], s=2, c=df[color_by])
    plt.axis('equal')
    plt.colorbar()
    plt.title(f'colored by "{color_by}"')
plt.show()


## Gait

In [None]:
df = pd.read_csv('./dataset/gait_raw.csv')

# Reshape the long table into a wide one: within every
# (subject, condition, replication) group, consecutive blocks of rows are
# placed side by side, each contributing one `leg<leg>.joint<joint>.angle` column.
# assumes every group holds N_SERIES consecutive blocks of N_SAMPLES rows,
# one block per (leg, joint) pair -- TODO confirm against the CSV
N_SERIES = 6
N_SAMPLES = 101

wide_groups = []
for group, group_df in df.groupby(['subject', 'condition', 'replication']):
#     print(group, group_df)
    dfs = []
    for i in range(N_SERIES):
        d = group_df.iloc[i*N_SAMPLES:(i+1)*N_SAMPLES].reset_index(drop=True)
        # leg / joint are read from the first row of the block and used to name the column
        leg = d['leg'].iloc[0]
        joint = d['joint'].iloc[0]
        d = d.rename(columns={
            'angle': f'leg{leg}.joint{joint}.angle'
        })
        d = d.drop(columns=['leg','joint'])
        if i>0:
            # the id / time columns are kept only from the first block
            d = d.drop(columns=['subject', 'condition', 'replication','time'])
        dfs.append(d)
    wide_groups.append(pd.concat(dfs, axis=1))

# Concatenate once at the end: growing a frame inside the loop is quadratic and
# relies on pandas' deprecated handling of empty frames in concat.
df_join = pd.concat(wide_groups, axis=0)

## keep only subjects with id <= 2
## NOTE(review): this comment used to read "choose only one replication", which
## the filter below does not do -- confirm which selection is intended
df_join = df_join[df_join['subject']<=2]
## reorder columns: positions 4..9 first, then positions 0..3
## (presumably the six angle columns followed by subject/condition/replication/time -- TODO confirm)
df_join = df_join.iloc[:, [4,5,6,7,8,9,0,1,2,3]]
df_join

In [None]:
%%time

## use only the first six columns of df_join for UMAP
## (presumably the six joint-angle columns after the reordering above -- TODO confirm)
x = df_join.iloc[:,[0,1,2,3,4,5]].to_numpy()

# UMAP is stochastic: pin the seed so the layout (which is later saved to
# dataset/gait2.csv) is reproducible after a kernel restart.
xy = UMAP(min_dist=0.5, random_state=42).fit_transform(x)
plt.scatter(xy[:,0], xy[:,1], s=2)

# t = df_join['time'].to_numpy()
# xy = UMAP(min_dist=0.5, n_components=1).fit_transform(x)
# plt.scatter(t, xy[:,0], s=2)


In [None]:
# Colour every embedded point by the `condition` column of df_join.
c = df_join['condition']
plt.scatter(xy[:,0], xy[:,1], s=2, c=c, cmap='viridis')
plt.colorbar()

In [None]:
# Plot the first embedding coordinate against `time`; the colour encodes `time` as well.
c = df_join['time']
plt.scatter(df_join['time'], xy[:,0], s=2, c=c, cmap='viridis')


In [None]:
# Attach the embedding coordinates to df_join (mutates it in place) and write
# the result to CSV without the index column.
df_join['x'] = xy[:,0]
df_join['y'] = xy[:,1]
df_join.to_csv('dataset/gait2.csv', index=False)

In [None]:
# # ## combine rows for Gait data

# # df_join = pd.DataFrame()

# # for group, group_df in df.groupby(['subject', 'condition', 'replication']):
# # #     print(group, group_df
# #     dfs = []
# #     for i in range(6):
# #         d = group_df[i*101:(i+1)*101].reset_index(drop=True)
# #         leg = d['leg'][0]
# #         joint = d['joint'][0]
# #         d = d.rename(columns={
# #             'angle': f'leg{leg}.joint{joint}.angle'
# #         })
# #         d = d.drop(columns=['leg','joint'])     
# #         if i>0:
# #             d = d.drop(columns=['subject', 'condition', 'replication','time'])
# #         dfs.append(d)
# #     d = pd.concat(dfs, axis=1)
    
# #     df_join = pd.concat([df_join, d], axis=0)

# df_join
# x0 = df_join.to_numpy()
# df_join
# # df_join.to_csv('gait_joined.csv', index=False)

In [None]:
# %%time
# xy = UMAP().fit_transform(x0[:,4:])
# np.save('xy.npy', xy)

# xy = np.load('xy.npy')

In [None]:
# Colour the embedding by one df_join column using the categorical `tab10` palette;
# swap which assignment is uncommented to colour by subject or replication instead.
# c = df_join['subject']
c = df_join['condition']
# c = df_join['replication']

plt.scatter(xy[:,0], xy[:,1], s=2, c=c, cmap='tab10')
plt.colorbar()


In [None]:
## Meuse


In [None]:
# df = pd.read_csv('./dataset/meuse.txt')
# plt.scatter(df['x'], df['y'])

## predicates deduction 0

In [None]:
# Boolean mask selecting the embedded points inside the open square (0, 5) x (0, 5).
subset = np.logical_and(
    np.logical_and(0 < xy[:,0], xy[:,0] < 5),
    np.logical_and(0 < xy[:,1], xy[:,1] < 5),
)

# All points in grey as background, then the selection on top.
plt.scatter(xy[:,0], xy[:,1], s=1, c='#666')
# `xy` (and therefore `subset`) has one entry per row of df_join, not of the raw
# `df`, so the colours must be taken from df_join; indexing the raw frame with
# this mask fails with a length mismatch. `.to_numpy()` makes the selection
# positional, independent of df_join's repeated index labels.
plt.scatter(xy[subset,0], xy[subset,1], s=2, c=df_join['replication'].to_numpy()[subset])

In [None]:
# Exponent applied to the scaled per-feature offsets in `predict`.
b = 4
def predict(x, a, mu):
    '''Soft membership score of each row of `x`.

    Computes 1 / (1 + sum_k (a_k * (x_k - mu_k)) ** b), summing over axis 1,
    with `b` read from module scope at call time.
    '''
    scaled_offset = a * (x - mu)
    penalty = scaled_offset.pow(b).sum(1)
    return 1 / (1 + penalty)



def predicate(x0, subset, n_epochs=3000, lr=1e-2):
    '''Describe a selection of rows by per-feature intervals.

    Fits the parameters `a` (per-feature scale) and `mu` (per-feature centre)
    of the module-level `predict` function so that its output matches the
    boolean selection under binary cross-entropy, then reports an interval
    `mu +/- 1/|a|` (converted back to the units of `x0`) for every feature
    whose fitted half-width is narrower than half of that feature's
    standardised range.

    Parameters
    ----------
    x0 : array-like, shape (n_samples, n_features)
        Numeric feature matrix.
    subset : boolean array-like, shape (n_samples,)
        True for the selected rows.
    n_epochs : int, optional
        Number of full-batch SGD steps (default 3000).
    lr : float, optional
        SGD learning rate (default 1e-2).

    Returns
    -------
    dict
        `{'predicates': [{'dim': k, 'interval': (low, high)}, ...]}`.

    Notes
    -----
    `a` and `mu` are initialised with `torch.randn`, so results vary between
    calls unless the torch RNG is seeded by the caller. Loss and accuracy are
    printed as a side effect.
    '''

    ## prepare training data
    # np.array copies and accepts any array-like (ndarray, DataFrame, list).
    x = torch.from_numpy(np.array(x0, dtype=np.float32))
    x_mean = x.mean(0)
    # std + 1 keeps the divisor >= 1 (presumably to avoid dividing by ~0 for
    # near-constant features); the same value is used to undo the scaling below.
    x_std = x.std(0)+1
    x = (x-x_mean)/(x_std)
    mask = np.array(subset, dtype=bool)
    label = torch.from_numpy(mask).float()

    bce = nn.BCELoss()
    a = torch.randn(x.shape[1]).requires_grad_(True)
    mu = torch.randn(x.shape[1]).requires_grad_(True)
    optimizer = optim.SGD([mu, a,], lr=lr, momentum=0.9, weight_decay=0.01)
    for e in range(n_epochs):
        pred = predict(x, a, mu)
        l = bce(pred, label)
        optimizer.zero_grad()
        l.backward()
        optimizer.step()
        if e % 500 == 0:
            print('loss', l.item())
    a.detach_()
    mu.detach_()

    # Re-evaluate with the final parameters: the `pred` left over from the loop
    # was computed before the last optimizer step (and does not exist at all
    # when n_epochs == 0).
    pred = predict(x, a, mu)

#     plt.stem(a.abs().numpy())
#     plt.show()

    # Half-width per feature in standardised units: 1/|a|.
    r = 1/a.abs()
    print(
        'accuracy',
        ((pred>0.5).float() == label).float().sum().item(),
        '/', mask.shape[0])

    predicates = []
    for k in range(mu.shape[0]):
        # Keep only features whose interval is clearly narrower than the data range.
        if r[k] < 0.5 * (x[:,k].max()-x[:,k].min()):
            # Undo the standardisation so the interval is in the units of x0.
            r_k = (r[k] * x_std[k]).item()
            mu_k = (mu[k] * x_std[k] + x_mean[k]).item()
            ci = ((mu_k-r_k), (mu_k+r_k))
            predicates.append(dict(
                dim=k, interval=ci
            ))
    return dict(
        predicates=predicates
    )

            
# NOTE(review): in the part of the notebook visible here, `x0` is only assigned
# inside commented-out code (`x0 = df_join.to_numpy()`), so on a fresh kernel
# this call would raise NameError -- confirm which feature matrix is intended.
predicate(x0, subset)

In [None]:
# Colour the embedding by the fitted membership score.
# NOTE(review): `pred` is a local variable of predicate() and is not part of its
# return value; no module-level assignment of `pred` is visible, so this cell
# appears to rely on leftover kernel state -- confirm.
plt.scatter(xy[:,0], xy[:,1], s=0.1, c=pred.detach().numpy())
plt.colorbar()


In [None]:
# Histogram of column 5 of x0 for selected rows, overlaid (faintly) with the unselected rows.
# NOTE(review): `x0` has no uncommented assignment in the visible notebook -- confirm its source.
plt.hist(x0[subset,5], bins=60);
plt.hist(x0[~subset,5], bins=60, alpha=0.1);

In [None]:
# Per-column min and max of columns 4..6 over the selected rows.
# NOTE(review): `.min(0).values` matches the torch.Tensor API, but the only visible
# module-level assignment of `x` is a NumPy array (from `.to_numpy()`) -- presumably
# this expects the standardised tensor built inside predicate(); confirm.
x[subset,4:7].min(0).values, x[subset,4:7].max(0).values

In [None]:
# Per-column min and max of columns 4..6 over the rows outside the selection.
# NOTE(review): `.min(0).values` matches the torch.Tensor API, but the only visible
# module-level assignment of `x` is a NumPy array (from `.to_numpy()`) -- confirm
# which object `x` is meant to be here.
x[~subset,4:7].min(0).values, x[~subset,4:7].max(0).values