In [22]:
import torch
import numpy as np
import pandas as pd
import torchvision.transforms as transforms
from torch.utils.data import Dataset,DataLoader,sampler
from PIL import Image

## Goal of the module
Train the visual inference model, which uses the visual data (images) to predict the position of the hand.

### Input
image

### Output
spatial_coding of the hand position

### Model structure
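input-->Conv-->MaxPooling-->Conv-->MaxPooling-->FC-->FC-->FC-->Softmax (the network returns raw class scores; the softmax is applied implicitly by the cross-entropy loss)

### To do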
1. Build the visual dataset<br>
a. Convert the videos to images.<br>
b. Learn to build a PyTorch image dataset.
2. Train the visual inference network to predict the hand position.
3. Build a multisensory dataset<br>
a. Align the visual and proprioceptive data.<br>
b. Merge them into a single dataset.

In [23]:
# Directory and file name of the simulated data table.
src_path = '/Users/Jipeng/PycharmProjects/simulated_multisensory_integration/data/'
file_name = 'simulated_data.csv'
# The full path is the directory prefix immediately followed by the file name.
df = pd.read_csv('%s%s' % (src_path, file_name))

# Discrete hand positions the classifier distinguishes: the integers -4 .. 4.
loc_map = range(-4, 5)


def label_function(x):
    """Return the class index (0..8) of the hand position ``x`` (-4..4).

    Raises ValueError when ``x`` is not one of the nine positions.
    """
    positions = range(-4, 5)
    class_index = positions.index(x)
    return class_index

feature_names = df.columns.tolist()
# Parenthesised call so the cell parses under both Python 2 and Python 3.
print(feature_names)

# Round the continuous x-coordinate of the hand to the nearest integer position.
label = np.around(df['hand_x'])
df['label'] = np.around(df['hand_x'])
# The builtin int replaces the np.int alias, which was removed in NumPy 1.24.
df['label1'] = df['label'].astype(int)
# Map the integer positions (-4..4) onto class indices (0..8); label_function
# raises ValueError for a position outside that range.
df['label'] = df['label1'].apply(label_function)
print (df)

['Unnamed: 0', 'elbow_angle', 'elbow_x', 'elbow_y', 'frame_idx', 'hand_x', 'hand_y', 'shoulder_angle', 'target_x', 'video_idx']
       Unnamed: 0  elbow_angle  elbow_x  elbow_y  frame_idx  hand_x  hand_y  \
0               0       113.81     0.44     2.97          0   -1.25    2.97   
1               1       113.81     0.29     2.99          1   -1.46    2.99   
2               2       113.81     0.44     2.97          2   -1.25    2.97   
3               3       116.81     0.44     2.97          3   -1.19    2.97   
4               4       113.81     0.44     2.97          4   -1.25    2.97   
5               5       113.81     0.60     2.94          5   -1.03    2.94   
6               6       113.81     0.44     2.97          6   -1.25    2.97   
7               7       110.81     0.44     2.97          7   -1.30    2.97   
8               8       107.81     0.44     2.97          8   -1.35    2.97   
9               9       104.81     0.44     2.97          9   -1.39    2.97   
10 

In [24]:
class ImageDataset(Dataset):
    """Image/label pairs described by the rows of a pandas DataFrame.

    Every row must provide the columns ``video_idx``, ``frame_idx`` and
    ``label``. The image belonging to a row is read from
    ``<image_root><video_idx>/mask<frame_idx>.jpg``.
    """

    def __init__(self,dataframe,image_root,transform=None):
        """Store the table, the image root prefix and an optional transform.

        dataframe:  table with one row per image (kept by reference, not copied).
        image_root: path prefix of the image folders; it is concatenated as-is,
                    so it must end with a path separator.
        transform:  optional callable applied to the loaded RGB PIL image.
        """
        self.dataset = dataframe
        self.image_root = image_root
        self.transform = transform

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at *position* ``idx``."""
        # Samplers hand out positions 0..len-1, so rows are addressed by
        # position (.iloc) instead of by index label (.loc); this keeps the
        # dataset usable with filtered or re-indexed frames. Column-wise access
        # preserves each column's dtype (integer ids stay integers).
        video_idx = self.dataset['video_idx'].iloc[idx]
        frame_idx = self.dataset['frame_idx'].iloc[idx]
        label = self.dataset['label'].iloc[idx]
        image_path = self.image_root + str(video_idx) + '/mask%s.jpg'%frame_idx
        # The context manager closes the underlying file as soon as the pixel
        # data has been converted, instead of leaving that to garbage collection.
        with Image.open(image_path) as image_file:
            img = image_file.convert('RGB')
        if self.transform is not None:
            img = self.transform(img)
        return img, label

    def __len__(self):
        """Number of rows (= images) in the table."""
        return self.dataset.shape[0]

# Path prefix under which ImageDataset looks up '<video_idx>/mask<frame_idx>.jpg'.
image_src_path = '/Users/Jipeng/PycharmProjects/simulated_multisensory_integration/data/images/mask/'
# Convert every loaded PIL image into a tensor before it reaches the network.
transform = transforms.ToTensor()
image_dataset = ImageDataset(dataframe=df, image_root=image_src_path, transform=transform)

In [25]:
def train_test_loader(train_ratio,dataset,batch_size=4):
    """Split ``dataset`` sequentially into a training and a test DataLoader.

    The first ``int(len(dataset) * train_ratio)`` samples form the training
    pool and the remaining samples the test pool. The split itself is not
    shuffled; only the order inside each pool is randomised by the sampler.

    train_ratio: fraction of the samples used for training (e.g. 0.7).
    dataset:     any map-style dataset (supports ``len`` and integer indexing).
    batch_size:  samples per batch for both loaders (default 4).

    Returns ``(train_loader, test_loader)``.
    """
    n_data = len(dataset)
    split = int(n_data*train_ratio)

    indices = list(range(n_data))
    train_sampler = sampler.SubsetRandomSampler(indices[:split])
    test_sampler = sampler.SubsetRandomSampler(indices[split:])

    # shuffle must stay False: the samplers already decide the visiting order.
    train_loader = DataLoader(dataset, sampler=train_sampler, shuffle=False, batch_size=batch_size)
    test_loader = DataLoader(dataset, sampler=test_sampler, shuffle=False, batch_size=batch_size)

    return train_loader, test_loader
# 70 % of the samples go to training, the remaining 30 % are held out for testing.
train_loader, test_loader = train_test_loader(train_ratio=0.7, dataset=image_dataset)

In [26]:
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Small CNN: two conv + max-pool stages followed by three linear layers.

    ``forward`` returns raw (un-normalised) class scores, which is the input
    nn.CrossEntropyLoss expects.
    """

    # Per-sample feature count after the second pooling stage
    # (16 channels x 15 spatial positions). Integer division keeps this an int
    # under Python 3 as well; a float here makes nn.Linear and view() fail.
    FLAT_FEATURES = 960 // 4

    def __init__(self, n_classes=None):
        """Build the layers.

        n_classes: number of output classes. When omitted, the length of the
                   module-level ``loc_map`` is used (the original behaviour).
        """
        super(Net,self).__init__()
        if n_classes is None:
            n_classes = len(loc_map)
        # Conv2d(in_channels, out_channels, kernel_size)
        self.conv1 = nn.Conv2d(3,6,10)
        # One pooling layer (window 10, stride 10) shared by both conv stages.
        self.pool = nn.MaxPool2d(10,10)
        self.conv2 = nn.Conv2d(6,16,10)
        self.fc1 = nn.Linear(self.FLAT_FEATURES,120)
        self.fc2 = nn.Linear(120,84)
        self.fc3 = nn.Linear(84,n_classes)

    def forward(self,x):
        """Map an image batch (N, 3, H, W) to class scores (N, n_classes).

        H and W must be such that the second pooling stage leaves
        16 * 15 = 240 values per image (e.g. 400 x 600 inputs).
        """
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        # Flatten the feature maps of each sample into one row.
        x = x.view(-1,self.FLAT_FEATURES)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Instantiate the CNN; its weights are freshly initialised (untrained) here.
net = Net()

In [27]:
import torch.optim as optim
# Multi-class objective computed from the raw class scores returned by the network.
criterion = nn.CrossEntropyLoss()
# SGD with momentum over every parameter of the network.
optimizer = optim.SGD(params=net.parameters(), lr=0.001, momentum=0.9)

In [28]:
# Smoke test: fetch a single batch to confirm that the dataset/loader wiring
# works before the (much longer) training loop starts.
test = 0
for i, data in enumerate(train_loader):
    image,label = data
    if test == 0:
        break

# Train for two passes over the training split.
for epoch in range(2):

    running_loss = 0.0
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        # Clears the gradients of all optimized tensors
        optimizer.zero_grad()
        outputs = net(inputs)
        loss = criterion(outputs,labels)
        loss.backward()
        optimizer.step()

        # Report the mean loss of every block of 20 mini-batches.
        running_loss += loss.item()
        if i % 20 == 19:
            print ('[%d, %5d] loss: %.3f'%
                   (epoch+1, i+1, running_loss/20))
            running_loss = 0.0

# Printed once after the last epoch (it used to sit inside the epoch loop and
# was therefore emitted after every epoch).
print ('Finished Training')

[1,    20] loss: 2.231
[1,    40] loss: 2.207
[1,    60] loss: 2.174
[1,    80] loss: 2.119
[1,   100] loss: 2.111
[1,   120] loss: 2.073
[1,   140] loss: 2.044
[1,   160] loss: 2.049
[1,   180] loss: 1.999
[1,   200] loss: 1.970
[1,   220] loss: 1.881
[1,   240] loss: 1.876
[1,   260] loss: 1.882
[1,   280] loss: 1.887
[1,   300] loss: 1.702
[1,   320] loss: 1.760
[1,   340] loss: 1.700
[1,   360] loss: 1.530
[1,   380] loss: 1.580
[1,   400] loss: 1.395
[1,   420] loss: 1.462
[1,   440] loss: 1.298
[1,   460] loss: 1.137
[1,   480] loss: 1.147
[1,   500] loss: 0.798
[1,   520] loss: 0.758
[1,   540] loss: 0.453
[1,   560] loss: 0.471
[1,   580] loss: 0.350
[1,   600] loss: 0.466
[1,   620] loss: 0.246
[1,   640] loss: 0.295
[1,   660] loss: 0.252
[1,   680] loss: 0.321
[1,   700] loss: 0.214
[1,   720] loss: 0.208
[1,   740] loss: 0.180
[1,   760] loss: 0.183
[1,   780] loss: 0.121
[1,   800] loss: 0.137
[1,   820] loss: 0.171
[1,   840] loss: 0.087
[1,   860] loss: 0.146
[1,   880] 

In [29]:
# Persist only the learned parameters (state dict), not the whole module object.
net_path = './mask_visual_infer.pth'
torch.save(obj=net.state_dict(), f=net_path)

In [30]:
# Rebuild the architecture and restore the parameters saved at net_path.
net = Net()
net.load_state_dict(state_dict=torch.load(f=net_path))

<All keys matched successfully>

In [31]:
true_correct = 0          # predictions that hit exactly the labelled class
approximate_correct = 0   # predictions within +/- scope classes of the label
total = 0                 # number of evaluated samples
scope = 1                 # tolerance (in class indices) of an "approximate" hit
loc_map = range(-4,5)
# Number of test batches to evaluate before stopping early. 19 reproduces the
# original quick check; set to None to evaluate the whole test split.
max_test_batches = 19
# Both counters are bucketed by the PREDICTED class (not by the true label).
position_correct = [0] * len(loc_map)
position_total = [0] * len(loc_map)
progress = 0.0
with torch.no_grad():
    for data in test_loader:
        progress += 1
        if max_test_batches is not None and progress > max_test_batches:
            print ('Testing progress: %2d%%'%(progress/(len(test_loader))*100))
            break
        test_inputs, test_labels = data
        test_outputs = net(test_inputs)
        # Index of the highest score = predicted class.
        predicted = torch.max(test_outputs,1)[1]
        total += test_labels.size(0)
        true_correct += (predicted == test_labels).sum().item()
        # Plain Python ints are used for list indexing and the tolerance check.
        for pred, target in zip(predicted.tolist(), test_labels.tolist()):
            if abs(pred - target) <= scope:
                position_correct[pred] += 1
            position_total[pred] += 1
for i in range(len(position_total)):
    approximate_correct += position_correct[i]
    if position_total[i] == 0:
        print('Accuracy of position %d : %d %%'%(loc_map[i], 0))
    else:
        print('Accuracy of position %2s : %d %% and the total number is %d'%(loc_map[i], 100 * position_correct[i]/position_total[i],position_total[i]))
if total:
    print( 'The true accuracy of the network is %d %%'%(100 * true_correct/total))
    print( 'The approximate accuracy of the network is %d %%'%(100 * approximate_correct/total))
else:
    # Avoid a ZeroDivisionError when the test loader yielded no samples.
    print('No test samples were evaluated')


Testing progress:  2%
Accuracy of position -4 : 100 % and the total number is 1
Accuracy of position -3 : 100 % and the total number is 7
Accuracy of position -2 : 100 % and the total number is 5
Accuracy of position -1 : 100 % and the total number is 3
Accuracy of position  0 : 100 % and the total number is 9
Accuracy of position  1 : 100 % and the total number is 15
Accuracy of position  2 : 100 % and the total number is 16
Accuracy of position  3 : 100 % and the total number is 20
Accuracy of position 4 : 0 %
The true accuracy of the network is 98 %
The approximate accuracy of the network is 100 %


In [33]:
# Run the trained network over every image, in dataset order, and store the
# raw class scores of each image as one space-separated string per row.
image_dataloader = DataLoader(image_dataset,batch_size=1, shuffle=False)
n_images = len(image_dataloader)
visual_predictions = []
with torch.no_grad():
    for idx, (image_inputs, _image_labels) in enumerate(image_dataloader):
        if idx%200 == 199:
            print ('%d%% Testing'%(idx*1.0/n_images*100.0))
        image_outputs = net(image_inputs)
        # batch_size is 1, so row 0 holds the scores of the current image.
        scores = image_outputs[0].tolist()
        visual_predictions.append(' '.join(str(score) for score in scores))
# Assign the column once, by position, instead of writing df.loc[idx, ...] row
# by row (slow, and it would append rows if df had a non-default index).
df['visualpredict'] = visual_predictions
print(df)

0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
0% Testing
       Unnamed: 0  elbow_angle  elbow_x  elbow_y  frame_idx  hand_x  hand_y  \
0               0       113.81     0.44     2.97          0   -1.25    2.97   
1               1       113.81     0.29     2.99          1   -1.46    2.99   
2               2       113.81     0.44     2.97          2   -1.25    2.97   
3               3       116.81     0.44     2.97          3   -1.19    2.97   
4               4     

In [34]:
# Write the table, now including the 'visualpredict' column, to disk.
# NOTE(review): the index is written as well (to_csv default), which shows up as
# an extra 'Unnamed: 0' column when the file is read back — confirm this is wanted.
df.to_csv('/Users/Jipeng/PycharmProjects/simulated_multisensory_integration/data/after_visual.csv')
