In [None]:
'''
该部分实现的内容包括:
1) 对于一个视频去抽帧,抽取到的每一帧照片进行识别
2) 手机上上传来的照片,进行识别
3) 融合视觉和触觉的特征

In [1]:
# resnet-18 

import torch
import torch.nn as nn
import torch.nn.functional as F

#定义残差块ResBlock

class ResBlock(nn.Module):
    def __init__(self, inchannel, outchannel, identity_downsample=None, stride=1):
        super(ResBlock, self).__init__()
        
        #这里定义了残差块内连续的2个卷积层
        self.conv1 = nn.Conv2d(inchannel,outchannel,kernel_size=3,stride=stride,padding=1)
        self.bn1 = nn.BatchNorm2d(outchannel)
        self.conv2 = nn.Conv2d(outchannel,outchannel,kernel_size=3,stride=1,padding=1)
        self.bn2 = nn.BatchNorm2d(outchannel)
        self.relu = nn.ReLU()
        self.identity_downsample = identity_downsample
            
    def forward(self, x):
        identity = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)
        x = self.bn2(x)
        
        # if identity_downsample is not None as default, then:
        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)
        
        x = x + identity
        x = self.relu(x)
        
        return x

In [2]:
class ResNet_18(nn.Module):
    
    def __init__(self, image_channels, num_classes):
        
        super(ResNet_18, self).__init__()
        # self.in_channels = 64
        self.conv1 = nn.Conv2d(image_channels, 64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        #resnet layers
        self.layer1 = self.__make_layer(64, 64, stride=1)
        self.layer2 = self.__make_layer(64, 128, stride=2)
        self.layer3 = self.__make_layer(128, 256, stride=2)
        self.layer4 = self.__make_layer(256, 512, stride=2)
        
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)
        
    def identity_downsample(self, in_channels, out_channels):
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=2, padding=1), 
            nn.BatchNorm2d(out_channels)
        )  
    
    def __make_layer(self, in_channels, out_channels, stride):
        
        identity_downsample = None #默认是none,即identity-free shortcut
        if stride != 1:
            identity_downsample = self.identity_downsample(in_channels, out_channels)
        #对于64-128.128-256.256-512的第一个block,有stride=2,且outchannel=2*inchannel；
        #其他的block,64-64的全部2个,64-128的第2个，128-256的第2个，256-512的第2个，都是outchannel=inchannel
            
        return nn.Sequential(
            ResBlock(in_channels, out_channels, identity_downsample=identity_downsample, stride=stride), 
            ResBlock(out_channels, out_channels)
        )
    
    def forward(self, x):
        
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        
        x = self.avgpool(x)
        x = x.view(x.shape[0], -1)
        x = self.fc(x)
        return x 
    

In [3]:
#model = ResNet_18(3,10)
#model.load_state_dict(torch.load('best_sign_model.pth'))
model = torch.load("best_sign_model.pth")
device = torch.device("cuda:0" if torch.cuda.is_available () else "cpu")
model=model.to(device)

In [None]:
# 视频抽取帧,并单张照片识别

import cv2 
from PIL import Image
import numpy as np
from torchvision import transforms

cap = cv2.VideoCapture('self_made_signs/dark.mp4')  

normalize = transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
val_transform = transforms.Compose([transforms.ToTensor(),transforms.Resize((224,224)),normalize])

res=[]

while(cap.isOpened()):  
    ret, frame = cap.read()

    frame = val_transform(frame)
    frame = torch.unsqueeze(frame, 0) 
    
    output = model(frame)
    _, pred = torch.max(output, axis=1)
    print(pred)
    res.append(pred)  

cap.release()  
cv2.destroyAllWindows()

In [1]:
# 单张照片识别

import cv2 
from PIL import Image
import numpy as np
from torchvision import transforms

#实时拍摄的照片
img = Image.open("test.png");img = img.resize((224,224))

#定义transform
normalize = transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
val_transform = transforms.Compose([transforms.Resize((224,224)),transforms.ToTensor(),normalize])

#施加transform
img = val_transform(img)
img = torch.reshape(img,(1,3,224,224))

#预测
output = model(img)
_, pred = torch.max(output, axis=1)
#print(pred)

pred = pred.item()
print(pred)

NameError: name 'torch' is not defined

In [29]:
import csv
with open('SV-dataset/part1/Somatosensory_data2.csv', newline='') as csvfile:  
    reader = csv.reader(csvfile)
    somato = list(reader)
    for i in range(0,3000,1):
        for j in range(0,6,1):
            somato[i][j] = float(somato[i][j])

In [30]:
print(somato[0])

somato_index = (num-0)*300 + img_index
out_somato = somato[somato_index] ;  out_somato[-1] = int(out_somato[-1]) 
print(out_somato)

[0.02846, 1.06148, 0.94354, 0.98445, 1.04778, 0.0]
[0.16886, 1.00104, 1.00934, 0.96994, 0.96839, 0]


In [24]:
category = ["bowl","dog","feel","get","I","know","like","must","sick","you"]

In [42]:
out_for_all = []

for num in range(0,10,1):
    for img_index in range(1,91,1): #1→90
        # 图像读取和预处理
        img_path = 'self_made_signs' + '/' + category[num] + '/' + str(img_index) + ".png"
        img = Image.open(img_path)
        normalize = transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        transform = transforms.Compose([transforms.Resize((224,224)),transforms.ToTensor(),normalize])
        img = transform(img) ; img = torch.reshape(img,(1,3,224,224))
        # 加载模型，进入测试模式
        model = torch.load('best_sign_model.pth')
        device = torch.device("cuda:0" if torch.cuda.is_available () else "cpu") ; model = model.to(device)
        model.eval()
        # 得到model的1*10的tensor
        with torch.no_grad():
            img=img.to(device)
            out=model(img)
            # _, pred = torch.max(output, axis=1);print(pred)
            out=out.tolist(); out=out[0]
            out_img = out
            
        somato_index = (num-0)*300 + (img_index-1)
        out_somato = somato[somato_index] ;  out_somato[-1] = int(out_somato[-1]) 
        
        out_concate = out_img + out_somato
        out_for_all.append(out_concate)

In [41]:
path  = "concate.csv"
with open(path,'w',newline='') as f:
    csv_write = csv.writer(f)
    #csv_write.writerow(["visual[0]","visual[1]","visual[2]","visual[3]","visual[4]","visual[5]","visual[6]","visual[7]","visual[8]","visual[9]","somato[0]","somato[1]","somato[2]","somato[3]","somato[4]","true_index"]) #首行
    csv_write.writerows(out_for_all)