In [1]:
DATA_PATH='D:\数据科学\\1'

In [2]:
from PIL import Image
from transformers import AutoImageProcessor, ViTModel
import torch
import os
import pandas as pd

In [3]:
from torch.utils.data import Dataset, DataLoader
class AppleDataset(Dataset):
    """User defined class to build a datset using Pytorch class Dataset."""
    def __init__(self, file,data_path):
        """Method to initilaize variables."""
        self.data=pd.read_csv(file)
        self.data_path=data_path
        self.image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k")
    def __getitem__(self, index):
        feature1,feature2,label=self.data['couple1'][index],self.data['couple2'][index],self.data['label1'][index]
        img_path = os.path.join(self.data_path, feature1)
        img_path1 = os.path.join(self.data_path, feature2)
        img = Image.open(img_path).convert("RGB")
        img1 = Image.open(img_path1).convert("RGB")
        x = self.image_processor(img, return_tensors="pt")
        y = self.image_processor(img1, return_tensors="pt")
        return x['pixel_values'], y['pixel_values'],label
    def __len__(self):
        return len(self.data)

In [4]:
train_set = AppleDataset('train.csv', DATA_PATH)
train_loader = DataLoader(train_set, batch_size=10)
test_set=AppleDataset('test_data.csv', DATA_PATH)
test_loader = DataLoader(train_set, batch_size=10)

In [5]:
import torch
import torch.nn as nn
import numpy as np


class dot_attention(nn.Module):
    """ 点积注意力机制"""

    def __init__(self, attention_dropout=0.0):
        super(dot_attention, self).__init__()
        self.dropout = nn.Dropout(attention_dropout)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, q, k, v, scale=None, attn_mask=None):
        """
        前向传播
        :param q:
        :param k:
        :param v:
        :param scale:
        :param attn_mask:
        :return: 上下文张量和attention张量。
        """
        attention = torch.bmm(q, k.transpose(1, 2))
        if scale:
            attention = attention * scale        # 是否设置缩放
        if attn_mask:
            attention = attention.masked_fill(attn_mask, -np.inf)     # 给需要mask的地方设置一个负无穷。
        # 计算softmax
        attention = self.softmax(attention)
        # 添加dropout
        attention = self.dropout(attention)
        # 和v做点积。
        context = torch.bmm(attention, v)
        return context, attention
class MultiHeadAttention(nn.Module):
    """ 多头自注意力"""
    def __init__(self, model_dim=1024, num_heads=4, dropout=0.0):
        super(MultiHeadAttention, self).__init__()

        self.dim_per_head = model_dim//num_heads   # 每个头的维度
        self.num_heads = num_heads
        self.linear_k = nn.Linear(model_dim, self.dim_per_head * num_heads)
        self.linear_v = nn.Linear(model_dim, self.dim_per_head * num_heads)
        self.linear_q = nn.Linear(model_dim, self.dim_per_head * num_heads)

        self.dot_product_attention = dot_attention(dropout)

        self.linear_final = nn.Linear(model_dim, model_dim)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(model_dim)         # LayerNorm 归一化。

    def forward(self, key, value, query, attn_mask=None):
        residual = query
        dim_per_head = self.dim_per_head
        num_heads = self.num_heads
        batch_size = key.size(0)

        # 线性映射。
        key = self.linear_k(key)
        value = self.linear_v(value)
        query = self.linear_q(query)

        # 按照头进行分割
        key = key.view(batch_size * num_heads, -1, dim_per_head)
        value = value.view(batch_size * num_heads, -1, dim_per_head)
        query = query.view(batch_size * num_heads, -1, dim_per_head)

        if attn_mask:
            attn_mask = attn_mask.repeat(num_heads, 1, 1)

        # 缩放点击注意力机制
        scale = (key.size(-1) // num_heads) ** -0.5
        context, attention = self.dot_product_attention(query, key, value, scale, attn_mask)

        # 进行头合并 concat heads
        context = context.view(batch_size, -1, dim_per_head * num_heads)

        # 进行线性映射
        output = self.linear_final(context)

        # dropout
        output = self.dropout(output)

        # 添加残差层和正则化层。
        output = self.layer_norm(residual + output)

        return output, attention


# q = torch.ones((1, 17, 400))
# k = torch.ones((1, 17, 400))
# v = k
# mutil_head_attention = MultiHeadAttention()
# output, attention = mutil_head_attention(q, k, v)
# print("context:", output.shape)
# print("attention:", attention.size(), attention)


In [6]:
import torch.nn as nn
import torch.nn.functional as F
from moe import SparseMoE
class VIT_Couple(nn.Module):
    def __init__(self, pretrained=False):
        super(VIT_Couple, self).__init__()
        self.model=ViTModel.from_pretrained("google/vit-large-patch16-224-in21k")
        for param in self.model.parameters():
            param.requires_grad = False
        self.fc2 = nn.Linear(512, 100)
        self.fc3 = nn.Linear(100, 2)
        self.fc1 = nn.Linear(1024, 512)
        self.sparse_moe = SparseMoE(512, 3, 2)
        self.dropout = nn.Dropout(p=0.2)
        self.multi_head_attention= MultiHeadAttention()

    def forward(self,x,y):
        x=x.view(x.shape[0],x.shape[2],x.shape[3],x.shape[4])#3,3,224,224
        y=y.view(y.shape[0],y.shape[2],y.shape[3],y.shape[4])
        x = self.model(x).pooler_output#1, 197, 768
        y = self.model(y).pooler_output#1, 197, 768
        x=x.view(x.shape[0],1,x.shape[1])
        y=y.view(y.shape[0],1,y.shape[1])
        pos=torch.concat((x,y),dim=1)# 1 2 768
        pos, attention = self.multi_head_attention(pos, pos, pos)# 1 2 768
        x=x.view(pos.shape[0],-1)
        pos=self.fc1(x)
        # pos=self.sparse_moe(pos)
        pos = self.dropout(pos)
        pos = F.relu(pos)
        pos=self.fc2(pos)
        pos = F.relu(pos)
        pos=self.fc3(pos)

        return pos



Shape of the final output: torch.Size([4, 8, 16])


In [27]:
# from transformers import ViTImageProcessor, ViTModel
# from PIL import Image
# import requests
# url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
# image = Image.open(requests.get(url, stream=True).raw)
# processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224-in21k')
# model = ViTModel.from_pretrained('google/vit-base-patch16-224-in21k')
# inputs = processor(images=image, return_tensors="pt")
# outputs = model(**inputs)
# last_hidden_states = outputs.pooler_output
# last_hidden_states.shape# 1 768

In [7]:
def train(model, train_loader, optimizer, epoch):
    model.train()
    for batch_idx, (data1,data2, labels) in enumerate(train_loader):
        data1 = data1.to('cuda')
        data2 = data2.to('cuda')
        labels = labels.to('cuda')
        outputs = model(data1,data2)
        # 计算损失
        loss = F.cross_entropy(outputs, labels)
        # 反向传播
        optimizer.zero_grad()
        loss.backward()
        # 更新参数
        optimizer.step()
        # 打印训练信息
        if batch_idx % 1 == 0:
            print('Epoch: {}, Batch: {}, Loss: {:.4f}'.format(
                epoch, batch_idx, loss.item()))

In [8]:
def test(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for data, data1,labels in test_loader:
            # 将数据和标签移动到 GPU
            data = data.to('cuda')
            data1 = data1.to('cuda')
            labels = labels.to('cuda')
            # 预测
            outputs = model(data,data1)
            # 计算准确率
            _, predicted = torch.max(outputs.data, 1)
            print('predicted',predicted)
            print('labels',labels)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            print((predicted == labels).sum().item())
    print(correct)
    print(total)
    print('Accuracy: {:.2f}%'.format(100 * correct / total))

In [9]:
model=VIT_Couple().to('cuda')
# new_model=VIT_Couple().to('cuda')
# new_model.load_state_dict(torch.load('version_test3.pth'))
# optimizer = torch.optim.SGD(new_model.parameters(), lr=0.01)
optimizer = torch.optim.AdamW(model.parameters(),
                  lr=1e-4, weight_decay=1e-3)
for epoch in range(20):
    train(model, train_loader, optimizer, epoch)

Epoch: 0, Batch: 0, Loss: 0.6874
Epoch: 0, Batch: 1, Loss: 0.6888
Epoch: 0, Batch: 2, Loss: 0.6876
Epoch: 0, Batch: 3, Loss: 0.7073
Epoch: 0, Batch: 4, Loss: 0.6884
Epoch: 0, Batch: 5, Loss: 0.6871
Epoch: 0, Batch: 6, Loss: 0.7055
Epoch: 0, Batch: 7, Loss: 0.6703
Epoch: 0, Batch: 8, Loss: 0.6685
Epoch: 0, Batch: 9, Loss: 0.6374
Epoch: 0, Batch: 10, Loss: 0.6587
Epoch: 0, Batch: 11, Loss: 0.6805
Epoch: 0, Batch: 12, Loss: 0.6642
Epoch: 0, Batch: 13, Loss: 0.7462
Epoch: 0, Batch: 14, Loss: 0.6626
Epoch: 0, Batch: 15, Loss: 0.6039
Epoch: 0, Batch: 16, Loss: 0.8106
Epoch: 0, Batch: 17, Loss: 0.7721
Epoch: 0, Batch: 18, Loss: 0.7677
Epoch: 0, Batch: 19, Loss: 0.6798
Epoch: 0, Batch: 20, Loss: 0.8375
Epoch: 0, Batch: 21, Loss: 0.6094
Epoch: 0, Batch: 22, Loss: 0.6837
Epoch: 0, Batch: 23, Loss: 0.7527
Epoch: 0, Batch: 24, Loss: 0.7120
Epoch: 0, Batch: 25, Loss: 0.6605
Epoch: 0, Batch: 26, Loss: 0.6327
Epoch: 0, Batch: 27, Loss: 0.7393
Epoch: 0, Batch: 28, Loss: 0.6740
Epoch: 0, Batch: 29, Los

In [10]:
# new_model=VIT_Couple().to('cuda')
# new_model.load_state_dict(torch.load('version_test3.pth'))
test(model, test_loader)

predicted tensor([1, 1, 0, 1, 1, 1, 0, 1, 0, 1], device='cuda:0')
labels tensor([0, 1, 1, 0, 1, 1, 0, 0, 0, 1], device='cuda:0')
6
predicted tensor([0, 1, 0, 0, 1, 0, 1, 0, 0, 0], device='cuda:0')
labels tensor([1, 0, 0, 0, 1, 1, 0, 1, 1, 0], device='cuda:0')
4
predicted tensor([0, 1, 1, 1, 0, 0, 0, 0, 1, 1], device='cuda:0')
labels tensor([0, 0, 1, 1, 0, 0, 1, 1, 1, 1], device='cuda:0')
7
predicted tensor([1, 0, 1, 1, 0, 1, 0, 0, 1, 0], device='cuda:0')
labels tensor([0, 0, 1, 1, 0, 1, 0, 0, 1, 0], device='cuda:0')
9
predicted tensor([0, 0, 0, 1, 0, 0, 1, 1, 0, 0], device='cuda:0')
labels tensor([0, 0, 0, 1, 0, 0, 1, 1, 0, 0], device='cuda:0')
10
predicted tensor([1, 1, 1, 0, 0, 1, 0, 1, 1, 1], device='cuda:0')
labels tensor([1, 1, 1, 0, 0, 1, 0, 1, 1, 0], device='cuda:0')
9
predicted tensor([1, 1, 0, 0, 0, 1, 1, 0, 1, 1], device='cuda:0')
labels tensor([1, 0, 1, 0, 0, 1, 0, 1, 1, 0], device='cuda:0')
5
predicted tensor([0, 1, 0, 1, 0, 0, 1, 0, 0, 0], device='cuda:0')
labels tensor([1

In [11]:
torch.save(model.state_dict(), 'version_test4.pth')


In [None]:
#base:80epoch:56.14