# 常规赛：PALM病理性近视预测 8月第五名方案
## 解压数据集

In [1]:
%cd /home/aistudio/data/data93479
!unzip ./常规赛：PALM病理性近视预测.zip
!mv -f /home/aistudio/data/data93479/常规赛：PALM病理性近视预测/* /home/aistudio/data

  inflating: __MACOSX/常规赛：PALM病理性近视预测/._Train  

进入工作文件夹

In [2]:
# Work from /home/aistudio/work: the relative paths used later in this notebook
# ('../data/PALM-Testing400-Images', './savepoint/final') are resolved from here.
%cd /home/aistudio/work

/home/aistudio/work


## 安装PPIM
本次需要使用PPIM中的预训练模型DeiT-base-224（代码中对应 `deit_b_distilled`）
- 持久化至external-libraries

In [3]:
#持久化ppim
!mkdir /home/aistudio/external-libraries
!pip install ppim -i https://pypi.python.org/pypi -t /home/aistudio/external-libraries

Looking in indexes: https://pypi.python.org/pypi
Collecting ppim
[?25l  Downloading https://files.pythonhosted.org/packages/e7/35/369dc6956de64359703bb49d20721e0ae963c1183bb4c88535470f2efe93/ppim-1.1.0-py3-none-any.whl (66kB)
[K     |████████████████████████████████| 71kB 12kB/s eta 0:00:012
[?25hCollecting wget (from ppim)
  Downloading https://files.pythonhosted.org/packages/47/6a/62e288da7bcda82b935ff0c6cfe542970f04e29c756b0e147251b2fb251f/wget-3.2.zip
Building wheels for collected packages: wget
  Building wheel for wget (setup.py) ... [?25ldone
[?25h  Created wheel for wget: filename=wget-3.2-cp37-none-any.whl size=9675 sha256=41741c3de1ae7ae2662a2a12cea3ff76f061cc00381d41351a8439ce4d822e0d
  Stored in directory: /home/aistudio/.cache/pip/wheels/d1/e3/b6/e6be72d63f667cef0226c3eedff3e6658ba97d5be7d9df25dd
Successfully built wget
Installing collected packages: wget, ppim
Successfully installed ppim-1.1.0 wget-3.2


将安装的PPIM导入到环境

In [4]:
import sys

# Make packages installed into external-libraries importable.
# Guarded so that re-running the cell does not keep appending duplicate entries.
EXTERNAL_LIBS='/home/aistudio/external-libraries'
if EXTERNAL_LIBS not in sys.path:
    sys.path.append(EXTERNAL_LIBS)

## 准备工作
导入包，设置随机种子。

In [5]:
import os
import math
import random
import numpy as np
import pandas as pd
from PIL import Image
import paddle

def set_seed(seed):
    """Seed Python's `random`, NumPy and Paddle with the same value.

    Args:
        seed: value passed unchanged to each library's seeding function.
    """
    # Same call order as before: random -> numpy -> paddle.
    for seeder in (random.seed, np.random.seed, paddle.seed):
        seeder(seed)

set_seed(0)
# Prefer the GPU, but fall back to the CPU on a Paddle build without CUDA support,
# where set_device('gpu') would raise instead of selecting a device.
paddle.set_device('gpu' if paddle.is_compiled_with_cuda() else 'cpu')

CUDAPlace(0)

## 生成paddle数据集类
- 对测试集图片的预处理如下：
    - resize至[224,224,3]
    - 改变通道位置Channel Last->Channel First
    - 输入归一化

In [6]:
from paddle.io import Dataset
import paddle.vision.transforms as T

class PALMTsData(Dataset):
    """Unlabelled test-set dataset that yields preprocessed images from a directory.

    Every entry returned by ``os.listdir(path)`` is treated as an image file.
    ``pic_list`` keeps that listing order, which is the order in which samples
    (and therefore predictions) are produced; pair predictions with ``pic_list``
    when file names are needed.

    Args:
        path: directory containing the test images.
    """

    def __init__(self,path):
        super(PALMTsData,self).__init__()
        self.path=path
        self.pic_list=os.listdir(path)
        # Resize to 224x224, convert to a CHW float tensor, then normalize per channel.
        self.tf=T.Compose([
            T.Resize((224,224), interpolation='bicubic'),
            T.ToTensor(),
            T.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])
        print(len(self.pic_list))
    
    def __getitem__(self,idx):
        """Return ``(image_tensor, 0)``; the 0 is a placeholder label for the test set."""
        pic_name=self.pic_list[idx]
        # The context manager closes the file handle as soon as the pixels are read.
        # convert('RGB') guarantees three channels, which the 3-value mean/std
        # normalization above requires (grayscale/RGBA/palette files would not match).
        with Image.open(os.path.join(self.path,pic_name)) as img:
            pic=img.convert('RGB')
        pic=self.tf(pic)
        return pic,0

    def __len__(self):
        """Number of files found in the image directory."""
        return len(self.pic_list)

ts_path='../data/PALM-Testing400-Images'
tsData=PALMTsData(path=ts_path)

400


## 构建模型
- 模型使用预训练模型resnet50 + DeiT_base_224为主体，二者直接并联，输出为[1000+1000]
- 下游任务分类器结构为2层全连接网络(2000->100->1)，以relu激活、sigmoid输出患病概率，并辅以Dropout=0.2防止过拟合
- 下游任务被设定为二分类问题

In [7]:
from paddle import nn
import paddle.nn.functional as F
from ppim import deit_b_distilled
from paddle.vision.models import resnet50

class PALM(nn.Layer):
    """Two-backbone binary classifier: DeiT and ResNet50 run in parallel on the same input.

    The two backbone outputs are concatenated along the last axis and passed
    through ReLU -> Dropout -> Linear(2000, 100) -> ReLU -> Dropout -> Linear(100, 1)
    -> sigmoid, so the network emits one value in (0, 1) per sample.
    """

    def __init__(self):
        super().__init__()
        # Sub-layer attribute names are left unchanged so that the saved
        # checkpoint loaded below still matches this network's parameters.
        self.deit=deit_b_distilled(pretrained=True)
        self.resnet=resnet50(pretrained=True)
        self.do1=nn.Dropout(p=0.2)
        self.lr1=nn.Linear(in_features=2000, out_features=100)
        self.do2=nn.Dropout(p=0.2)
        self.lr2=nn.Linear(in_features=100, out_features=1)

    def forward(self,x):
        """Run both backbones on ``x`` and return the sigmoid output of the head."""
        backbone_out=[self.deit(x),self.resnet(x)]
        fused=F.relu(paddle.concat(backbone_out,axis=-1))
        hidden=F.relu(self.lr1(self.do1(fused)))
        logit=self.lr2(self.do2(hidden))
        return F.sigmoid(logit)

# Build the network, then overwrite its weights with the saved checkpoint.
palm=PALM()
palm.set_state_dict(paddle.load('./savepoint/final'))

100%|██████████| 511043/511043 [00:08<00:00, 62638.32it/s]
100%|██████████| 151272/151272 [00:02<00:00, 62563.23it/s]
  return list(x) if isinstance(x, collections.Sequence) else [x]


## 模型训练参数
- 优化器：adam
- Loss：二分类使用BCELoss
- 学习率：固定5e-5

In [None]:
from paddle.optimizer import Adam
from paddle.nn import BCELoss
from paddle.metric import Accuracy

# Static specs for the network input (batch of 3x224x224 float images) and its label.
inputs=paddle.static.InputSpec(shape=[-1, 3, 224, 224], dtype='float32', name='input')
label=paddle.static.InputSpec(shape=[-1, 1], dtype='float32', name='label')

# Wrap the network in the high-level Model API and print its layer summary.
model=paddle.Model(palm, inputs=inputs, labels=label)
model.summary()

# Training configuration: Adam with a fixed 5e-5 learning rate, binary cross-entropy
# loss and an accuracy metric.
adam=Adam(learning_rate=5e-5, parameters=model.parameters())
model.prepare(optimizer=adam, loss=BCELoss(), metrics=Accuracy())

----------------------------------------------------------------------------------------
        Layer (type)             Input Shape          Output Shape         Param #    
          Conv2D-5            [[1, 3, 224, 224]]    [1, 768, 14, 14]       590,592    
        PatchEmbed-1          [[1, 3, 224, 224]]     [1, 196, 768]            0       
         Dropout-1             [[1, 198, 768]]       [1, 198, 768]            0       
        LayerNorm-1            [[1, 198, 768]]       [1, 198, 768]          1,536     
          Linear-1             [[1, 198, 768]]       [1, 198, 2304]       1,771,776   
         Dropout-2           [[1, 12, 198, 198]]   [1, 12, 198, 198]          0       
          Linear-2             [[1, 198, 768]]       [1, 198, 768]         590,592    
         Dropout-3             [[1, 198, 768]]       [1, 198, 768]            0       
        Attention-1            [[1, 198, 768]]       [1, 198, 768]            0       
         Identity-1            [[1, 198, 

## 模型预测
- 对测试集数据预测
- 预测批大小：16
- 预测数据异步读取：2个进程

In [None]:
# Batched inference over the test dataset: 16 images per batch, 2 loader workers.
preds=model.predict(
    tsData,
    batch_size=16,
    num_workers=2,
)

Predict begin...


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data.dtype == np.object:
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  if data.dtype == np.object:


Predict samples: 400


## 保存预测结果
- 记得按文件名重新排序

In [None]:
# preds holds one entry per network output; preds[0] is the sequence of per-batch
# arrays for the single output. Concatenating along the batch axis works for any
# dataset size, including a smaller final batch, with no hard-coded sample count.
pred=np.concatenate(preds[0],axis=0).reshape(-1)
# Reuse the dataset's own file list: it is exactly the order the samples were fed to
# the model in, whereas a second os.listdir() call is not guaranteed to match it.
pred_list=tsData.pic_list
assert len(pred_list)==pred.shape[0], 'number of predictions does not match number of files'
print(pred.shape)
ans=pd.DataFrame({'FileName':pred_list,'PM Risk':pred})
# Sort by file name and write the result file.
ans=ans.sort_values(by='FileName',ignore_index=True)
ans.to_csv('./Classification_Results_Test.csv',index=False)

(400,)


## 注意事项与特殊说明
由于GitHub对大文件上传限制，模型训练参数checkpoint无法上传。
- 完整repo详见AI studio项目：https://aistudio.baidu.com/aistudio/projectdetail/2318275