In [57]:
%reload_ext autoreload
%autoreload 2

import numpy as np
import os
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt 

from tensorflow.keras.preprocessing import image
from scipy import spatial
from tensorflow.keras.preprocessing.image import img_to_array
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras import layers, Model
from lib.load import *
from lib.extractor import ImgExtractor
from lib.model import build_model

from PIL import Image, ImageEnhance, ImageOps

tf.config.experimental_run_functions_eagerly(True)

# S0: Load Images

In [4]:
real_images = load_real_images('Dataset/Real_Effusion')
synthetic_images = load_synthetic_images('Dataset/Synthetic_Effusion')

# S1: Extract Features

### 1.1: Prepare images arr into model input format

In [17]:
realImgMIArr = np.squeeze(np.array(real_images[0]))
realImgMIArr.shape

(352, 224, 224, 3)

In [18]:
synImgMIArr = np.squeeze(np.array(synthetic_images[0]))
synImgMIArr.shape

(170, 224, 224, 3)

### 1.2 Use pretrained VGG16 model to extract features

In [10]:
extractor = ImgExtractor(model="VGG16")

In [19]:
realExtFea = extractor.model.predict(realImgMIArr)
synExtFea = extractor.model.predict(synImgMIArr)

In [22]:
print(realExtFea.shape, synExtFea.shape)

(352, 1000) (170, 1000)


### 1.3 Append Extracted feature to current list

- real_images is a list has 3 element -> 1: raw arr 2: file names 3: extracted features
- synthetic_images is a list has 4 element -> 1: raw arr 2: folder name 3: file name 4: extracted features

In [23]:
real_images.append(realExtFea)
synthetic_images.append(synExtFea)

# S2: Compare similarity Score

- 两两对比: 利用index 循环

In [61]:
simi_record = []
for r_index in range(len(real_images[0])):
    for s_index in range(len(synthetic_images[0])):
        r_fea = real_images[2][r_index]
        s_fea = synthetic_images[3][s_index]
        
        # cal similarity + record real path + record syn path
        simi_score = cosine_similarity(r_fea.reshape(1, -1), s_fea.reshape(1, -1))[0][0]
        r_path = real_images[1][r_index]
        s_path = synthetic_images[1][s_index] + "/" + synthetic_images[2][s_index]
        
        r_rawArr = real_images[0][r_index]
        s_rawArr = synthetic_images[0][s_index]
        
        simi_record.append([r_path, s_path, simi_score, r_rawArr, s_rawArr])
simiDF = pd.DataFrame(simi_record, columns=["realPath", "synPath", "simiScore", "rRawArr", "sRawArr"])

In [114]:
# delete data to save memory
del simi_record
del real_images
del synthetic_images

In [68]:
simiDF.iloc[:5, :3]

Unnamed: 0,realPath,synPath,simiScore
0,m7138_a7356_s7425_1_27_US_.png,d12/1.png,0.04013
1,m7138_a7356_s7425_1_27_US_.png,za14/1.png,0.435857
2,m7138_a7356_s7425_1_27_US_.png,za13/1.png,0.390371
3,m7138_a7356_s7425_1_27_US_.png,d15/1.png,0.314453
4,m7138_a7356_s7425_1_27_US_.png,z3/1.png,0.325425


# S3: Build MI datasets based on simi Score

- 分档: 
    - 0.34 - 1
    - 0.14 - 0.34
    - 0.14 - 0.04
    - 0.04 - 0

In [62]:
def build_mi(simiDF, lowB, upB):
    return simiDF.query(f"{lowB} <= simiScore <= {upB}")

In [63]:
simiDF["simiScore"].describe()

count    59840.000000
mean         0.225467
std          0.227435
min          0.000015
25%          0.043149
50%          0.145323
75%          0.343558
max          0.994263
Name: simiScore, dtype: float64

In [64]:
trainDF1 = build_mi(simiDF, 0.34, 1)
trainDF2 = build_mi(simiDF, 0.14, 0.34)
trainDF3 = build_mi(simiDF, 0.04, 0.14)
trainDF4 = build_mi(simiDF, 0, 0.04)

In [65]:
print(trainDF1.shape, trainDF2.shape, trainDF3.shape, trainDF4.shape)

(15140, 5) (15324, 5) (15037, 5) (14339, 5)


# S4: Train model to compare accuracy

In [None]:
clsModel = build_model()

### 4.1 TrainDF1 as an example

In [78]:
# 1 class, posArr is X , posLabel is Y
posArr = np.array(trainDF1.iloc[:, 3].tolist())
negArr = np.array(trainDF1.iloc[:, 4].tolist())

posLabel = np.array([0, 1] * len(posArr)).reshape(-1, 2)
negLabel = np.array([1, 0] * len(negArr)).reshape(-1, 2)

In [103]:
posLabel
negLabel

array([[1, 0],
       [1, 0],
       [1, 0],
       ...,
       [1, 0],
       [1, 0],
       [1, 0]])

In [80]:
posArr = np.squeeze(posArr)
negArr = np.squeeze(negArr)

In [104]:
print(posArr.shape, negArr.shape, posLabel.shape, negLabel.shape)

(15140, 224, 224, 3) (15140, 224, 224, 3) (15140, 2) (15140, 2)


In [105]:
X = np.concatenate([posArr, negArr])
y = np.concatenate([posLabel, negLabel])
print(X.shape, y.shape)

(30280, 224, 224, 3) (30280, 2)


### 4.2 Train model, 400 data as an example

In [106]:
batch_size = 128

In [110]:
print(X[:400].shape, y[:400].shape)

(400, 224, 224, 3) (400, 2)


In [112]:
# epochs define how many times you train the model
clsModel.fit(X[:400], y[:400], batch_size = batch_size, verbose = 1, epochs=1)



<tensorflow.python.keras.callbacks.History at 0x7f7f85b55460>

In [113]:
loss, accuracy = clsModel.evaluate(X[:400], y[:400])



[0.653351366519928, 1.0]

### 4.3 Into module

In [None]:
# loss: 求导更新参数用的: squareloss:  y_true, y_prd, 1, 0.8 -> (1 - 0.8)**2

In [116]:
# 1 class, posArr is X , posLabel is Y
def cal_accuray_for_trainDF(trainDF, epochs=10):
    posArr = np.array(trainDF.iloc[:, 3].tolist())
    negArr = np.array(trainDF.iloc[:, 4].tolist())
    # 0 class
    posLabel = np.array([0, 1] * len(posArr)).reshape(-1, 2)
    negLabel = np.array([1, 0] * len(negArr)).reshape(-1, 2)

    posArr = np.squeeze(posArr)
    negArr = np.squeeze(negArr)

    X = np.concatenate([posArr, negArr])
    y = np.concatenate([posLabel, negLabel])

    clsModel.fit(X, y, batch_size = 128, verbose = 1, epochs=epochs)
    loss, accuracy = clsModel.evaluate(X, y)
    return loss, accuracy

### 4.4 Test sample data

In [120]:
sampleDF = trainDF1.sample(20)
sampleDF.shape

In [None]:
loss, accuracy = cal_accuray_for_trainDF(trainDF1.sample(5000), epochs=10)

Epoch 1/10
 5/79 [>.............................] - ETA: 1:44:56 - loss: 0.7296 - accuracy: 0.5000