<a href="https://colab.research.google.com/github/Guo-bot-1998/Appendicitis/blob/master/Appendicitis_colab_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# import

In [None]:
!pip install timm
!pip install tqdm

In [96]:
import nibabel as nib
import numpy as np
import os
import pandas as pd
import nibabel as nib
import torch
import torch.nn as nn
import torch.optim as optim
import random
import re
import json
import tqdm
import timm
from torch.utils.data import Dataset, DataLoader


## 存取

In [None]:

def process_data2(shift=0, termi=10, dropFalse=0.85, dropPos=0)\
 -> tuple[torch.Tensor, torch.Tensor, list[tuple[str, int]]]:
  """Load a window of NIfTI scans and stack their slices with per-slice labels.

  Reads the module-level ``datadir`` (directory listed with ``os.listdir``)
  and ``labelpath`` (ground-truth CSV read through ``read_label``). The
  output buffers are preallocated once to keep peak memory low.

  Args:
      shift: Index of the first entry of ``os.listdir(datadir)`` to load.
      termi: Maximum number of files to load, starting at ``shift``.
      dropFalse: Fraction of label-0 slices randomly removed from each scan.
      dropPos: Fraction of label-1 slices randomly removed from each scan.

  Returns:
      A tuple ``(images, labels, scans_info)``: ``images`` has shape
      ``(N, 1, 512, 512)``, ``labels`` is a float tensor of shape ``(N,)``
      and ``scans_info`` lists ``(scan id, number of slices in the volume)``
      for every loaded file. ``N`` counts only the slices that were kept.

  Raises:
      ValueError: If a scan's slice count differs from its number of label rows.
  """
  suffix = '.nii.gz'

  labels_ = read_label(labelpath)
  if labels_.index.name != 'id':
    labels_.set_index('id', inplace=True)

  # NOTE(review): os.listdir order is arbitrary, so shift/termi windows are
  # only reproducible while the directory listing order stays the same.
  filelist = os.listdir(datadir)[shift:shift+termi]
  # removesuffix drops exactly '.nii.gz'; str.strip would strip any of the
  # characters '.', 'n', 'i', 'g', 'z' from both ends of the name.
  keys = [afile.removesuffix(suffix) for afile in filelist]
  selecteds = [labels_.loc[labels_.index.str.startswith(key + '_')] for key in keys]

  # Preallocate for the worst case (no slice dropped); trimmed before returning.
  total = sum(len(selected) for selected in selecteds)
  images = torch.zeros((total, 1, 512, 512))
  labels = -torch.ones(total)
  scans_info = []
  nimgs = 0

  for nprocess, (afile, key, scan) in enumerate(zip(filelist, keys, selecteds)):
    value = nib.load(os.path.join(datadir, afile)).get_fdata()
    scans_info.append((key, value.shape[2]))
    # assumes the label rows of a scan are stored in slice order -- TODO confirm
    label_t = torch.tensor(scan['label'].to_numpy())
    # Move the third volume axis to the front and add a channel axis.
    image_t = torch.from_numpy(value).float().permute(2, 0, 1).unsqueeze(1)

    if len(label_t) != image_t.shape[0]:
      raise ValueError(
          f"{key}: {image_t.shape[0]} slices but {len(label_t)} label rows")

    if (image_t.shape[2] != 512 or image_t.shape[3] != 512):
      image_t = cropping(image_t)
    image_t, label_t = remove_false_images(image_t, label_t, dropFalse)
    image_t, label_t = remove_positive_images(image_t, label_t, dropPos)

    n_new = len(label_t)
    images[nimgs:nimgs+n_new] = image_t
    labels[nimgs:nimgs+n_new] = label_t
    nimgs += n_new
    print(f"Process {nprocess}: {key} finished...")

  print(f"read {len(scans_info)} scans")

  # Drop the unused tail of the buffers (all-zero images labelled -1) that is
  # left over whenever slices were removed above.
  return images[:nimgs], labels[:nimgs].float(), scans_info



def save_model(model, modelname="", dirname="", root=os.path.realpath("/content/drive/MyDrive/AOCR2024/params")):
  """Save ``model.state_dict()`` to ``{root}/{dirname}/{modelname}.pth``.

  Args:
      model: Module whose state dict is saved.
      modelname: File name without extension; asked interactively when empty.
      dirname: Sub-directory under ``root`` (may be nested); asked
          interactively when empty. Created if missing.
      root: Existing base directory for saved parameters.

  Returns:
      The target path without the ``.pth`` extension, or ``None`` when no
      model was given or ``root`` does not exist. An existing ``.pth`` file
      is never overwritten; the path is still returned in that case.
  """
  # Compare with None explicitly: container modules such as nn.Sequential
  # define __len__, so an empty one is falsy even though it is a valid model.
  if model is None:
    print("not given model")
    return

  if not os.path.exists(root):
    print(f"{root} not exists!")
    return

  print(f"model will be saved under {root}")

  if not modelname:
    modelname = input("請輸入模型儲存的檔名:")
  if not dirname:
    dirname = input("請輸入模型儲存的資料夾:")

  filename = f"{root}/{dirname}/{modelname}"

  # makedirs also creates intermediate directories for a nested dirname.
  os.makedirs(os.path.dirname(filename), exist_ok=True)

  if os.path.isfile(filename+'.pth'):
      print(f"{filename}.pth exist.")
  else:
      torch.save(model.state_dict(), f'{filename}.pth')

  return filename



def read_label(excel_path) -> pd.DataFrame:
  """Load the ground-truth CSV and index it by its 'id' column.

  The CSV is expected to provide the columns 'id' and 'label'.
  """
  with open(excel_path, 'r') as handle:
    table = pd.read_csv(handle)
  return table.set_index('id')



def write_error(logpath, e, nprocess):
  """Report error ``e`` raised while handling item ``nprocess`` on stdout.

  ``logpath`` is accepted but unused: appending to a log file was disabled
  because it could stall the program.
  """
  message = f"{nprocess:} error occured: {e}\n"
  print(message)


def read_submission(excel_path) -> pd.DataFrame:
  """Load a submission CSV and index it by its 'id' column.

  The CSV is expected to follow the submission format given by Kaggle.
  """
  with open(excel_path, 'r') as handle:
    table = pd.read_csv(handle)
  return table.set_index('id')




## 列印結果

In [None]:
def get_confusion(guess, truth):
  """Count the confusion-matrix cells of binary predictions.

  Args:
    guess: NumPy array of predicted labels (0 or 1).
    truth: NumPy array of reference labels (0 or 1).

  Returns:
    The counts as a tuple ``(TP, FP, FN, TN)``.
  """
  hit = guess == truth
  miss = np.logical_not(hit)
  is_positive = truth == 1
  is_negative = truth == 0

  TP = np.sum(np.logical_and(hit, is_positive))
  TN = np.sum(np.logical_and(hit, is_negative))
  FN = np.sum(np.logical_and(miss, is_positive))
  FP = np.sum(np.logical_and(miss, is_negative))
  return (TP, FP, FN, TN)



def print_results(prediction, labels):
  """Print the confusion matrix, recall, precision and F1 of binary predictions.

  Args:
    prediction: Predicted labels as a tensor or NumPy array.
    labels: Reference labels as a tensor or NumPy array.

  Tensors are moved to the CPU and converted to NumPy first. A metric whose
  denominator is zero is reported as 0.
  """
  if torch.is_tensor(prediction):
    prediction = prediction.cpu().numpy()
  if torch.is_tensor(labels):
    labels = labels.cpu().numpy()

  TP,FP,FN,TN = get_confusion(prediction, labels)
  print(f"\n\
      真實值\n\
  預  +-----+-----+\n\
  測| TP: {TP}| FP: {FP}|\n\
  值| FN: {FN}| TN: {TN}|\n\
      +-----+-----+ \n")

  # Guard the denominators explicitly instead of dividing by zero and
  # testing for NaN afterwards (which also emits a RuntimeWarning).
  recall = float(TP / (TP + FN)) if (TP + FN) else 0
  precision = float(TP / (TP + FP)) if (TP + FP) else 0
  F1 = 0 if recall + precision == 0 else (2*recall*precision/(recall+precision))

  print(f"{recall=}\n{precision=}\n{F1=}\n")

## 數據處理




In [None]:
def argumenting(images, labels, n=5):
  """Oversample the positive slices.

  Every slice whose label equals 1 is copied ``n`` times and the copies are
  inserted at random positions among the original slices.

  Args:
    images: Image tensor with four dimensions, slices along the first.
    labels: Label tensor with one entry per slice.
    n: Number of copies made of each positive slice.

  Returns:
    The augmented ``(images, labels)``, still aligned element by element.
    The inputs are returned unchanged when there is nothing to copy.
  """
  positive_idx = (labels == True).nonzero(as_tuple=True)[0]
  rep_imgs = images[positive_idx].repeat(n, 1, 1, 1)
  n_copies = rep_imgs.shape[0]
  if n_copies == 0:
    # No positives (or n == 0); also avoids randint(0, 0) on empty input.
    return images, labels

  # Create the new labels on the same device as the originals so the
  # concatenation below also works for GPU tensors.
  rep_labels = torch.ones(n_copies, dtype=labels.dtype, device=labels.device)

  # Draw a random insertion position for every copy; sorting the combined
  # position keys yields the permutation that interleaves the copies.
  nimg = images.shape[0]
  rnd_pos = torch.randint(0, nimg, (n_copies,))
  # Compute the permutation once and reuse it, so images and labels are
  # guaranteed to be reordered identically.
  order = torch.argsort(torch.cat((torch.arange(nimg), rnd_pos)))

  images = torch.cat((images, rep_imgs), dim=0)[order.to(images.device)]
  labels = torch.cat((labels, rep_labels), dim=0)[order.to(labels.device)]

  return images, labels

def remove_false_images(images, labels, ratio):
    """
    Randomly discard a fraction of the slices labelled 0.

    Args:
      images: Image tensor, slices along the first dimension.
      labels: Label tensor with one entry per slice.
      ratio: Fraction of the label-0 slices to discard.

    Returns:
      The ``(images, labels)`` pair without the discarded slices.
    """
    negatives = torch.where(labels == 0)[0]
    n_drop = int(len(negatives) * ratio)
    # Pick n_drop of the negative positions uniformly at random.
    dropped = negatives[torch.randperm(len(negatives))[:n_drop]]

    keep = torch.ones(len(images), dtype=torch.bool)
    keep[dropped] = False
    return images[keep], labels[keep]

def remove_positive_images(images, labels, ratio):
    """
    Randomly discard a fraction of the slices labelled 1.

    Args:
      images: Image tensor, slices along the first dimension.
      labels: Label tensor with one entry per slice.
      ratio: Fraction of the label-1 slices to discard.

    Returns:
      The ``(images, labels)`` pair without the discarded slices.
    """
    positives = torch.where(labels == 1)[0]
    n_drop = int(len(positives) * ratio)
    # Pick n_drop of the positive positions uniformly at random.
    dropped = positives[torch.randperm(len(positives))[:n_drop]]

    keep = torch.ones(len(images), dtype=torch.bool)
    keep[dropped] = False
    return images[keep], labels[keep]

## 其他

In [None]:

def count_zero(images):
  """Return how many entries of ``images`` contain no non-zero element."""
  return sum(1 for image in images if not torch.any(image))

def cropping(image, size=512, pad_value=0):
  """Center-crop or pad dimensions 2 and 3 of ``image`` to ``size``.

  Args:
    image: Tensor with at least four dimensions; only dimensions 2 and 3
      are resized.
    size: Target extent of both dimensions (512 by default).
    pad_value: Fill value used when an extent is smaller than ``size``.
      NOTE(review): 0 matches the zero-initialised image buffer used by the
      loader; confirm it is a sensible background for the scan intensities.

  Returns:
    A tensor whose dimensions 2 and 3 both equal ``size``. The input object
    itself is returned when nothing has to change.
  """
  for dim in (2, 3):
    extent = image.shape[dim]
    if extent > size:
      # Keep the central window.
      start = (extent - size) // 2
      image = image.narrow(dim, start, size)
    elif extent < size:
      # A plain slice would use a negative start here and return the wrong
      # region, so pad symmetrically (extra element at the end) instead.
      missing = size - extent
      before = missing // 2
      after = missing - before
      # F.pad lists the padding for the last dimension first.
      pad = (before, after, 0, 0) if dim == 3 else (0, 0, before, after)
      image = torch.nn.functional.pad(image, pad, value=pad_value)
  return image




def custom_sort_key(val):
    """Build a sort key for ids of the form ``<scan>`` or ``<scan>_<number>``.

    Ids with exactly one underscore followed by digits sort numerically by
    that number; every other id gets -1 so that it precedes them.
    """
    head, *tail = val.split('_')
    if len(tail) == 1 and tail[0].isdigit():
        return (head, int(tail[0]))
    return (head, -1)


def isgpu():
    """Return the CUDA device when one is available, otherwise the CPU device."""
    if not torch.cuda.is_available():
        return torch.device("cpu")
    print("GPU is available")
    return torch.device("cuda")


In [None]:
# Device used by the training, loading and evaluation cells below.
device = isgpu()

# 掛載


In [None]:
# Mount Google Drive; labelpath and the default save_model root point below
# /content/drive.
from google.colab import drive
drive.mount('/content/drive/',)

In [None]:
!pwd

# 資料下載

In [None]:
!gdown https://drive.google.com/drive/folders/1C7HXpHMw1Alvwif9hO97FUzfn4rhxG8B -O Train_Valid_Image --folder --remaining-ok

# 資料處理

In [None]:
# NOTE(review): this counts files in 'TrainValid_Image/train_data', while the
# download cell writes to 'Train_Valid_Image' and datadir below uses that
# name too -- confirm which directory is intended. The value is not displayed
# because the expression is not the last statement of the cell.
len(os.listdir('TrainValid_Image/train_data'))
# Globals read by process_data2: scan directory and ground-truth CSV.
datadir ="Train_Valid_Image"
labelpath = '/content/drive/MyDrive/AOCR2024/TrainValid_ground_truth.csv'

In [None]:
# Ground-truth table indexed by id.
dflabel = read_label(labelpath)
# Load the first 6 files of datadir without dropping any slice.
images, labels, info =  process_data2(termi=6,shift=0,dropFalse=0,dropPos=0)

In [None]:
# %reset_selective images

In [None]:
# %reset

In [None]:
model_name = "tf_efficientnetv2_m"

# Number of output classes: a single logit for the binary decision.
num_classes = 1

# Let timm build the single-channel stem and the 1-unit classifier head.
# Compared with replacing conv_stem by hand this keeps the stem type used by
# the architecture and adapts the pretrained RGB stem weights to one input
# channel instead of re-initialising the layer, and it avoids hard-coding the
# stem width (24).
# NOTE(review): parameter names and shapes match the hand-patched model, but
# the stem's padding behaviour may differ; re-validate checkpoints trained
# with the previous construction before reusing them.
pretrained_model = timm.create_model(
    model_name,
    pretrained=True,
    in_chans=1,
    num_classes=num_classes,
)

# Append a sigmoid so the model outputs probabilities (paired with BCELoss
# during training and the 0.5 threshold during evaluation).
pretrained_model = nn.Sequential(
    pretrained_model,
    nn.Sigmoid()
)

# Inspect the model structure
# print(pretrained_model)

# 檢查模型結構
# print(pretrained_model)


# 訓練 (EfficientNetV2_m)

In [None]:
class CustomDataset(Dataset):
    """Dataset that pairs ``images[i]`` with ``labels[i]``."""

    def __init__(self, images, labels):
        # Both containers are stored as given; no copy is made.
        self.images, self.labels = images, labels

    def __len__(self):
        """Return the number of images."""
        return len(self.images)

    def __getitem__(self, idx):
        """Return the ``(image, label)`` pair stored at ``idx``."""
        sample = self.images[idx]
        target = self.labels[idx]
        return sample, target

In [None]:
num_epochs = 10
batch_size = 8
lr = 0.01
# Number of full batches. Informational only: the DataLoader below also
# yields the final, smaller batch.
num_batches = len(images) // batch_size

# Set up the model, loss function and optimizer. The model is moved to the
# device before the optimizer collects its parameters.
model = pretrained_model.to(device)
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr)
running_loss_list = []


dataset = CustomDataset(images, labels)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for batch_images, batch_labels in dataloader:
        batch_images, batch_labels = batch_images.to(device), batch_labels.to(device)

        optimizer.zero_grad()
        # Flatten (B, 1) to (B,). A bare squeeze() would collapse a final
        # batch of size 1 to a 0-dim tensor, which no longer matches the
        # label shape expected by BCELoss.
        outputs = model(batch_images).reshape(-1)
        loss = criterion(outputs, batch_labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Mean loss over the batches of this epoch.
    running_loss_list.append(running_loss/len(dataloader))
    print(f"Epoch {epoch+1}, Loss: {running_loss/len(dataloader)}")

## Loss圖

In [None]:
import matplotlib.pyplot as plt
# Plot the mean loss per epoch; the x axis is cut to the number of epochs
# actually recorded, so an interrupted run can still be plotted.
plt.plot(np.arange(1,num_epochs+1)[:len(running_loss_list)], running_loss_list)
plt.title('Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')

# 儲存模型參數



In [None]:
# Save the weights; the hyper-parameters go to a JSON file next to them.
filename = save_model(model)

params = {
    'num_epochs': num_epochs,
    'batch_size': batch_size,
    'learning_rate': lr,
}

# save_model returns None when it could not determine a target path, in
# which case there is no base name for the JSON file.
if filename is None:
    print("model was not saved; hyper-parameters not written.")
elif os.path.isfile(filename+'.json'):
    print(f"{filename}.json exist.")
else:
    with open(f'{filename}.json', 'w') as f:
        json.dump(params, f)

# 讀取模型參數

In [None]:
# model = pretrained_model
# model = model.to(device)

In [None]:
import json
# Path of the saved model without extension, as returned by save_model.
filename = input("請輸入要獲取模型路徑:")

if not os.path.isfile(filename+'.pth'):
    print(f"{filename}.pth not exist.")
else:
    # map_location lets weights saved on a GPU be restored on a CPU-only
    # runtime (and vice versa).
    print(model.load_state_dict(torch.load(filename+'.pth', map_location=device)))

# Hyper-parameters written next to the weights by the saving cell.
with open(f'{filename}.json', 'r') as f:
    params = json.load(f)


# 評估


In [None]:
# NOTE(review): read_data is not defined in the visible part of this
# notebook (process_data2 is the loader defined above) -- confirm this cell
# is still meant to run.
data = read_data("TrainValid_Image/train_data",termi=80,shift=80)
# data = read_data("Test1_Image/test_data",termi=3,shift=1)
dflabel = read_label("TrainValid_ground_truth.csv")

In [None]:
# NOTE(review): process_data is not defined in the visible part of this
# notebook; the later cells expect info to hold (scan id, number of cuts).
images,labels,info = process_data(data, dflabel)

In [None]:
# 評估設置
num_epochs = params['num_epochs']
batch_size = params['batch_size']

#最後data不滿一個batch
num_batches = len(images) // batch_size
if len(images) % batch_size != 0:
  num_batches += 1

In [None]:
model.eval()

# Collect the per-batch predictions and concatenate once at the end instead
# of growing a tensor inside the loop.
batch_predictions = []
with torch.no_grad():  # no gradient tracking during evaluation
    # Step over the images directly so the loop does not depend on a
    # num_batches value computed in another cell.
    for start in range(0, len(images), batch_size):
        batch_images = images[start:start+batch_size].to(device)

        outputs = model(batch_images)
        # reshape(-1) keeps a 1-D result even for a batch of size 1.
        predicted = (outputs.reshape(-1) > 0.5).float()
        batch_predictions.append(predicted)

# Float tensor of 0/1 predictions on `device`, one entry per image.
if batch_predictions:
    predict_list = torch.cat(batch_predictions, 0)
else:
    predict_list = torch.tensor([]).to(device)

predict_listq = predict_list.cpu()



In [None]:
# Cut-level metrics; labels is sliced to the number of predictions made.
print_results(predict_listq, labels[:len(predict_list)])

# 輸出至submission.csv

In [None]:
def cont(l):
  """Return True when the 1s in ``l`` form one contiguous run (or are absent).

  False is returned as soon as a 1 shows up after a 0 that itself followed
  an earlier 1. Values other than 0 and 1 are ignored.
  """
  seen_one = False
  gap_after_one = False
  for value in l:
    if value == 1:
      if gap_after_one:
        return False
      seen_one = True
    elif value == 0 and seen_one:
      gap_after_one = True
  return True

def yes(predict, min_positive=3):
  """Return the scan-level label derived from per-cut predictions.

  Args:
    predict: Iterable of per-cut predictions (0 or 1).
    min_positive: Minimum number of positive cuts for the scan to be
      labelled positive. Defaults to 3.

  Returns:
    1 when the predictions sum to at least ``min_positive``, otherwise 0.
  """
  return int(sum(predict) >= min_positive)



predict_list = predict_listq.numpy()

## Rebuild the ids and their labels. The predictions are assumed to be
## ordered scan by scan as listed in info, each scan contributing nslice
## consecutive cuts.
output = {}
offset = 0  # index of the first prediction of the current scan
for scan_id, nslice in info:
  if offset >= len(predict_list):
    # No predictions left for this and the following scans.
    break
  scan_pred = predict_list[offset:offset+nslice]

  for cut, value in enumerate(scan_pred):
    output[f'{scan_id}_{cut}'] = int(value)
  output[scan_id] = yes(scan_pred)  # scan-level decision

  offset += nslice

output = list(output.items())
# Passing the column names to the constructor also works when output is empty.
dfout = pd.DataFrame(output, columns=['id', 'label'])
dfout = dfout.sort_values(by='id', key=lambda x: x.map(custom_sort_key))
filename = input("輸入提交檔名(enter for submission)")
if filename == '':
  filename = 'submission'
dfout.to_csv(filename+'.csv', index=False)

In [None]:
info

In [None]:
dfout

# 查看submission.csv

In [None]:
!pwd

In [None]:
# Load a previously written submission together with the ground truth.
# NOTE(review): 'fisrt_80.csv' looks like a misspelling of 'first_80.csv';
# it must match the name the file was actually saved under.
dftest = read_submission('fisrt_80.csv')
# dftest = read_submission('submission.csv')
dflabel = read_label("TrainValid_ground_truth.csv")

In [None]:
len(dftest)

In [None]:
import re


# 抓出scan-level data
pattern = re.compile(r'.*_[0-9]+$')  # 正則表達式匹配 "_數字" 結尾
mask = ~dftest.index.str.match(pattern)
scan_guess = np.array(dftest[mask]['label'])
scan_truth = np.array(dflabel.loc[dftest[mask]['label'].index]['label'])

mask = ~mask
cut_guess = np.array(dftest[mask]['label'])
cut_truth = np.array(dflabel.loc[dftest[mask]['label'].index]['label'])

In [None]:
# Report the metrics on scan level, then on cut level
ss = 10
banner = "=" * ss
print(banner + "F1 score on scan level" + banner)
print_results(scan_guess, scan_truth)
print()
print()
print(banner + "F1 score on cut level" + banner)
print_results(cut_guess, cut_truth)

# 其他指令
不在工作流
當參考

In [None]:
# Reference cell: open every file in datadir with nibabel, one per second,
# printing its name. nib.load is called without reading the voxel data.
import time
datadir = 'Train_Valid_Image'
for idx, afile in enumerate(os.listdir(datadir)):
  file_path = os.path.join(datadir, afile)
  time.sleep(1)
  nii_file =  nib.load(file_path)
  print(f"{idx}: read {afile}")

In [None]:
# Reference cell: show the last three cut suffixes of the submission ids.
pattern = re.compile(r'.*_[0-9]+$')  # regular expression matching ids that end in "_<digits>"
mask = dftest.index.to_series().str.match(pattern)
dflabel_ = dftest[mask]
# NOTE(review): the suffixes are strings here, so sort_values orders them
# lexicographically rather than numerically.
dflabel_.index.map(lambda x : x.split('_')[1]).sort_values()[-3:]

In [None]:
import random

# Suppose this is your list
my_list = np.array([1, 2, 3, 4])

# Boolean-mask indexing keeps the entries where the mask is True.
mask = [True, False, False, True]
my_list[mask]

In [None]:
# Reference cell: count the label rows belonging to files 10..19 of datadir.
datadir ="TrainValid_Image/train_data"
labelpath = 'TrainValid_ground_truth.csv'
labels_ = read_label(labelpath)


filelist = os.listdir(datadir)
filelist = filelist[10:20]
# NOTE(review): str.strip('.nii') removes any of the characters '.', 'n', 'i'
# from both ends of the name rather than the '.nii' suffix; a name ending in
# '.nii.gz' is left unchanged by it -- confirm the file extension used here.
selecteds = [labels_.loc[labels_.index.str.startswith(afile.strip('.nii')+'_')] for afile in filelist]
scans_info = []
# number of label rows per file, and their total
print("enter")
numcuts = [len(selected) for selected in selecteds]
numcut = sum(numcuts)

In [None]:
# Number of label rows selected for each file.
numcuts = [len(selected) for selected in selecteds]

In [None]:
# Reference cell: open a single scan with nibabel.
img = nib.load('Train_Valid_Image/Zx00AD16F8B97A53DE6E7CFE260BDF122F0E655659A3DF1628.nii.gz')


In [None]:
# Flush pending writes and unmount Google Drive.
drive.flush_and_unmount()

In [None]:
# Toy data for trying out the removal helpers: five random 1x2x2 images with
# values in {0, 1, 2}, all labelled 1.
t1 = torch.randint(3,(5,1,2,2))
lab = torch.tensor([1,1,1,1,1])
t1

In [None]:
# With 5 positives and ratio 0.79, int(5 * 0.79) = 3 images are removed.
remove_positive_images(t1,lab,0.79)