----
----
# <b> DLMI Challenge </b>
# <b> MILSVM training </b>
# <b> Matteo MARENGO | matteo.marengo@ens-paris-saclay.fr </b>
# <b> Manal MEFTAH | manal.meftah@ens-paris-saclay.fr </b>

----
----
# <b> Import libraries </b>

In [1]:
import os
import pandas as pd
import torch
import torchvision
from PIL import Image
import matplotlib.pyplot as plt
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
import random
import numpy as np
from sklearn.decomposition import PCA
import csv


In [2]:
# Run on the GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

csvfile = "/kaggle/input/dlmi-competition/trainset/trainset_true.csv"
df_raw = pd.read_csv(csvfile)

# Work on a copy: `df = df_raw` would only alias the frame, so the manual
# correction below would also rewrite `df_raw` and the raw data would be lost.
df = df_raw.copy()

# Manual correction of one record: row position 110, column position 2
# (the column DLMI_data_svm reads as "gender").
# NOTE(review): presumably normalises an inconsistent gender entry — confirm
# against the raw CSV and consider selecting the row by patient ID instead.
df.iloc[110, 2] = "F"



In [3]:
# Hold out 20% of the patients for validation; the fixed random_state keeps
# the split identical across runs.
df_train, df_val = train_test_split(
    df,
    test_size=0.2,
    random_state=50,
)


In [6]:

import random

# Seed NumPy, PyTorch and Python's own RNG with the same value so that
# anything stochastic further down is repeatable.
SEED = 50
np.random.seed(SEED)
torch.manual_seed(SEED)
random.seed(SEED)


----
----
# <b> Load the model </b>

In [7]:
# Pretrained ResNet-50 used purely as a frozen feature extractor.
pretrained_net = torchvision.models.resnet50(pretrained=True)

# Keep every child module except the last one (the classification layer),
# so the network outputs pooled features instead of class scores.
feature_layers = list(pretrained_net.children())[:-1]
model = torch.nn.Sequential(*feature_layers)
model.to(device)

# No fine-tuning: switch off gradients for every parameter and put the
# network in inference mode (this also displays the architecture).
model.requires_grad_(False)
model.eval()


Downloading: "https://download.pytorch.org/models/resnet50-0676ba61.pth" to /root/.cache/torch/hub/checkpoints/resnet50-0676ba61.pth
100%|██████████| 97.8M/97.8M [00:00<00:00, 145MB/s] 


Sequential(
  (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU(inplace=True)
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)


In [8]:

class DLMI_data_svm(Dataset):
    """Bag-level dataset: one item is every image of one patient plus metadata.

    Columns of ``dataframe`` are read by position:
    0 = patient folder name, 1 = label, 2 = gender,
    3 = date of birth (a string whose last four characters are the year),
    4 = lymphocyte count.

    Args:
        dataframe: table with one row per patient, laid out as above.
        root_dir: directory that contains one sub-folder per patient.
        transforms: optional callable applied to each PIL image.
    """

    def __init__(self, dataframe, root_dir, transforms = None):
        super().__init__()
        self.df = dataframe
        self.image_dir = root_dir
        self.transforms = transforms
        self.labels_list = list(self.df.iloc[:, 1])

    def __len__(self):
        """Return the number of patients (bags)."""
        return len(self.labels_list)

    def __getitem__(self, index):
        """Load the bag of images and the metadata of patient ``index``.

        Returns:
            (bag_list, label): ``bag_list`` holds the (optionally transformed)
            images in sorted file-name order; ``label`` is a dict with keys
            "name", "nb", "label", "gender", "DOB" and "LYMPH_COUNT".

        Raises:
            FileNotFoundError: if the patient's folder does not exist.
        """
        name = self.df.iloc[index, 0]
        gender = self.df.iloc[index, 2]
        DOB = int(self.df.iloc[index, 3][-4:])  # keep only the birth year

        bag_dir = os.path.join(self.image_dir, name)
        # os.scandir raises FileNotFoundError for a missing folder, whereas
        # next(os.walk(...)) raised StopIteration, which plain iteration over
        # the dataset would swallow as "end of data". Sorting makes the
        # instance order inside a bag reproducible across file systems.
        with os.scandir(bag_dir) as entries:
            files = sorted(entry.name for entry in entries if not entry.is_dir())

        bag_list = []
        for file in files:
            # Copy the pixels out so the file handle is released even when no
            # transform forces the image to be loaded.
            with Image.open(os.path.join(bag_dir, file)) as handle:
                image = handle.copy()

            if self.transforms is not None:
                image = self.transforms(image)

            bag_list.append(image)

        # Kept for backward compatibility: the most recent bag stays readable
        # through the instance attribute as before.
        self.bag_list = bag_list

        label = {
            "name": name,
            "nb": len(files),
            # x*2-1 — assumes labels are stored as 0/1 and maps them to -1/+1;
            # TODO confirm against the label column.
            "label": self.labels_list[index] * 2 - 1,
            "gender": gender,
            "DOB": DOB,
            "LYMPH_COUNT": float(self.df.iloc[index, 4]),
        }

        return bag_list, label


In [None]:
# Deterministic preprocessing (no augmentation): convert to a tensor, then
# normalise each channel with the mean/std used by torchvision's
# ImageNet-pretrained models.
train_data_transforms = torchvision.transforms.Compose(
    [
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
def extract_features(model, dataset, device=None):
    """Run every bag of ``dataset`` through ``model`` and collect the features.

    Args:
        model: torch module mapping a stacked bag ``(n_images, ...)`` to one
            feature tensor per image. The caller chooses train/eval mode.
        dataset: indexable object with ``len()``; each item is
            ``(list_of_image_tensors, info_dict)`` with a ``"label"`` key.
        device: device the bag is moved to before the forward pass. Defaults
            to the device of the model's first parameter (CPU if the model
            has no parameters).

    Returns:
        (X, Y, patient): ``X`` is a list of ``(n_images, n_features)`` NumPy
        arrays, ``Y`` the matching list of bag labels and ``patient`` the list
        of info dicts, all in dataset order.

    Raises:
        RuntimeError: from ``torch.stack`` when a bag contains no image.
    """
    if device is None:
        first_param = next(iter(model.parameters()), None)
        device = first_param.device if first_param is not None else torch.device("cpu")

    X, Y, patient = [], [], []
    # Inference only: without no_grad a model with trainable parameters would
    # build an autograd graph and `.numpy()` would refuse the output tensor.
    with torch.no_grad():
        for i in range(len(dataset)):
            image_bag, labels = dataset[i]
            features = model(torch.stack(image_bag).to(device))
            # Flatten everything after the image dimension into one vector.
            features = features.reshape(features.size(0), -1)
            X.append(features.cpu().numpy())
            Y.append(labels["label"])
            patient.append(labels)
    return X, Y, patient


----
----
# <b> Do the training </b>

In [9]:

# Both splits read their images from the same root folder and share the same
# deterministic transform.
trainset_root = "/kaggle/input/dlmi-competition/trainset/"
dataset_svm_train = DLMI_data_svm(df_train, trainset_root, train_data_transforms)
dataset_svm_val = DLMI_data_svm(df_val, trainset_root, train_data_transforms)

In [11]:
# One feature matrix per patient bag (X), its bag label (Y) and the patient
# metadata dict (P), for the training and the validation split.
X_train, Y_train, P_train = extract_features(model, dataset_svm_train)
X_val, Y_val, P_val = extract_features(model, dataset_svm_val)


In [16]:

!git clone https://github.com/garydoranjr/misvm
%cd misvm
!python setup.py install


fatal: destination path 'misvm' already exists and is not an empty directory.
/kaggle/working/misvm
!!

        ********************************************************************************
        Please avoid running ``setup.py`` directly.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://blog.ganssle.io/articles/2021/10/setup-py-deprecated.html for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ********************************************************************************
        Please avoid running ``setup.py`` and ``easy_install``.
        Instead, use pypa/build, pypa/installer or other
        standards-based tools.

        See https://github.com/pypa/setuptools/issues/917 for details.
        ********************************************************************************

!!
  self.initialize_options()
!!

        ****

In [17]:
pip install "cvxopt==1.2.4"

Collecting cvxopt==1.2.4
  Using cached cvxopt-1.2.4.tar.gz (6.7 MB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: cvxopt
  Building wheel for cvxopt (setup.py) ... [?25lerror
  [1;31merror[0m: [1msubprocess-exited-with-error[0m
  
  [31m×[0m [32mpython setup.py bdist_wheel[0m did not run successfully.
  [31m│[0m exit code: [1;36m1[0m
  [31m╰─>[0m [31m[33 lines of output][0m
  [31m   [0m !!
  [31m   [0m 
  [31m   [0m         ********************************************************************************
  [31m   [0m         Usage of dash-separated 'description-file' will not be supported in future
  [31m   [0m         versions. Please use the underscore name 'description_file' instead.
  [31m   [0m 
  [31m   [0m         By 2024-Sep-26, you need to update your project and remove deprecated calls
  [31m   [0m         or your builds will no longer be supported.
  [31m   [0m 
  [31m   [0m         See https

In [18]:
import misvm

# Linear-kernel MI-SVM trained on the per-patient bags of ResNet features,
# then scored on the validation bags.
misvm_params = {"kernel": "linear", "C": 80, "max_iters": 150}
MILSVM = misvm.MISVM(**misvm_params)
MILSVM.fit(X_train, Y_train)
pred_y = MILSVM.predict(X_val)


Non-random start...

Iteration 1...
Training SVM...
     pcost       dcost       gap    pres   dres
 0: -7.3332e+01 -1.2271e+01  8e+03  1e+02  4e-12
 1: -5.8909e+00 -1.0752e+01  2e+02  3e+00  4e-12
 2: -3.2462e+00 -8.1747e+00  2e+01  2e-01  5e-13
 3: -2.9257e+00 -5.2051e+00  6e+00  5e-02  2e-13
 4: -2.8734e+00 -4.0956e+00  2e+00  2e-02  2e-13
 5: -2.8984e+00 -3.3483e+00  5e-01  3e-03  2e-13
 6: -2.9375e+00 -3.1095e+00  2e-01  4e-04  2e-13
 7: -2.9580e+00 -3.0232e+00  7e-02  1e-15  2e-13
 8: -2.9678e+00 -2.9879e+00  2e-02  2e-15  3e-13
 9: -2.9721e+00 -2.9812e+00  9e-03  1e-15  5e-13
10: -2.9742e+00 -2.9771e+00  3e-03  1e-15  7e-13
11: -2.9752e+00 -2.9760e+00  9e-04  1e-15  2e-12
12: -2.9755e+00 -2.9757e+00  2e-04  3e-16  3e-12
13: -2.9757e+00 -2.9757e+00  2e-05  8e-16  4e-12
14: -2.9757e+00 -2.9757e+00  5e-07  9e-17  1e-11
Optimal solution found.
Recomputing classes...
Selector differences: 2143
Updating QP...

Iteration 2...
Training SVM...
     pcost       dcost       gap    pres   d

In [19]:
# Reduce the real-valued bag scores to their sign so they can be compared
# with the -1/+1 validation labels.
pred_y = np.sign(pred_y)
val_balanced_acc = balanced_accuracy_score(Y_val, pred_y)
print("validation balance accuracy:", val_balanced_acc)

validation balance accuracy: 0.7043478260869566


----
----
# <b> Do the testing </b>

In [24]:
# Test split: same dataset class and the same preprocessing as for training.
test_file = "/kaggle/input/dlmi-competition/testset/testset_data.csv"
testset_root = "/kaggle/input/dlmi-competition/testset/"
test_df = pd.read_csv(test_file)
test_svm = DLMI_data_svm(test_df, testset_root, train_data_transforms)

In [26]:
# Per-patient feature bags (X), bag labels (Y) and metadata dicts (P) for the
# test split.
X_test, Y_test, P_test = extract_features(model, test_svm)

In [32]:
# Score the test bags and write the submission file.
pred_y_test = MILSVM.predict(X_test)

# The context manager guarantees the file is flushed and closed; newline=''
# lets the csv module control line endings itself.
with open('/kaggle/working/pred_svm_mil.csv', 'w', newline='') as submission_file:
    myCsv = csv.writer(submission_file)
    myCsv.writerow(["ID", "Predicted"])
    # P_test already holds each patient's metadata in the same order as the
    # predictions, so the names are read from it instead of re-indexing
    # test_svm, which would reload every image just to get the name.
    for patient_info, score in zip(P_test, pred_y_test):
        # sign in {-1, 0, +1} -> {0, 0, 1}: only a strictly positive score
        # is written as class 1.
        myCsv.writerow([patient_info["name"], int((np.sign(score) + 1) / 2)])
