# Hugging Face

## Preparation

The goal here is to generate a CSV file for a Hugging Face AutoTrain project.

In [4]:
import os
import pandas as pd
from pathlib import Path

In [5]:
# All paths are resolved relative to the parent of the current working directory.
wd = os.path.join(os.getcwd(), '..')

# Input image folders.
data = os.path.join(wd, 'data', 'input')
image_train_path = os.path.join(data, 'Train')
image_test_path = os.path.join(data, 'Test')

# Output files, all located under <wd>/data/output.
output_dir = os.path.join(wd, 'data', 'output')
csv_output_path = os.path.join(output_dir, 'huggingface_autotrain', 'histopathological_image_classification.csv')
submission_path = os.path.join(output_dir, 'submission', 'pred_swim_20230305.csv')
error_analysis_path = os.path.join(output_dir, 'error_analysis', 'pred_swim_20230305.csv')

# Training image filenames: keep only directory entries whose suffix is exactly '.png'.
images = [
    entry
    for entry in os.listdir(image_train_path)
    if Path(entry).suffix == '.png'
]

In [6]:
def parse_fn(filename):
    """Split a training-image filename into its individual fields.

    Expected pattern:
    <BIOPSY_PROCEDURE>_<TUMOR_CLASS>_<TUMOR_TYPE>-<YEAR>-<SLIDE_ID>-<MAG>-<SEQ>.<ext>

    Args:
        filename: bare file name (no directory part), e.g.
            'SOB_B_A-14-22549AB-100-001.png'.

    Returns:
        list of str: the fields obtained by splitting the extension-less name
        on both '-' and '_', followed by the unmodified ``filename``.
    """
    # Drop the extension whatever its length; a fixed [:-4] slice is only
    # correct for three-letter extensions such as '.png'.
    stem, _ext = os.path.splitext(filename)
    parsed = stem.replace('-', '_').split('_')
    parsed.append(filename)
    return parsed

# One column per filename field, plus the raw filename.
# NOTE(review): per the filename pattern the 4th field is <YEAR>, yet it is
# labelled 'magnification' here while 'mag' holds the magnification - confirm
# nothing depends on the name before renaming it.
columns = ['procedure', 'class', 'type', 'magnification', 'slide', 'mag', 'seq', 'filename']

# Parse every training filename into one row.
df = pd.DataFrame([parse_fn(image) for image in images], columns=columns)

# Distinct tumor-type abbreviations found in the training set.
label_types = df['type'].unique()
print(label_types)
print(df.head())

['A' 'F' 'PT' 'TA' 'DC' 'LC' 'MC' 'PC']
  procedure class type magnification    slide  mag  seq  \
0       SOB     B    A            14  22549AB  100  001   
1       SOB     B    A            14  22549AB  100  002   
2       SOB     B    A            14  22549AB  100  003   
3       SOB     B    A            14  22549AB  100  005   
4       SOB     B    A            14  22549AB  100  006   

                         filename  
0  SOB_B_A-14-22549AB-100-001.png  
1  SOB_B_A-14-22549AB-100-002.png  
2  SOB_B_A-14-22549AB-100-003.png  
3  SOB_B_A-14-22549AB-100-005.png  
4  SOB_B_A-14-22549AB-100-006.png  


In [7]:
# Class id (kept as a string) for each tumor-type abbreviation.
label_maps = {
    'F': '1',
    'DC': '2',
    'PC': '3',
    'PT': '4',
    'MC': '5',
    'LC': '6',
    'A': '7',
    'TA': '8',
}

# Plain dict lookup per row: a tumor type missing from label_maps raises KeyError.
df['type_id'] = df['type'].map(lambda tumor_type: label_maps[tumor_type])

print(df.head(10))

  procedure class type magnification    slide  mag  seq  \
0       SOB     B    A            14  22549AB  100  001   
1       SOB     B    A            14  22549AB  100  002   
2       SOB     B    A            14  22549AB  100  003   
3       SOB     B    A            14  22549AB  100  005   
4       SOB     B    A            14  22549AB  100  006   
5       SOB     B    A            14  22549AB  100  013   
6       SOB     B    A            14  22549AB  100  014   
7       SOB     B    A            14  22549AB  100  015   
8       SOB     B    A            14  22549AB  100  018   
9       SOB     B    A            14  22549AB  100  019   

                         filename type_id  
0  SOB_B_A-14-22549AB-100-001.png       7  
1  SOB_B_A-14-22549AB-100-002.png       7  
2  SOB_B_A-14-22549AB-100-003.png       7  
3  SOB_B_A-14-22549AB-100-005.png       7  
4  SOB_B_A-14-22549AB-100-006.png       7  
5  SOB_B_A-14-22549AB-100-013.png       7  
6  SOB_B_A-14-22549AB-100-014.png       7 

In [8]:
# Two-column frame for the exported CSV: image file name and class label.
hf = df[['filename', 'type_id']].rename(
    columns={'filename': 'image_relpath', 'type_id': 'label'}
)
hf.head(3)

Unnamed: 0,image_relpath,label
0,SOB_B_A-14-22549AB-100-001.png,7
1,SOB_B_A-14-22549AB-100-002.png,7
2,SOB_B_A-14-22549AB-100-003.png,7


In [6]:
# Create the destination folder first: to_csv does not create missing directories.
os.makedirs(os.path.dirname(csv_output_path), exist_ok=True)
hf.to_csv(csv_output_path, index=False)

The CSV can be used in a Hugging Face AutoTrain project.\
The results, if accurate enough, can be used as a baseline for the rest of the project.

Let me try it now...

The results are very good, with an accuracy of over 90%!

## Model api

In [9]:
from transformers import AutoFeatureExtractor, AutoModelForImageClassification

# SECURITY: never embed the Hugging Face access token in the notebook. It is
# read from the environment instead; any token that was previously committed
# in this cell must be considered leaked and should be revoked.
# When neither variable is set, None is passed and the library's own
# credential lookup (if any) applies - TODO confirm for the installed version.
access_token = os.environ.get('HF_TOKEN') or os.environ.get('HUGGING_FACE_HUB_TOKEN')

# AutoTrain checkpoints produced for this project.
MODEL_ID_BASE = "JoffreyMa/autotrain-histopathological_image_classification-3393093038"
MODEL_ID_VIT = "JoffreyMa/autotrain-histopathological_image_classification-3393093036"  # ViT alternative

# Only one checkpoint is loaded. The ViT alternative is the one every later
# cell ends up using; switch to MODEL_ID_BASE to evaluate the other checkpoint.
model_id = MODEL_ID_VIT

extractor = AutoFeatureExtractor.from_pretrained(model_id, use_auth_token=access_token)
model = AutoModelForImageClassification.from_pretrained(model_id, use_auth_token=access_token)



Downloading (…)rocessor_config.json:   0%|          | 0.00/325 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading (…)lve/main/config.json:   0%|          | 0.00/903 [00:00<?, ?B/s]



Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/343M [00:00<?, ?B/s]

In [10]:
import os
import pandas as pd
import torch
from torchvision.io import read_image
from torch.utils.data import Dataset

class HistoDataset(Dataset):
    """Dataset yielding ``(image, filename)`` pairs for the entries of a folder.

    The folder is listed once, at construction time, and that snapshot backs
    both ``__len__`` and ``__getitem__``. Indices therefore stay stable for
    the lifetime of the object and the directory is not re-scanned on every
    access. The entries keep the order returned by ``os.listdir``.
    """

    def __init__(self, img_dir):
        """
        Args:
            img_dir: path of the folder containing the image files.
        """
        self.img_dir = img_dir
        # Snapshot of the directory listing (unsorted, unfiltered).
        self.filenames = os.listdir(path=self.img_dir)

    def __len__(self):
        """Return the number of entries found in ``img_dir`` at construction."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Load the entry at position ``idx``.

        Returns:
            tuple: ``(image, filename)`` where ``image`` is whatever
            ``torchvision.io.read_image`` returns for the file.

        Raises:
            IndexError: if ``idx`` is out of range; this is also what ends a
                plain ``for sample in dataset`` loop.
        """
        filename = self.filenames[idx]
        img_path = os.path.join(self.img_dir, filename)
        image = read_image(img_path)
        return image, filename

In [11]:
test_data = HistoDataset(image_test_path)

# Peek at the first sample: shape of the image tensor and its filename.
first_image, first_filename = test_data[0]
first_image.shape, first_filename

(torch.Size([3, 460, 700]), 'SOB_1.png')

In [12]:
# Preprocess the first test image with the feature extractor, asking for
# PyTorch tensors ("pt").
sample_image = test_data[0][0]
inputs = extractor(sample_image, return_tensors="pt")
inputs

{'pixel_values': tensor([[[[ 0.5922,  0.5529,  0.4824,  ...,  0.3490,  0.3333,  0.2706],
          [ 0.5922,  0.5608,  0.4824,  ...,  0.2784,  0.1373,  0.0118],
          [ 0.5922,  0.5765,  0.4902,  ...,  0.1373, -0.0275, -0.0902],
          ...,
          [ 0.4353,  0.2471,  0.2235,  ...,  0.5922,  0.5922,  0.5765],
          [ 0.4745,  0.3569,  0.3255,  ...,  0.5451,  0.5373,  0.5059],
          [ 0.5373,  0.4275,  0.4118,  ...,  0.5216,  0.5294,  0.4824]],

         [[ 0.6549,  0.5294,  0.3412,  ...,  0.2784,  0.2863,  0.2549],
          [ 0.6706,  0.5608,  0.3725,  ...,  0.1843,  0.0980, -0.0039],
          [ 0.6706,  0.5922,  0.4275,  ...,  0.0510, -0.0667, -0.0980],
          ...,
          [ 0.4588,  0.2000,  0.1608,  ...,  0.5608,  0.5608,  0.5529],
          [ 0.4902,  0.3020,  0.2706,  ...,  0.3961,  0.3882,  0.3804],
          [ 0.5216,  0.3569,  0.3569,  ...,  0.2784,  0.2784,  0.2706]],

         [[ 0.6078,  0.5686,  0.5059,  ...,  0.4196,  0.4431,  0.3961],
          [ 0

In [13]:
# Forward pass with gradient tracking disabled (inference only).
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits

In [14]:
# Index of the highest-scoring class along the last axis, as a Python int.
predicted_label = logits.argmax(dim=-1).item()
predicted_label

6

In [15]:
# Translate the class index into the label stored in the model configuration.
id2label = model.config.id2label
print(id2label[predicted_label])

7


Let's apply the model to the test data.

In [16]:
# Filenames of the test images, in the order returned by os.listdir.
images_test = os.listdir(image_test_path)

# Report how many test images were found.
n_images_test = len(images_test)
print("Number of images for the test set: ", n_images_test)

def parseTest_fn(filename):
    """Split a test-image filename into its individual fields.

    Expected pattern: <BIOPSY_PROCEDURE>_<ID>.<ext>

    Args:
        filename: bare file name (no directory part), e.g. 'SOB_1.png'.

    Returns:
        list of str: the '_'-separated fields of the extension-less name,
        followed by the unmodified ``filename``.
    """
    # Drop the extension whatever its length; a fixed [:-4] slice is only
    # correct for three-letter extensions such as '.png'.
    stem, _ext = os.path.splitext(filename)
    parsed = stem.split('_')
    parsed.append(filename)
    return parsed

# Column names for the two filename fields plus the raw filename.
# Note: this rebinds the `columns` name used earlier for the training frame.
columns = ['procedure', 'id', 'filename']

# Parse every test filename into one row.
test_rows = [parseTest_fn(image) for image in images_test]
df_test = pd.DataFrame(test_rows, columns=columns)
print(df_test.head())
print(df_test.shape)

Number of images for the test set:  207
  procedure   id     filename
0       SOB    1    SOB_1.png
1       SOB   10   SOB_10.png
2       SOB  100  SOB_100.png
3       SOB  101  SOB_101.png
4       SOB  102  SOB_102.png
(207, 3)


In [17]:
# Placeholder value for the prediction column; it is overwritten with the
# model's predictions in a later cell.
df_test['type_id'] = 0

In [18]:
# Sanity check: current type_id of the row for 'SOB_1.png'.
df_test.loc[df_test['filename'] == 'SOB_1.png', 'type_id']

0    0
Name: type_id, dtype: int64

In [19]:
# Predict a label for every test image, keyed by filename.
# df_test and test_data are built from two separate directory listings, so
# relying on both having the same order would silently attach predictions to
# the wrong rows if the orders ever differed. Keying by filename makes the
# alignment explicit.
# The loop variables are named so they do not overwrite `data`, the
# input-folder path defined at the top of the notebook.
pred_by_filename = {}
for image, filename in test_data:
    inputs = extractor(image, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_label = logits.argmax(-1).item()
    pred_by_filename[filename] = model.config.id2label[predicted_label]

# One prediction per df_test row, in df_test's row order. A filename without
# a prediction raises KeyError instead of being misaligned.
types_test = [pred_by_filename[name] for name in df_test['filename']]

In [20]:
# Replace the placeholder with the predicted labels. The list is assigned
# positionally, so it must hold one entry per row of df_test, in row order.
df_test['type_id'] = types_test

In [21]:
# Display the test frame together with its predicted type_id column.
df_test

Unnamed: 0,procedure,id,filename,type_id
0,SOB,1,SOB_1.png,7
1,SOB,10,SOB_10.png,7
2,SOB,100,SOB_100.png,1
3,SOB,101,SOB_101.png,1
4,SOB,102,SOB_102.png,4
...,...,...,...,...
202,SOB,95,SOB_95.png,7
203,SOB,96,SOB_96.png,7
204,SOB,97,SOB_97.png,1
205,SOB,98,SOB_98.png,7


In [22]:
# Keep only the two columns written to the submission file.
df_pred = df_test.loc[:, ['id', 'type_id']]

In [23]:
# Distribution of predicted classes, as a quick sanity check.
display(df_pred['type_id'].value_counts())

# Create the destination folder first: to_csv does not create missing directories.
os.makedirs(os.path.dirname(submission_path), exist_ok=True)
# The submission file is written without header row and without index column.
df_pred.to_csv(submission_path, index=False, header=False)

7    70
2    48
1    28
3    24
4    20
5    17
Name: type_id, dtype: int64

With this submission, I get an F1-score of 0.778882621975 and an accuracy of 0.835748792271.

## Error analysis

Let's run predictions on the entire training dataset and check where mistakes are made.


In [32]:
# Train images
def parse_fn(filename):
    """Split a training-image filename into its individual fields.

    NOTE: a function with the same name is also defined earlier in this
    notebook; this definition replaces it when the cell runs, so keep the
    two in sync.

    Expected pattern:
    <BIOPSY_PROCEDURE>_<TUMOR_CLASS>_<TUMOR_TYPE>-<YEAR>-<SLIDE_ID>-<MAG>-<SEQ>.<ext>

    Args:
        filename: bare file name (no directory part), e.g.
            'SOB_B_A-14-22549AB-100-001.png'.

    Returns:
        list of str: the fields obtained by splitting the extension-less name
        on both '-' and '_', followed by the unmodified ``filename``.
    """
    # Drop the extension whatever its length; a fixed [:-4] slice is only
    # correct for three-letter extensions such as '.png'.
    stem, _ext = os.path.splitext(filename)
    parsed = stem.replace('-', '_').split('_')
    parsed.append(filename)
    return parsed

# One column per filename field, plus the raw filename.
# NOTE(review): per the filename pattern the 4th field is <YEAR>, yet it is
# labelled 'magnification' here while 'mag' holds the magnification - confirm
# nothing depends on the name before renaming it.
columns = ['procedure', 'class', 'type', 'magnification', 'slide', 'mag', 'seq', 'filename']

# Rebuild the training frame from the parsed filenames (one row per image).
df = pd.DataFrame([parse_fn(image) for image in images], columns=columns)

# Distinct tumor-type abbreviations found in the training set.
label_types = df['type'].unique()
print(label_types)
print(df.head())

['A' 'F' 'PT' 'TA' 'DC' 'LC' 'MC' 'PC']
  procedure class type magnification    slide  mag  seq  \
0       SOB     B    A            14  22549AB  100  001   
1       SOB     B    A            14  22549AB  100  002   
2       SOB     B    A            14  22549AB  100  003   
3       SOB     B    A            14  22549AB  100  005   
4       SOB     B    A            14  22549AB  100  006   

                         filename  
0  SOB_B_A-14-22549AB-100-001.png  
1  SOB_B_A-14-22549AB-100-002.png  
2  SOB_B_A-14-22549AB-100-003.png  
3  SOB_B_A-14-22549AB-100-005.png  
4  SOB_B_A-14-22549AB-100-006.png  


In [23]:
# Predict a label for every training image, keyed by filename.
# df is built from the '.png'-filtered `images` list while HistoDataset lists
# the folder on its own, so the two are not guaranteed to have the same
# length or order. Keying by filename makes the alignment explicit.
# The loop variables are named so they do not overwrite `data`, the
# input-folder path defined at the top of the notebook.
train_data = HistoDataset(image_train_path)
pred_by_filename = {}
for image, filename in train_data:
    inputs = extractor(image, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_label = logits.argmax(-1).item()
    pred_by_filename[filename] = model.config.id2label[predicted_label]

# One prediction per df row, in df's row order. A filename without a
# prediction raises KeyError instead of being misaligned.
types_train = [pred_by_filename[name] for name in df['filename']]

In [33]:
# Store the predicted class id of every training image.
# types_train is assigned positionally, so it must hold one prediction per
# row of df, in row order. No placeholder initialisation is needed because
# the column is created by this assignment.
df['type_id_pred'] = types_train

In [34]:
# Preview the first rows, now including the type_id_pred column.
df.head(3)

Unnamed: 0,procedure,class,type,magnification,slide,mag,seq,filename,type_id_pred
0,SOB,B,A,14,22549AB,100,1,SOB_B_A-14-22549AB-100-001.png,7
1,SOB,B,A,14,22549AB,100,2,SOB_B_A-14-22549AB-100-002.png,7
2,SOB,B,A,14,22549AB,100,3,SOB_B_A-14-22549AB-100-003.png,7


In [35]:
# Class id (kept as a string) for each tumor-type abbreviation.
# The same mapping is also defined in the preparation section of this notebook.
label_maps = {
    'F': '1',
    'DC': '2',
    'PC': '3',
    'PT': '4',
    'MC': '5',
    'LC': '6',
    'A': '7',
    'TA': '8',
}

# Ground-truth class id per row, to compare against type_id_pred.
# Plain dict lookup: a tumor type missing from label_maps raises KeyError.
df['type_id'] = df['type'].map(lambda tumor_type: label_maps[tumor_type])

print(df.head(3))

  procedure class type magnification    slide  mag  seq  \
0       SOB     B    A            14  22549AB  100  001   
1       SOB     B    A            14  22549AB  100  002   
2       SOB     B    A            14  22549AB  100  003   

                         filename type_id_pred type_id  
0  SOB_B_A-14-22549AB-100-001.png            7       7  
1  SOB_B_A-14-22549AB-100-002.png            7       7  
2  SOB_B_A-14-22549AB-100-003.png            7       7  


In [37]:
# Create the destination folder first: to_csv does not create missing directories.
os.makedirs(os.path.dirname(error_analysis_path), exist_ok=True)
df.to_csv(error_analysis_path, index=False)