# Import libaries

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
import cv2
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from utils import save_to_csv, save_encoder, download_data

  from .autonotebook import tqdm as notebook_tqdm


# Map data and processing images with CV2

In [2]:
download_data() #Download data and save in specific path for future use
df = pd.read_csv('../Datasets/skin-cancer-mnist-ham10000/HAM10000_metadata.csv')
df.head()

Cache path: C:\Users\Anh\.cache\kagglehub\datasets\kmader\skin-cancer-mnist-ham10000\versions\2
Dataset already exists at: ../Datasets/skin-cancer-mnist-ham10000


Unnamed: 0,lesion_id,image_id,dx,dx_type,age,sex,localization
0,HAM_0000118,ISIC_0027419,bkl,histo,80.0,male,scalp
1,HAM_0000118,ISIC_0025030,bkl,histo,80.0,male,scalp
2,HAM_0002730,ISIC_0026769,bkl,histo,80.0,male,scalp
3,HAM_0002730,ISIC_0025661,bkl,histo,80.0,male,scalp
4,HAM_0001466,ISIC_0031633,bkl,histo,75.0,male,ear


In [3]:
image_dirs = ['../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1', '../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_2']

In [4]:
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['dx'])
label_map = dict(zip(label_encoder.transform(label_encoder.classes_), label_encoder.classes_))

In [5]:
image_size = (32, 32)
pixels = []
labels = []

for idx, row in df.iterrows():
  img_path = None
  for d in image_dirs:
    candidate_path = os.path.join(d, row['image_id'] + '.jpg')
    if os.path.exists(candidate_path):
        print(f"{candidate_path} is existed")
        img_path = candidate_path
        break

  if img_path:
      img = cv2.imread(img_path)
      if img is not None:
          img_resized = cv2.resize(img, image_size)
          img_resized = cv2.cvtColor(img_resized, cv2.COLOR_BGR2RGB)
          img_flat = img_resized.flatten()
          pixels.append(img_flat)
          labels.append(row['label'])
  else:
    print("Won't map data")

X = np.array(pixels)
y = np.array(labels)

X.shape
y.shape

../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0027419.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0025030.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0026769.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0025661.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_2\ISIC_0031633.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0027850.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0029176.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0029068.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0025837.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0025209.jpg is existed
../Datasets/skin-cancer-mnist-ham10000/HAM10000_images_part_1\ISIC_0025276.jpg is existed
../Dataset

(10015,)

Reducing the number of features 

In [6]:
pca = PCA(n_components=100)
X_pca = pca.fit_transform(X)

In [7]:
save_encoder(pca, f"../Encoders/pca.joblib")

Model is existed at ../Encoders/pca.joblib


In [8]:
X_pca.shape

(10015, 100)

In [9]:
final = pd.DataFrame(X_pca)

In [10]:
final['image_id'] = df['image_id']

In [11]:
final.shape

(10015, 101)

Save the processed DataFrame

In [12]:
save_to_csv(final, "processed_image.csv")

File 'csv\processed_image.csv' is existed
