## Dermnet - Kamień milowy 3 (Modelowanie)

### 0. Pakiety

In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import random
from tensorflow.keras.preprocessing.image import img_to_array
import cv2
from sklearn.decomposition import PCA

## 1. Przygotowanie danych
### 1.1. Import

In [13]:
# NOTE(review): unpickling can execute arbitrary code, so these two files must
# come from a trusted source.
df1 = pd.read_pickle('train_data_1.pkl')
df2 = pd.read_pickle('train_data_2.pkl')
# ignore_index=True gives the combined frame a single unique 0..n-1 index,
# regardless of how the two halves were indexed when they were pickled.
X = pd.concat([df1, df2], axis=0, ignore_index=True)
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15557 entries, 0 to 15556
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   image   15557 non-null  object
dtypes: object(1)
memory usage: 121.7+ KB


Zmiana rozdzielczości z 224x224 px na 75x75 px.

In [None]:
# Resize every image to 75x75 with cv2.resize (no interpolation argument is
# passed, so OpenCV's default is used). The 'image' column of X is overwritten.
X['image'] = X['image'].apply(lambda x: cv2.resize(x, (75, 75)))

### 1.2. Rotacja

In [None]:
def rotate_all_images(data_frame, colname = 'image'):
    """Return a new one-column frame with every image turned by a quarter turn.

    Each entry of ``data_frame[colname]`` is passed through ``np.rot90`` with
    its default arguments (one 90-degree counter-clockwise turn in the plane
    of the first two axes). The result is a fresh DataFrame with a default
    index and a single column named ``colname``; the input frame is not
    modified.
    """
    quarter_turned = [np.rot90(picture) for picture in data_frame[colname]]
    return pd.DataFrame({colname: quarter_turned})


In [None]:
# Each call turns the previous result by a further 90 degrees, so the three
# frames hold the 90-, 180- and 270-degree rotations of the images in X.
X_rotated_90 = rotate_all_images(X)
X_rotated_180 = rotate_all_images(X_rotated_90)
X_rotated_270 = rotate_all_images(X_rotated_180)

### 1.3. Modyfikacje obrazów
#### Funkcje

In [None]:
# Seed Python's RNG so the sample below is reproducible between runs.
random.seed(2137)
# Eight distinct positional indexes into X['image'], used for preview plots.
chosen_indexes = random.sample(range(len(X['image'])), 8)

def visualize_random_images(df, indexes = [0, 1, 2, 3, 4, 5, 6, 7]):
    """Show up to eight images from ``df['image']`` in a 2x4 grid.

    Parameters
    ----------
    df : DataFrame
        Must contain an 'image' column whose entries ``imshow`` can draw.
    indexes : sequence of int
        Positional (``iloc``) indexes of the images to display. Only the
        first eight are used. If fewer than eight are given, the remaining
        panels are left empty instead of raising ``IndexError``.
    """
    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    for slot, ax in enumerate(axes.flat):
        if slot < len(indexes):
            ax.imshow(df['image'].iloc[indexes[slot]])
        # Hide ticks and frame on every panel, including unused ones.
        ax.axis('off')


def convert_to_grayscale(data_frame, colname='image'):
    '''
    Convert RGB images to single-channel grayscale images.

    Every image in ``data_frame[colname]`` is indexed as
    ``image[:, :, channel]`` with channels 0, 1, 2 taken as R, G, B.
    Each output pixel is ``0.33 * R + 0.33 * G + 0.33 * B``.

    Returns a new DataFrame with a default index and a single column named
    ``colname`` holding the 2-D grayscale arrays.
    '''
    result = []
    # Read from the requested column (previously 'image' was hard-coded here,
    # so any other colname raised KeyError or read the wrong column).
    for image in data_frame[colname]:
        r, g, b = image[:,:,0], image[:,:,1], image[:,:,2]
        result.append(0.33 * r + 0.33 * g + 0.33 * b)
    return pd.DataFrame({colname : result})

def convert_to_grayscale_one_color(data_frame, colname='image'):
    '''
    Collapse each RGB image to one channel by summing its three channels.

    Every image in ``data_frame[colname]`` is indexed as
    ``image[:, :, channel]`` for channels 0, 1, 2. The output pixel is the
    plain sum ``R + G + B`` with no scaling.

    Returns a new DataFrame with a default index and a single column named
    ``colname`` holding the 2-D summed arrays.
    '''
    result = []
    # Read from the requested column (previously 'image' was hard-coded here).
    for image in data_frame[colname]:
        r, g, b = image[:,:,0], image[:,:,1], image[:,:,2]
        # NOTE(review): the sum keeps the input dtype, so for uint8 images it
        # wraps around modulo 256 - confirm this is intended.
        result.append(r + g + b)
    return pd.DataFrame({colname : result})


def visualize_random_images_grey(df, indexes = [0, 1, 2, 3, 4, 5, 6, 7]):
    """Show up to eight images from ``df['image']`` in a 2x4 grid, in grey.

    Parameters
    ----------
    df : DataFrame
        Must contain an 'image' column whose entries ``imshow`` can draw.
    indexes : sequence of int
        Positional (``iloc``) indexes of the images to display. Only the
        first eight are used. If fewer than eight are given, the remaining
        panels are left empty instead of raising ``IndexError``.
    """
    fig, axes = plt.subplots(2, 4, figsize=(20, 10))
    for slot, ax in enumerate(axes.flat):
        if slot < len(indexes):
            ax.imshow(df['image'].iloc[indexes[slot]], cmap='gray')
        # Hide ticks and frame on every panel, including unused ones.
        ax.axis('off')


def extract_rgb_channel(data_frame, colname='image', color='red'):
    '''
    Keep one colour channel of every image and zero out the other two.

    Parameters:
        data_frame: frame whose ``colname`` column holds arrays indexed as
            ``image[:, :, channel]``.
        colname: name of the image column to read and of the output column.
        color: 'red', 'green' or 'blue', mapped to channel 0, 1 or 2.

    Returns a new DataFrame with a default index and a single column named
    ``colname``. Each output image has the same shape and dtype as its input.

    Raises:
        ValueError: if ``color`` is not one of the three supported names.
    '''
    channel_by_color = {'red': 0, 'green': 1, 'blue': 2}
    if color not in channel_by_color:
        # Fail early with a clear message; previously an unknown colour left
        # `channel` unassigned and failed later inside the loop.
        raise ValueError(
            f"color must be one of {sorted(channel_by_color)}, got {color!r}"
        )
    channel = channel_by_color[color]

    result = []
    # Read from the requested column (previously 'image' was hard-coded here).
    for image in data_frame[colname]:
        modified_image = np.zeros_like(image)
        modified_image[:,:,channel] = image[:,:,channel]
        result.append(modified_image)
    return pd.DataFrame({colname : result})


def global_threshold(df, tresh, colname = 'image'):
    '''
    Binarise every image in ``df[colname]`` against a fixed threshold.

    Pixels strictly greater than ``tresh`` become 255, all others become 0.
    Only the first image is checked for being 2-D. Returns a new
    single-column DataFrame named ``colname``.
    '''
    first_image = df[colname].iloc[0]
    assert first_image.ndim == 2, "images must be in greyscale"
    binarised = [(picture > tresh) * 255 for picture in df[colname]]
    return pd.DataFrame({colname : binarised})

def reverse_threshold(df, tresh, colname = 'image'):
    '''
    Binarise every image in ``df[colname]`` with an inverted threshold.

    Pixels strictly less than ``tresh`` become 255, all others become 0.
    Only the first image is checked for being 2-D. Returns a new
    single-column DataFrame named ``colname``.
    '''
    first_image = df[colname].iloc[0]
    assert first_image.ndim == 2, "images must be in greyscale"
    binarised = [(picture < tresh) * 255 for picture in df[colname]]
    return pd.DataFrame({colname : binarised})

def detect_images(df, low_th = 50, high_th = 150, blur_ksize = 5, colname = 'image'):
    '''
    Produce a Canny edge map for every image in ``df[colname]``.

    Each image is multiplied by 255 and cast to uint8, smoothed with a
    ``blur_ksize`` x ``blur_ksize`` Gaussian blur (sigma argument 0), then
    passed to ``cv2.Canny`` with thresholds ``low_th`` and ``high_th``.
    Only the first image is checked for being 2-D. Returns a new
    single-column DataFrame named ``colname``.

    NOTE(review): the multiplication by 255 suggests inputs are expected in
    the [0, 1] range - confirm against the callers.
    '''
    first_image = df[colname].iloc[0]
    assert first_image.ndim == 2, "images must be in greyscale"
    kernel_shape = (blur_ksize, blur_ksize)

    def _edges_of(picture):
        # Scale and cast first: the blur and Canny run on the uint8 copy.
        as_bytes = (picture * 255).astype(np.uint8)
        smoothed = cv2.GaussianBlur(as_bytes, kernel_shape, 0)
        return cv2.Canny(smoothed, low_th, high_th)

    edge_maps = [_edges_of(picture) for picture in df[colname]]
    return pd.DataFrame({colname : edge_maps})

def convert_to_negative_image(df, colname = 'image'):
    '''
    Invert every image in ``df[colname]`` by computing ``255 - image``.

    Returns a new single-column DataFrame named ``colname`` with a default
    index; the input frame is not modified.
    '''
    inverted = [255 - picture for picture in df[colname]]
    return pd.DataFrame({colname : inverted})

def erosion(df, kernel_size=(5,5), iterations=1, colname='image'):
    """Apply ``cv2.erode`` to every image in ``df[colname]``.

    The structuring element is an all-ones uint8 array of shape
    ``kernel_size``, and ``iterations`` is forwarded to ``cv2.erode``.
    Returns a new single-column DataFrame named ``colname``.
    """
    structuring_element = np.ones(kernel_size, np.uint8)
    shrunk = [
        cv2.erode(picture, structuring_element, iterations=iterations)
        for picture in df[colname]
    ]
    return pd.DataFrame({colname: shrunk})

def dilation(df, kernel_size=(5,5), iterations=1, colname='image'):
    """Apply ``cv2.dilate`` to every image in ``df[colname]``.

    The structuring element is an all-ones uint8 array of shape
    ``kernel_size``, and ``iterations`` is forwarded to ``cv2.dilate``.
    Returns a new single-column DataFrame named ``colname``.
    """
    structuring_element = np.ones(kernel_size, np.uint8)
    grown = [
        cv2.dilate(picture, structuring_element, iterations=iterations)
        for picture in df[colname]
    ]
    return pd.DataFrame({colname: grown})

def reverse_hough_transform(df, threshold=100, colname='image'):
    """Draw the lines found by a Hough transform onto each grayscale image.

    For every image in ``df[colname]``: cast to uint8, run Canny edge
    detection (thresholds 50 and 150, aperture 3), detect lines with
    ``cv2.HoughLines`` (rho step 1, theta step pi/180, accumulator
    ``threshold``) and draw each detected line onto the uint8 copy.
    Images with no detected lines are returned as the plain uint8 cast.

    Only the first image is checked for being 2-D. Returns a new
    single-column DataFrame named ``colname``.
    """
    assert df[colname].iloc[0].ndim == 2, "images must be in greyscale"
    result = []
    for image in df[colname]:
        # Cast to uint8 (CV_8U). astype returns a new array by default, so the
        # lines below are drawn on a copy rather than on the frame's own data.
        image = image.astype(np.uint8)  # Convert to CV_8U data type
        edges = cv2.Canny(image, 50, 150, apertureSize=3)
        # HoughLines returns None when nothing is found, hence the guard below.
        lines = cv2.HoughLines(edges, 1, np.pi/180, threshold)  # Define 'lines' variable
        if lines is not None:
            # Each detected line is a (rho, theta) pair in polar form.
            for rho, theta in lines[:, 0]:
                a = np.cos(theta)
                b = np.sin(theta)
                # (x0, y0) is the point on the line at distance rho from the origin.
                x0 = a * rho
                y0 = b * rho
                # Step 1000 px each way along the line direction (-b, a) to get
                # two end points; int() truncates towards zero.
                x1 = int(x0 + 1000 * (-b))
                y1 = int(y0 + 1000 * (a))
                x2 = int(x0 - 1000 * (-b))
                y2 = int(y0 - 1000 * (a))
                # NOTE(review): a 3-component colour is passed although the image
                # is expected to be single-channel - confirm which value OpenCV
                # actually draws with here.
                cv2.line(image, (x1, y1), (x2, y2), (0, 0, 255), 2)
        result.append(image)
    result = pd.DataFrame({colname: result})
    return result

def flatten_images(df, colname='image', batch_size=100):
    """Turn a column of images into a wide numeric frame, one row per image.

    Images are processed in consecutive slices of ``batch_size`` rows. Each
    image is converted with ``img_to_array``, flattened to 1-D and divided
    by 255.0. The per-slice arrays are concatenated row-wise.

    Returns a DataFrame whose columns are named ``f'{colname}_{i}'`` for each
    flattened position ``i``.
    """
    total = len(df)
    chunks = []
    for lo in range(0, total, batch_size):
        hi = min(lo + batch_size, total)
        chunk = np.array([
            img_to_array(picture).flatten() / 255.0
            for picture in df[colname].iloc[lo:hi]
        ])
        chunks.append(chunk)
    stacked = np.concatenate(chunks, axis=0)
    column_names = [f'{colname}_{i}' for i in range(stacked.shape[1])]
    return pd.DataFrame(stacked, columns=column_names)


def modify_pictures(df, colname='image'):
    """Build one wide feature frame from eight transformations of the images.

    The transformations, in output order, are: grayscale, red channel, green
    channel, blue channel, global threshold at 100 (on the grayscale images),
    erosion, dilation and Hough-line drawing (on the grayscale images). Each
    result is flattened with ``flatten_images`` and a progress message is
    printed after each step. The flattened frames are concatenated
    column-wise.

    Every flattened frame uses the same ``f'{colname}_{i}'`` column names, so
    the returned frame contains repeated column labels.
    """
    # Computed once up front: the threshold and Hough steps both reuse it.
    grayscale = convert_to_grayscale(df, colname)

    # (progress message, zero-argument builder of the transformed frame),
    # listed in the order the feature groups appear in the output.
    stages = [
        ('grayscale done', lambda: grayscale),
        ('red done', lambda: extract_rgb_channel(df, colname, 'red')),
        ('green done', lambda: extract_rgb_channel(df, colname, 'green')),
        ('blue done', lambda: extract_rgb_channel(df, colname, 'blue')),
        ('threshold done', lambda: global_threshold(grayscale, 100, colname)),
        ('eroded done', lambda: erosion(df, colname=colname)),
        ('dilated done', lambda: dilation(df, colname=colname)),
        ('hough done', lambda: reverse_hough_transform(grayscale, colname=colname)),
    ]

    flattened_parts = []
    for message, build in stages:
        flattened_parts.append(flatten_images(build(), colname))
        print(message)

    return pd.concat(flattened_parts, axis=1)


### 1.3.1. Ramka danych `X`
Obrazy z wyodrębnionymi kanałami R, G i B niosą łącznie te same informacje co obraz kolorowy i jego negatyw. Nie będziemy więc uwzględniać tych dwóch modyfikacji (obrazu kolorowego i negatywu). Dzięki temu zmniejszamy liczbę „kolumn” w docelowej ramce danych o 40%, zachowując te same informacje.

In [None]:
# Build the feature frame for the unrotated images. The dask concatenation
# cell in section 1.5 reads X_processed, so it has to be defined on a fresh
# kernel run.
X_processed = modify_pictures(X)

grayscale done
red done
green done
blue done
threshold done
eroded done
dilated done
hough done


### 1.3.2. Ramka obrazów obróconych `X_rotated_alpha`


In [None]:
# Run the full feature pipeline on each rotated copy of the data set. The
# prints mark progress between the 90, 180 and 270 degree runs.
X_rot_90_processed = modify_pictures(X_rotated_90)
print('90 done')
X_rot_180_processed = modify_pictures(X_rotated_180)
print('180 done')
X_rot_270_processed = modify_pictures(X_rotated_270)

grayscale done
red done
green done
blue done
threshold done
eroded done
dilated done
hough done
90 done
grayscale done
red done
green done
blue done
threshold done
eroded done
dilated done
hough done
180 done
grayscale done
red done
green done
blue done
threshold done
eroded done
dilated done
hough done


### 1.4. PCA na `X`

In [None]:
# pca = PCA(n_components = 0.9)
# X_pca = pca.fit_transform(X_processed)

In [None]:
# X_pca.shape

(15557, 239)

In [None]:
# X_pca_df = pd.DataFrame(X_pca)
# X_pca_df.to_csv('X_pca.csv', index=False)

### 1.5. PCA na `X_rotated_alpha`

In [None]:
import dask.dataframe as dd

# Requires X_processed and the three X_rot_*_processed frames to be defined.
# Convert pandas DataFrames to Dask DataFrames (10 partitions each)
dX_processed = dd.from_pandas(X_processed, npartitions=10)
dX_rot_90_processed = dd.from_pandas(X_rot_90_processed, npartitions=10)
dX_rot_180_processed = dd.from_pandas(X_rot_180_processed, npartitions=10)
dX_rot_270_processed = dd.from_pandas(X_rot_270_processed, npartitions=10)

# Concatenate Dask DataFrames row-wise: original images first, then the
# 90, 180 and 270 degree rotations
dX_rot_processed = dd.concat([dX_processed, dX_rot_90_processed, dX_rot_180_processed, dX_rot_270_processed], axis=0)

# Compute the final result
# NOTE(review): compute() materialises the whole concatenation as an in-memory
# pandas object; the recorded run of this cell ended with KeyboardInterrupt.
X_rot_processed = dX_rot_processed.compute()



KeyboardInterrupt: 