In [12]:
import pandas as pd
import numpy as np
import cv2
from skimage.feature import *

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import cross_validate

### Morphological features
Morphological features are geometric and structural characteristics extracted from the shapes and contours of objects in images. These include measurements such as area, perimeter, etc..
 
They are used to describe the morphology of structures visible in images.

In [13]:
def extract_morphological_features(image, max_contours=2):
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Threshold to get binary image
    _, binary = cv2.threshold(gray, 127, 255, cv2.THRESH_BINARY)
    # Find contours
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    
    features = []
    for contour in contours[:max_contours]:
        # Extract area
        area = cv2.contourArea(contour)
        # Extract perimeter
        perimeter = cv2.arcLength(contour, True)
        
        features.extend([area, perimeter])
    
    # Fill with zeros if there are less contours then max
    while len(features) < max_contours * 2:
        features.append(0)
    
    return np.array(features)

### Texture features
Texture features are characteristics that describe the variation of intensity or colour in an image.
 
They are used to identify and classify repetitive patterns and textures in images.

In [14]:
def extract_texture_features(image):
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Calculate grey level co-occurrence matrix
    glcm = graycomatrix(gray, distances=[1], angles=[0], symmetric=True, normed=True)
    # Get properties from matrix
    contrast = graycoprops(glcm, 'contrast')[0, 0]
    dissimilarity = graycoprops(glcm, 'dissimilarity')[0, 0]
    homogeneity = graycoprops(glcm, 'homogeneity')[0, 0]
    energy = graycoprops(glcm, 'energy')[0, 0]
    correlation = graycoprops(glcm, 'correlation')[0, 0]
    
    texture_features = [contrast, dissimilarity, homogeneity, energy, correlation]
    
    return np.array(texture_features)

### Colour features
Colour features are characteristics that describe the distribution and intensity of colours in an image.

In [15]:
def extract_color_features(image, hist_size=512):
    # Calculate color histogram
    hist = cv2.calcHist([image], [0, 1, 2], None, [8, 8, 8], [0, 256, 0, 256, 0, 256])
    hist = cv2.normalize(hist, hist).flatten()
    return hist


In [16]:
def extract_features(image_path):
    image = cv2.imread(image_path)
    
    morphological_features = extract_morphological_features(image)
    texture_features = extract_texture_features(image)
    color_features = extract_color_features(image)
    
    # Combine all features into a single feature vector
    features = np.hstack([morphological_features, texture_features, color_features])
    
    return features

In [17]:
# Load dataset
df = pd.read_csv("isolated_cells/single_cell_dataset.csv")
df

Unnamed: 0,filename,class
0,img-0137_png_jpg.rf.0ea8b8dcfc295e58f709f8eadc...,neutrophil
1,img-0073_png_jpg.rf.1e42284b7c00c8e1e02b03c9f1...,neutrophil
2,img-0197_png_jpg.rf.3104549baeb172a227efa2ac87...,neutrophil
3,img-0059_png_jpg.rf.90762669b9486e21e6ce570efb...,neutrophil
4,img-0081_png_jpg.rf.2151947735c50695395cfc7847...,neutrophil
...,...,...
7102,img-0082_png_jpg.rf.9719f27bbe102f331e38eb039e...,artefatto
7103,img-0224_png_jpg.rf.b4aae7b06a2394a5bedde89880...,artefatto
7104,img-0134_png_jpg.rf.4099dc986ce0b3a39a40de4780...,artefatto
7105,img-0048_png_jpg.rf.80ebfae57f4023c2d8ef9ff0ae...,artefatto


In [18]:
features = []
labels = []

# Extract features array for each image
for index, row in df.iterrows():
        image_path = f"isolated_cells/{row['class']}/{row['filename']}"
        # Save features array
        features_list = extract_features(image_path)
        features.append(features_list)
        #Save labels
        labels.append(row['class'])
        

In [19]:
X = np.array(features)
y = np.array(labels)

# Split dataset in train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
model = RandomForestClassifier(n_estimators=200, criterion='gini')
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [30]:
# Calculate metrics
print("------ RandomForestClassifier ------")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average="weighted"))
print("Recall:", recall_score(y_test, y_pred, average="weighted"))
print("---------------------------------------------------------")

------ RandomForestClassifier ------
Accuracy: 0.8002812939521801
Precision: 0.7823542103033043
Recall: 0.8002812939521801
---------------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
cross_validate(model, X, y, cv=10, scoring=['accuracy'])

{'fit_time': array([3.60548854, 3.51310897, 3.29190421, 3.30488896, 3.25951099,
        3.38028574, 3.4466114 , 3.45351553, 3.6470294 , 3.84648728]),
 'score_time': array([0.0485177 , 0.0457058 , 0.04486823, 0.04226017, 0.04835176,
        0.04479671, 0.04528618, 0.04458475, 0.05032134, 0.05085611]),
 'test_accuracy': array([0.80590717, 0.80168776, 0.79324895, 0.80872011, 0.78762307,
        0.80028129, 0.79184248, 0.78873239, 0.8084507 , 0.80985915])}