# Feature Extraction
## Importing Libraries

In [15]:
# import cv2 as cv
import math
import multiprocessing as mlp
import os

import numpy as np
import pandas as pd
import SimpleITK as sitk
from radiomics import firstorder, glcm, glrlm
from skimage.feature.texture import greycomatrix
from skimage.feature.texture import greycoprops
from skimage.measure import shannon_entropy
# import pyfeats\\

import feature_extraction as fe

from imblearn.over_sampling import SMOTE

In [16]:
# from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

## Define Feature Extraction functions

### Read dataset images

In [17]:
def read_images(folder="dataset/train",
                classes=("normal", "fatty", "cirrhosis")):
    """Load every image of the given classes together with its ROI positions.

    For each file in ``{folder}/{cls}`` the matching mask file
    ``dataset/masks/<file name without extension>.txt`` is parsed. Each
    non-blank line of that file holds two comma-separated integers ``x,y``;
    they are stored swapped, as ``(y, x)``.

    Args:
        folder: Directory containing one sub-directory per class.
        classes: Names of the class sub-directories to read.

    Returns:
        A list of ``(name, image, cls, mask)`` tuples, where ``image`` is the
        SimpleITK image read as ``sitkUInt8`` and ``mask`` is the list of
        ``(y, x)`` positions.

    Raises:
        FileNotFoundError: If a class directory or a mask file is missing.
        ValueError: If a mask line is not two comma-separated integers.
    """
    # List every class directory up front so a missing directory is reported
    # before any image is read.
    image_names = {cls: os.listdir(f'{folder}/{cls}') for cls in classes}

    images = []
    for cls in classes:
        for name in image_names[cls]:
            # splitext copes with extensions of any length (".png", ".jpeg").
            stem = os.path.splitext(name)[0]
            mask = []
            with open(f'dataset/masks/{stem}.txt', 'r') as file:
                for line in file:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines instead of failing on unpack.
                        continue
                    x, y = line.split(',')
                    mask.append((int(y), int(x)))
            img = sitk.ReadImage(f'{folder}/{cls}/{name}', sitk.sitkUInt8)
            images.append((name, img, cls, mask))
    return images

### Extract ROIs from image

In [18]:
def extract_roi(img, start , size = (32,32)):
    """Cut one rectangular ROI out of an image and build its mask.

    Args:
        img: SimpleITK image; it is converted to an array first.
        start: Index pair of the ROI's first element along the first two
            array axes.
        size: Extent of the ROI along those two axes.

    Returns:
        ``(roi, mask)``: the sliced array region, and an array of zeros with
        the full array's shape in which the ROI region is set to 1.
    """
    pixels = sitk.GetArrayFromImage(img)
    first_axis = slice(start[0], start[0] + size[0])
    second_axis = slice(start[1], start[1] + size[1])

    region_mask = np.zeros(pixels.shape)
    region_mask[first_axis, second_axis] = 1
    return pixels[first_axis, second_axis], region_mask

### Calculate Liver Diagonal Length

In [19]:
def get_length(img, mask, roi_size=(32, 32)):
    """Return the longer of the two diagonals spanned by the ROI positions.

    For each of the four image corners the ROI position closest to that
    corner is selected (after shifting the position by the ROI size as
    described below). The result is the larger of the two distances
    top-right <-> bottom-left and top-left <-> bottom-right, measured
    between the selected (unshifted) positions.

    Args:
        img: Array-like with a two-element ``shape``.
        mask: Non-empty sequence of two-element ROI positions.
        roi_size: Offsets added to a position's first and second coordinate
            when measuring its distance to a corner. Defaults to the 32x32
            ROI used elsewhere in this file.

    Returns:
        The larger diagonal length as a float.

    Raises:
        ValueError: If ``mask`` is empty.
    """
    if len(mask) == 0:
        raise ValueError("mask must contain at least one ROI position")

    first_offset, second_offset = roi_size

    def nearest(corner, offset):
        # min() with a key returns the first closest position, which matches
        # looking up the index of the smallest distance in a list.
        return min(
            mask,
            key=lambda pos: math.dist(
                corner, (pos[0] + offset[0], pos[1] + offset[1])
            ),
        )

    # NOTE(review): the offsets used for the top-right and bottom-left
    # searches are kept exactly as in the original code (first coordinate
    # shifted for top-right, second for bottom-left) - confirm this pairing
    # is the intended ROI corner.
    top_right = nearest((0, img.shape[1]), (first_offset, 0))
    bottom_left = nearest((img.shape[0], 0), (0, second_offset))

    top_left = nearest((0, 0), (0, 0))
    bottom_right = nearest(img.shape, (first_offset, second_offset))

    return max(math.dist(top_right, bottom_left),
               math.dist(top_left, bottom_right))

### Extract Features from ROIs

In [20]:
def feature_extraction(img, roi_pos):
    """Compute texture and radiomics features for every ROI of one image.

    For each ROI the feature dict contains, in this order:
    GLCM contrast/homogeneity/energy/correlation for distances 1-3 and
    angles 0/45/90/135 degrees (keys such as ``contrast_d1_45``), the Shannon
    entropy of the ROI (``entropy``), the image-level diagonal length
    (``length``), and all PyRadiomics first-order, GLCM and GLRLM features
    under the names PyRadiomics reports.

    Args:
        img: SimpleITK image.
        roi_pos: Sequence of ROI start positions for this image.

    Returns:
        A list with one feature dict per ROI, in the order of ``roi_pos``.
    """
    roi_mask_arr = [extract_roi(img, pos) for pos in roi_pos]

    # 0, 45, 90 and 135 degrees
    angles = [0, np.pi / 4, np.pi / 2, 3 * np.pi / 4]
    distances = [1, 2, 3]
    # Flattening the per-(distance, angle) property grid yields distance-major
    # order, so the suffixes are generated in that same order.
    da_names = [f'd{d}_{a}' for d in distances for a in (0, 45, 90, 135)]
    glcm_props = ('contrast', 'homogeneity', 'energy', 'correlation')

    # The length depends only on the image and its ROI layout, so it is
    # computed once and copied into every ROI's features.
    length = get_length(sitk.GetArrayFromImage(img), roi_pos)

    # Applied in this order so the feature keys keep their original order.
    radiomics_classes = (
        firstorder.RadiomicsFirstOrder,
        glcm.RadiomicsGLCM,
        glrlm.RadiomicsGLRLM,
    )

    feat_arr = []
    for roi, mask in roi_mask_arr:
        features = {}

        glcm_mtx = greycomatrix(roi, distances=distances, angles=angles,
                                levels=256)
        prop_values = {
            prop: greycoprops(glcm_mtx, prop).flatten() for prop in glcm_props
        }
        for j, da_name in enumerate(da_names):
            for prop in glcm_props:
                features[f'{prop}_{da_name}'] = prop_values[prop][j]

        features['entropy'] = shannon_entropy(roi)
        features['length'] = length

        # pyradiomics expects the mask as an image, not an array
        mask_img = sitk.GetImageFromArray(mask)
        for radiomics_cls in radiomics_classes:
            extractor = radiomics_cls(img, mask_img)
            extractor.enableAllFeatures()
            for col, value in extractor.execute().items():
                features[col] = value.item()

        feat_arr.append(features)

    return feat_arr

### Construct dataframe from ROI features

In [21]:
def build_dataframe(images):
    """Build one DataFrame row per ROI for the given images.

    Args:
        images: Iterable of ``(name, img, cls, mask)`` tuples as produced by
            ``read_images``. The shorter ``(img, cls, mask)`` form is still
            accepted; in that case no ``name`` column is written.

    Returns:
        A DataFrame with one row per ROI holding the extracted features, a
        ``target`` column with the class and, when available, a ``name``
        column with the image file name. Empty if ``images`` is empty.
    """
    rows = []
    for entry in images:
        # The last three items are always (img, cls, mask); anything before
        # them is the optional image name.
        *prefix, img, cls, mask = entry
        for row in feature_extraction(img, roi_pos=mask):
            if prefix:
                row['name'] = prefix[0]
            row['target'] = cls
            rows.append(row)
    # Build the frame once: DataFrame.append copied the whole frame on every
    # call and no longer exists in pandas 2.x.
    return pd.DataFrame(rows)

### Construct dataframe using multiprocessing
Using multiprocessing reduced the runtime by 82%.

In [22]:
def build_with_mlp(images, n=9):
    """Extract features in parallel by splitting the images over ``n`` workers.

    Args:
        images: Sequence of image entries as accepted by
            ``fe.build_dataframe``.
        n: Number of worker processes, and number of chunks ``images`` is
            split into.

    Returns:
        A list with the result of ``fe.build_dataframe`` for each chunk, in
        chunk order.
    """
    # The context manager shuts the workers down once map() has returned, so
    # repeated calls do not accumulate idle processes.
    with mlp.Pool(n) as pool:
        results = pool.map(fe.build_dataframe, np.array_split(images, n))
    return results

## Feature Analysis and Selection

### Extract Features and build dataframe

In [23]:
%%time

# The commented-out lines below rebuild the training features from the raw
# images with build_with_mlp and save them as CSV. They are disabled; the
# features are loaded from a previously saved spreadsheet instead.
# NOTE(review): the disabled code writes "dataset/train.csv" while the active
# code reads 'dataset/segment/train.xlsx' - confirm which file is current.
# images = read_images('dataset/train')
# mlp_data = build_with_mlp(images)
# data = pd.DataFrame()
# for frame in mlp_data:
#     data = data.append(frame)

# data.set_index('name', drop=True, inplace=True)

# data.to_csv("dataset/train.csv")

# One row per ROI, indexed by the 'name' column.
data = pd.read_excel('dataset/segment/train.xlsx', index_col='name')

data.describe()

Wall time: 12.8 s


Unnamed: 0,10Percentile,90Percentile,Autocorrelation,ClusterProminence,ClusterShade,ClusterTendency,Contrast,Correlation,DifferenceAverage,DifferenceEntropy,...,homogeneity_d1_90,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90,length
count,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0,...,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0,10239.0
mean,43.214386,70.884881,5.190461,27.420777,2.159861,1.169789,0.316116,0.520472,0.246282,0.7505986,...,0.177287,0.197639,0.162198,0.192916,0.140118,0.162515,0.137177,0.146077,0.135427,294.34297
std,25.953822,33.997997,4.488373,176.498185,12.593087,1.654653,0.331033,0.171877,0.12886,0.2695262,...,0.109722,0.11397,0.104919,0.118423,0.101432,0.106142,0.100996,0.104579,0.10112,72.10889
min,0.0,0.0,1.0,0.0,-20.744314,0.0,0.0,-0.003308,0.0,-3.203427e-16,...,0.041775,0.050039,0.042185,0.043133,0.034135,0.039906,0.035083,0.034,0.027464,135.764502
25%,24.0,46.0,2.729294,0.870617,-0.024615,0.534995,0.170213,0.40743,0.163213,0.6255283,...,0.117785,0.130055,0.10911,0.122657,0.093496,0.106743,0.090946,0.094952,0.089072,243.704739
50%,41.0,69.0,4.182663,2.084719,0.282485,0.807595,0.254504,0.513868,0.238976,0.77875,...,0.154595,0.173784,0.141294,0.165697,0.117997,0.140509,0.115081,0.122553,0.112606,286.216701
75%,60.0,91.0,6.122207,9.357921,0.876473,1.345874,0.373407,0.625236,0.32369,0.9169306,...,0.197679,0.226263,0.178031,0.223166,0.151147,0.180674,0.14746,0.161069,0.146184,340.164666
max,174.0,253.0,66.896373,5954.836767,390.534435,39.317592,10.51747,1.0,1.587563,1.962372,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,480.0


In [24]:
%%time

# The commented-out lines below rebuild the test features from the raw images
# with build_with_mlp and save them as CSV. They are disabled; the features
# are loaded from a previously saved spreadsheet instead.
# test_images = read_images("dataset/test")
# mlp_data = build_with_mlp(test_images)
# test_data = pd.DataFrame()
# for frame in mlp_data:
#     test_data = test_data.append(frame)
    
# test_data.set_index('name', drop=True, inplace=True)

# test_data.to_csv("dataset/test.csv")

# One row per ROI, indexed by the 'name' column.
test_data = pd.read_excel('dataset/segment/test.xlsx', index_col='name')

test_data.describe()

Wall time: 3.45 s


Unnamed: 0,10Percentile,90Percentile,Autocorrelation,ClusterProminence,ClusterShade,ClusterTendency,Contrast,Correlation,DifferenceAverage,DifferenceEntropy,...,homogeneity_d1_90,homogeneity_d2_0,homogeneity_d2_135,homogeneity_d2_45,homogeneity_d2_90,homogeneity_d3_0,homogeneity_d3_135,homogeneity_d3_45,homogeneity_d3_90,length
count,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0,...,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0,2543.0
mean,42.901219,71.010814,5.015083,26.3394,2.002938,1.167737,0.314198,0.516472,0.25087,0.7536736,...,0.176278,0.197346,0.161127,0.192916,0.140383,0.162683,0.138232,0.146238,0.136276,293.332709
std,26.602086,35.537199,4.006853,201.157591,13.729393,1.703052,0.340934,0.170138,0.138701,0.2863724,...,0.114381,0.116635,0.109106,0.123798,0.106084,0.108893,0.105759,0.109164,0.105858,66.869642
min,0.0,0.0,1.0,0.0,-13.731914,0.0,0.0,-0.003336,0.0,-3.203427e-16,...,0.046256,0.04947,0.038131,0.047311,0.034038,0.039583,0.03459,0.035135,0.032659,135.764502
25%,24.0,45.0,2.542497,0.861262,-0.008815,0.52425,0.168639,0.408831,0.161579,0.6248237,...,0.116269,0.130361,0.108888,0.121239,0.091311,0.107111,0.090186,0.093158,0.08753,243.704739
50%,41.0,68.0,4.091035,1.934584,0.275089,0.815074,0.251024,0.504155,0.241261,0.7808226,...,0.15149,0.172473,0.137961,0.16149,0.116903,0.139648,0.114804,0.121216,0.113159,295.025423
75%,62.0,93.0,6.06037,8.418262,0.766716,1.332241,0.380118,0.6185,0.331848,0.9225272,...,0.193826,0.228049,0.175607,0.219497,0.15046,0.182794,0.149007,0.159531,0.147083,329.460165
max,135.0,253.0,40.840677,5220.689962,333.676485,31.20749,9.178053,1.0,1.135032,1.52604,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,435.247056


## Testing

In [25]:
def split(data, test_data, drop=None, cols = None):
    """Separate features from labels and standardise the selected columns.

    Args:
        data: Training frame containing a ``target`` column.
        test_data: Test frame containing a ``target`` column.
        drop: Optional class label; rows with this target are removed from
            both sets. ``None`` keeps all rows.
        cols: Feature columns to keep. ``None`` keeps every non-target column
            of the training frame.

    Returns:
        ``(X_train, y_train, X_test, y_test, std)`` where the ``X`` frames
        hold the standardised ``cols`` (original index preserved) and ``std``
        is the ``StandardScaler`` fitted on the training rows only.
    """
    X_train = data.copy()
    y_train = X_train.pop('target')
    X_test = test_data.copy()
    y_test = X_test.pop('target')

    if drop is not None:
        # Compute each row filter once and apply it to features and labels.
        keep_train = y_train != drop
        keep_test = y_test != drop

        X_train, y_train = X_train[keep_train], y_train[keep_train]
        X_test, y_test = X_test[keep_test], y_test[keep_test]

    if cols is None:
        cols = X_train.columns

    # Fit on the training rows only, then apply the same scaling to both sets.
    std = StandardScaler()
    std.fit(X_train[cols])
    X_train = pd.DataFrame(std.transform(X_train[cols]), columns = cols, index = X_train.index)
    X_test = pd.DataFrame(std.transform(X_test[cols]), columns = cols, index = X_test.index)
    return X_train, y_train, X_test, y_test, std

In [26]:
def train_test(model, X_train, y_train, X_test, y_test):
    """Oversample the training set with SMOTE, fit ``model`` and predict.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        X_train: Training features.
        y_train: Training labels.
        X_test: Test features.
        y_test: Test labels; only their index is used, to label the output.

    Returns:
        ``(model, y_pred)``: the value returned by ``model.fit`` and the
        predictions as a Series sharing ``y_test``'s index.
    """
    oversampler = SMOTE(sampling_strategy='minority', random_state=42)
    X_balanced, y_balanced = oversampler.fit_resample(X_train, y_train)

    fitted = model.fit(X_balanced, y_balanced)
    predicted = fitted.predict(X_test)
    return fitted, pd.Series(predicted, index=y_test.index)

In [27]:
def images_pred(y_pred):
    """Reduce per-ROI predictions to one majority-vote label per image.

    Args:
        y_pred: Series of predicted labels whose index is the image name;
            an image may occur once or many times.

    Returns:
        A dict mapping each distinct image name to its most frequent label.
        On a tie the label that appears first for that image wins.
    """
    prediction = {}
    for name in np.unique(y_pred.index):
        votes = {}
        # .loc[[name]] always yields a Series. Plain y_pred[name] returns a
        # bare label when the image has a single ROI, and iterating that
        # would count the characters of the label instead.
        for label in y_pred.loc[[name]]:
            votes[label] = votes.get(label, 0) + 1

        prediction[name] = max(votes, key=votes.get)
    return prediction

In [28]:
def images_acc(y_test, y_pred):
    """Return the fraction of images whose predicted label is correct.

    Args:
        y_test: Series of true labels indexed by image name; an image may
            occur several times, in which case its first label is used.
        y_pred: Mapping from image name to predicted label.

    Returns:
        Number of matching images divided by the number of predicted images.

    Raises:
        KeyError: If a predicted image name is missing from ``y_test``.
        ZeroDivisionError: If ``y_pred`` is empty.
    """
    pred_count = 0
    for key, predicted in y_pred.items():
        # .loc[[key]] always yields a Series and .iloc[0] is explicitly
        # positional, so a single-row image returns its whole label rather
        # than the label's first character.
        actual = y_test.loc[[key]].iloc[0]
        if actual == predicted:
            pred_count += 1
    return pred_count / len(y_pred)

In [29]:
# Candidate classifiers, evaluated on every pair of classes below.
models = {
    "RFC": RandomForestClassifier(
                    random_state=42,
                    # 'sqrt' is what 'auto' meant for classifiers; 'auto' is
                    # deprecated and rejected by newer scikit-learn releases.
                    max_features='sqrt',
                    n_estimators= 500,
                    max_depth=6,
                    criterion='entropy'),
    "MLP": MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=42),
    "SVC": svm.SVC(random_state=42)
}
classes = ['normal', 'fatty', 'cirrhosis']

# Leave one class out at a time so each run is a two-class problem.
for drop in classes:
    X_train, y_train, X_test, y_test, std = split(data, test_data, drop)
    print(*[cls for cls in classes if cls != drop])
    for name in models:
        model, y_pred = train_test(models[name], X_train, y_train, X_test, y_test)
        # Image-level score: majority vote over the ROIs of each image.
        prediction = images_pred(y_pred)
        print(name," Image Accuracy: ", images_acc(y_test, prediction))
        # ROI-level precision/recall/F1.
        report = classification_report(y_test, y_pred, output_dict = True)
        cr = pd.DataFrame(report).transpose()
        print(cr)
    print('\n\n')

fatty cirrhosis


  warn(


RFC  Image Accuracy:  0.8461538461538461
              precision    recall  f1-score      support
cirrhosis      0.414219  0.595556  0.488605   450.000000
fatty          0.868116  0.759670  0.810281  1577.000000
accuracy       0.723236  0.723236  0.723236     0.723236
macro avg      0.641168  0.677613  0.649443  2027.000000
weighted avg   0.767350  0.723236  0.738868  2027.000000
MLP  Image Accuracy:  0.8461538461538461
              precision    recall  f1-score      support
cirrhosis      0.453815  0.502222  0.476793   450.000000
fatty          0.853499  0.827521  0.840309  1577.000000
accuracy       0.755303  0.755303  0.755303     0.755303
macro avg      0.653657  0.664871  0.658551  2027.000000
weighted avg   0.764768  0.755303  0.759607  2027.000000
SVC  Image Accuracy:  0.8461538461538461
              precision    recall  f1-score     support
cirrhosis      0.430723  0.635556  0.513465   450.00000
fatty          0.879677  0.760304  0.815646  1577.00000
accuracy       0.732610  

  warn(


RFC  Image Accuracy:  0.6785714285714286
              precision    recall  f1-score     support
cirrhosis      0.677165  0.573333  0.620939  450.000000
normal         0.671795  0.761628  0.713896  516.000000
accuracy       0.673913  0.673913  0.673913    0.673913
macro avg      0.674480  0.667481  0.667418  966.000000
weighted avg   0.674297  0.673913  0.670593  966.000000
MLP  Image Accuracy:  0.5357142857142857
              precision    recall  f1-score     support
cirrhosis      0.568365  0.471111  0.515188  450.000000
normal         0.598651  0.687984  0.640216  516.000000
accuracy       0.586957  0.586957  0.586957    0.586957
macro avg      0.583508  0.579548  0.577702  966.000000
weighted avg   0.584542  0.586957  0.581974  966.000000
SVC  Image Accuracy:  0.6071428571428571
              precision    recall  f1-score     support
cirrhosis      0.616307  0.571111  0.592849  450.000000
normal         0.648452  0.689922  0.668545  516.000000
accuracy       0.634576  0.634576  0.

  warn(


RFC  Image Accuracy:  0.8507462686567164
              precision    recall  f1-score      support
fatty          0.899475  0.760304  0.824055  1577.000000
normal         0.502632  0.740310  0.598746   516.000000
accuracy       0.755375  0.755375  0.755375     0.755375
macro avg      0.701053  0.750307  0.711401  2093.000000
weighted avg   0.801639  0.755375  0.768508  2093.000000
MLP  Image Accuracy:  0.8208955223880597
              precision    recall  f1-score      support
fatty          0.866757  0.808497  0.836614  1577.000000
normal         0.514469  0.620155  0.562390   516.000000
accuracy       0.762064  0.762064  0.762064     0.762064
macro avg      0.690613  0.714326  0.699502  2093.000000
weighted avg   0.779906  0.762064  0.769008  2093.000000
SVC  Image Accuracy:  0.8059701492537313
              precision    recall  f1-score      support
fatty          0.876664  0.793278  0.832889  1577.000000
normal         0.510511  0.658915  0.575296   516.000000
accuracy       0.76015

In [30]:
# feat importance
files = ['fatty_normal', 'cirrhosis_fatty', 'cirrhosis_normal']
features_acc={}
for name in files:
    features_acc[name] = pd.read_csv(f'dataset/segment/manual selection/{name}.csv', index_col = 0)

In [31]:
# Normal-vs-fatty classifier trained on a hand-picked feature subset.
# NOTE(review): scikit-learn documents `momentum` as used only with
# solver='sgd', so it presumably has no effect with 'adam' - confirm.
normal_fatty_mlp = MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=31
                    )

# Rank the entries by their 'ANN Accuracy' score and keep the best 19; the
# index labels are used as feature column names below.
feat_imp = features_acc['fatty_normal']['ANN Accuracy'].sort_values(ascending=False)
normal_fatty_cols = feat_imp.index[0:19]

# Dropping 'cirrhosis' leaves a two-class normal/fatty problem. The fitted
# scaler and column list are kept for the ensemble step further down.
X_train, y_train, X_test, y_test, normal_fatty_std = split(data, test_data, 'cirrhosis', cols = normal_fatty_cols)
model, y_pred = train_test(normal_fatty_mlp, X_train, y_train, X_test, y_test)
normal_fatty_mlp = model
# Image-level accuracy from the majority vote over each image's ROIs.
prediction = images_pred(y_pred)
print("Normal/Fatty MLP Image Accuracy: ", images_acc(y_test, prediction))
# ROI-level precision/recall/F1.
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

Normal/Fatty MLP Image Accuracy:  0.8059701492537313
              precision    recall  f1-score      support
fatty          0.848315  0.766011  0.805065  1577.000000
normal         0.448430  0.581395  0.506329   516.000000
accuracy       0.720497  0.720497  0.720497     0.720497
macro avg      0.648373  0.673703  0.655697  2093.000000
weighted avg   0.749729  0.720497  0.731416  2093.000000


In [32]:
# Normal-vs-cirrhosis classifier trained on a hand-picked feature subset.
# NOTE(review): scikit-learn documents `momentum` as used only with
# solver='sgd', so it presumably has no effect with 'adam' - confirm.
normal_cirrhosis_mlp = MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=81
                    )

# Keep the 21 best-scoring entries and put 'length' in front of them.
# NOTE(review): if 'length' is already among those 21, the column list
# contains it twice - confirm against the CSV.
feat_imp = features_acc['cirrhosis_normal']['ANN Accuracy'].sort_values(ascending=False)
normal_cirrhosis_cols = feat_imp.index[0:21].insert(0,'length')
# Dropping 'fatty' leaves a two-class normal/cirrhosis problem. The fitted
# scaler and column list are kept for the ensemble step further down.
X_train, y_train, X_test, y_test, normal_cirrhosis_std = split(data, test_data, 'fatty', cols = normal_cirrhosis_cols)
model, y_pred = train_test(normal_cirrhosis_mlp, X_train, y_train, X_test, y_test)
normal_cirrhosis_mlp = model
# Image-level accuracy from the majority vote over each image's ROIs.
prediction = images_pred(y_pred)
print("normal/cirrhosis MLP Image Accuracy: ", images_acc(y_test, prediction))
# ROI-level precision/recall/F1.
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

normal/cirrhosis MLP Image Accuracy:  0.7142857142857143
              precision    recall  f1-score     support
cirrhosis      0.590818  0.657778  0.622503  450.000000
normal         0.668817  0.602713  0.634047  516.000000
accuracy       0.628364  0.628364  0.628364    0.628364
macro avg      0.629818  0.630245  0.628275  966.000000
weighted avg   0.632482  0.628364  0.628669  966.000000


In [33]:
# Fatty-vs-cirrhosis classifier trained on a hand-picked feature subset.
# NOTE(review): scikit-learn documents `momentum` as used only with
# solver='sgd', so it presumably has no effect with 'adam' - confirm.
fatty_cirrhosis_mlp = MLPClassifier(
                    max_iter=600,
                    momentum=0.6,
                    solver='adam',
                    activation='relu',
                    learning_rate_init=0.005,
                    alpha=0.001,
                    random_state=53
                    )

# Keep the 63 best-scoring entries and put 'length' in front of them.
# NOTE(review): if 'length' is already among those 63, the column list
# contains it twice - confirm against the CSV.
feat_imp = features_acc['cirrhosis_fatty']['ANN Accuracy'].sort_values(ascending=False)
fatty_cirrhosis_cols = feat_imp.index[0:63].insert(0,'length')

# Dropping 'normal' leaves a two-class fatty/cirrhosis problem. The fitted
# scaler and column list are kept for the ensemble step further down.
X_train, y_train, X_test, y_test, fatty_cirrhosis_std = split(data, test_data, 'normal', cols = fatty_cirrhosis_cols)
model, y_pred = train_test(fatty_cirrhosis_mlp, X_train, y_train, X_test, y_test)
fatty_cirrhosis_mlp = model
# Image-level accuracy from the majority vote over each image's ROIs.
prediction = images_pred(y_pred)
print("cirrhosis/Fatty MLP Image Accuracy: ", images_acc(y_test, prediction))
# ROI-level precision/recall/F1.
report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)

cirrhosis/Fatty MLP Image Accuracy:  0.8307692307692308
              precision    recall  f1-score      support
cirrhosis      0.500000  0.420000  0.456522   450.000000
fatty          0.841722  0.880152  0.860508  1577.000000
accuracy       0.777997  0.777997  0.777997     0.777997
macro avg      0.670861  0.650076  0.658515  2027.000000
weighted avg   0.765859  0.777997  0.770822  2027.000000


In [34]:
# Bundle each pairwise classifier with the scaler it was trained with and the
# feature columns it expects: (model, scaler, columns).
# This rebinds `models`, replacing the dict of candidate estimators used earlier.
models = {
    "normal_fatty": (normal_fatty_mlp, normal_fatty_std, normal_fatty_cols),
    "normal_cirrhosis": (normal_cirrhosis_mlp, normal_cirrhosis_std, normal_cirrhosis_cols),
    "fatty_cirrhosis": (fatty_cirrhosis_mlp, fatty_cirrhosis_std, fatty_cirrhosis_cols)
}

# Full test set (all three classes), split into features and labels.
X_test = test_data.copy()
y_test = X_test.pop('target')

In [35]:
# Run every pairwise model on the complete test set (all three classes) and
# keep its per-image majority-vote label.
predictions = {}
for name in models.keys():
    cols = models[name][2]
    # Start from the untouched test data on each pass, because X_test is
    # overwritten with the scaled subset below.
    X_test = test_data.copy()
    y_test = X_test.pop('target')
    # Scale with the scaler fitted for this model, restricted to its columns.
    X_test = pd.DataFrame(models[name][1].transform(X_test[cols]), columns = cols, index = X_test.index)
    X_test =  X_test[cols]
    y_pred = pd.Series(models[name][0].predict(X_test),index=y_test.index)
    predictions[name] = images_pred(y_pred)
    
# Distinct image names in the test set.
image_names = np.unique(y_test.index)

In [36]:
# Combine the pairwise models: each model casts one vote per image, and an
# image whose winning class has only a single vote is marked 'abstain'.
image_name = np.unique(y_test.index)
final_pred = {}
for image in image_name:
    pred = dict.fromkeys(('normal', 'fatty', 'cirrhosis'), 0)
    for model in predictions:
        pred[predictions[model][image]] += 1
    cls = max(pred, key=pred.get)
    final_pred[image] = cls if pred[cls] != 1 else 'abstain'

In [37]:
# Image-level accuracy of the combined vote. Abstained images are included in
# the denominator; 'abstain' presumably never equals a true label, so they
# count as misses here.
images_acc(y_test, final_pred)

0.6625

In [38]:
# Collapse the per-ROI labels to one label per image (first occurrence) and
# sort both series by image name so they line up.
y_test = y_test[~y_test.index.duplicated(keep='first')].sort_index()
y_pred = pd.Series(final_pred).sort_index()
# Images on which the vote was inconclusive.
abstain = y_pred[y_pred=='abstain'].index

# Score only the images that received a definite label.
y_test = y_test.drop(abstain)
y_pred = y_pred.drop(abstain)

report = classification_report(y_test, y_pred, output_dict = True)
cr = pd.DataFrame(report).transpose()
print(cr)
# y_pred no longer contains the abstained images, so the denominator adds
# them back to get the total number of voted images.
print("Abstention Rate: ", len(abstain)/(len(y_pred)+len(abstain)))

              precision    recall  f1-score    support
cirrhosis      0.375000  0.250000  0.300000  12.000000
fatty          0.785714  0.862745  0.822430  51.000000
normal         0.461538  0.428571  0.444444  14.000000
accuracy       0.688312  0.688312  0.688312   0.688312
macro avg      0.540751  0.513772  0.522291  77.000000
weighted avg   0.662766  0.688312  0.672288  77.000000
Abstention Rate:  0.0375
