In [1]:
import pandas as pd
import numpy as np

# Import data

In [2]:
data = pd.read_csv("data/data.txt", sep = ',')
gt = pd.read_excel("data/ground_truth.xlsx")

In [3]:
features = [113, 173, 475, 489, 603]
col = [int(i) for i in list(np.linspace(0, 150, 76))]

In [4]:
df = data.iloc[features][data.columns[col]].T
df = df.sort_index()

In [5]:
gt.columns = gt.iloc[1]
gt = gt.iloc[2:78,1:10]
gt = gt.sort_values("LESION")

In [6]:
df

Unnamed: 0,113,173,475,489,603
adenoma_1,0.037433,269.0,0.97006,92.307,0.23727
adenoma_10,0.034214,218.0,0.95297,108.560,0.22117
adenoma_11,0.033058,241.0,0.95286,82.827,0.32428
adenoma_12,0.025324,245.0,0.96353,48.178,0.16127
adenoma_13,0.035764,372.0,0.98178,111.580,0.20508
...,...,...,...,...,...
serrated_5,0.034913,292.0,0.97679,87.662,0.27799
serrated_6,0.036592,401.0,0.98770,123.790,0.27492
serrated_7,0.022274,514.0,0.84982,129.410,0.30094
serrated_8,0.030119,187.0,0.92142,76.131,0.30621


In [7]:
gt

1,LESION,GROUND TRUTH,EXPERT 1,EXPERT 2,EXPERT 3,EXPERT 4,BEGINNER 1,BEGINNER 2,BEGINNER 3
54,adenoma_1,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,serrated
11,adenoma_10,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma
12,adenoma_11,adenoma,adenoma,adenoma,hyperplasic,adenoma,adenoma,adenoma,adenoma
13,adenoma_12,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma
14,adenoma_13,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma
...,...,...,...,...,...,...,...,...,...
39,serrated_5,serrated,adenoma,serrated,serrated,adenoma,serrated,adenoma,adenoma
52,serrated_6,serrated,serrated,serrated,serrated,serrated,serrated,adenoma,hyperplasic
40,serrated_7,serrated,serrated,hyperplasic,serrated,serrated,hyperplasic,serrated,hyperplasic
2,serrated_8,serrated,serrated,adenoma,serrated,adenoma,serrated,adenoma,adenoma


# Random Forest for each annotator

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [228]:
result = {'accuracy': {},
          'probability': {}}
for _ in range(50):

    X_train, X_test, y_train_gt, y_test = train_test_split(df, gt['GROUND TRUTH'], stratify=gt['GROUND TRUTH'], test_size=0.3)

    for annotator in gt.columns[1:]:

        y_train = gt.loc[y_train_gt.index, annotator]

        clf = RandomForestClassifier(max_depth=2)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        pred_proba = clf.predict_proba(X_test)
        # print(clf.classes_)
        cm = confusion_matrix(y_test, pred)

        # Calculate overall accuracy
        accuracy = cm.diagonal().sum() / cm.sum()
        result['accuracy'][annotator] = accuracy
        result['probability'][annotator] = pred_proba

In [229]:
pd.Series(y_train_gt).value_counts()

adenoma        28
hyperplasic    15
serrated       10
Name: GROUND TRUTH, dtype: int64

In [230]:
pd.Series(y_test).value_counts()

adenoma        12
hyperplasic     6
serrated        5
Name: GROUND TRUTH, dtype: int64

# Weights based on Majority Vote

In [231]:
labels = gt[gt.columns[2:]]
majority_label = labels.mode(axis=1).iloc[:, 0]
scores = [0] * labels.shape[1]

for n, annotator in enumerate(labels.columns):
    y_train = gt.loc[y_train_gt.index, annotator]
    scores[n] = (labels[annotator] == majority_label).sum()
scores /= sum(scores)

# Combine the prediction

In [232]:
final_pred_prob = 0
for n, annotator in enumerate(labels.columns):
    final_pred_prob += result['probability'][annotator] * scores[n]

final_pred = final_pred_prob.argmax(axis=1)
final_pred = (clf.classes_)[final_pred]

cm_final = confusion_matrix(y_test, final_pred)
final_accuracy = cm.diagonal().sum() / cm.sum()

In [233]:
result['accuracy']

{'GROUND TRUTH': 0.6086956521739131,
 'EXPERT 1': 0.5217391304347826,
 'EXPERT 2': 0.6086956521739131,
 'EXPERT 3': 0.5217391304347826,
 'EXPERT 4': 0.6086956521739131,
 'BEGINNER 1': 0.5217391304347826,
 'BEGINNER 2': 0.5217391304347826,
 'BEGINNER 3': 0.5652173913043478}

In [234]:
final_accuracy

0.5652173913043478