In [1]:
import pandas as pd
import numpy as np

# Import data

In [2]:
data = pd.read_csv("data/data.txt", sep = ',')
gt = pd.read_excel("data/ground_truth.xlsx")

In [3]:
features = [113, 173, 475, 489, 603]
col = [int(i) for i in list(np.linspace(0, 150, 76))]

In [4]:
df = data.iloc[features][data.columns[col]].T
df = df.sort_index()

In [5]:
gt.columns = gt.iloc[1]
gt = gt.iloc[2:78,1:10]
gt = gt.sort_values("LESION")

In [6]:
df

Unnamed: 0,113,173,475,489,603
adenoma_1,0.037433,269.0,0.97006,92.307,0.23727
adenoma_10,0.034214,218.0,0.95297,108.560,0.22117
adenoma_11,0.033058,241.0,0.95286,82.827,0.32428
adenoma_12,0.025324,245.0,0.96353,48.178,0.16127
adenoma_13,0.035764,372.0,0.98178,111.580,0.20508
...,...,...,...,...,...
serrated_5,0.034913,292.0,0.97679,87.662,0.27799
serrated_6,0.036592,401.0,0.98770,123.790,0.27492
serrated_7,0.022274,514.0,0.84982,129.410,0.30094
serrated_8,0.030119,187.0,0.92142,76.131,0.30621


In [7]:
gt

1,LESION,GROUND TRUTH,EXPERT 1,EXPERT 2,EXPERT 3,EXPERT 4,BEGINNER 1,BEGINNER 2,BEGINNER 3
54,adenoma_1,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,serrated
11,adenoma_10,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma
12,adenoma_11,adenoma,adenoma,adenoma,hyperplasic,adenoma,adenoma,adenoma,adenoma
13,adenoma_12,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma
14,adenoma_13,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma,adenoma
...,...,...,...,...,...,...,...,...,...
39,serrated_5,serrated,adenoma,serrated,serrated,adenoma,serrated,adenoma,adenoma
52,serrated_6,serrated,serrated,serrated,serrated,serrated,serrated,adenoma,hyperplasic
40,serrated_7,serrated,serrated,hyperplasic,serrated,serrated,hyperplasic,serrated,hyperplasic
2,serrated_8,serrated,serrated,adenoma,serrated,adenoma,serrated,adenoma,adenoma


# Random Forest for each annotator

In [8]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix

In [13]:
result = {'accuracy': {},
          'probability': {}}
for _ in range(50):

    X_train, X_test, y_train_gt, y_test = train_test_split(df, gt['GROUND TRUTH'], stratify=gt['GROUND TRUTH'], test_size=0.3)

    for annotator in gt.columns[1:]:
        if len(result['accuracy'].keys()) < len(gt.columns[1:]):
            result['accuracy'][annotator] = []
            result['probability'][annotator] = []

        y_train = gt.loc[y_train_gt.index, annotator]

        clf = RandomForestClassifier(max_depth=2)
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        pred_proba = clf.predict_proba(X_test)
        # print(clf.classes_)
        cm = confusion_matrix(y_test, pred)

        # Calculate overall accuracy
        accuracy = cm.diagonal().sum() / cm.sum()
        result['accuracy'][annotator].append(accuracy)
        result['probability'][annotator].append(pred_proba)

In [14]:
pd.Series(y_train_gt).value_counts()

adenoma        28
hyperplasic    15
serrated       10
Name: GROUND TRUTH, dtype: int64

In [15]:
pd.Series(y_test).value_counts()

adenoma        12
hyperplasic     6
serrated        5
Name: GROUND TRUTH, dtype: int64

# Weights based on Majority Vote

In [16]:
labels = gt[gt.columns[2:]]
majority_label = labels.mode(axis=1).iloc[:, 0]
scores = [0] * labels.shape[1]

for n, annotator in enumerate(labels.columns):
    y_train = gt.loc[y_train_gt.index, annotator]
    scores[n] = (labels[annotator] == majority_label).sum()
scores /= sum(scores)

# Combine the prediction

In [59]:
final_pred_prob = [0] * 50
final_pred = [0] * 50
for s in range(50):
    for n, annotator in enumerate(labels.columns):
        final_pred_prob[s] += result['probability'][annotator][s] * scores[n]

    final_pred[s] = final_pred_prob[s].argmax(axis=1)
    final_pred[s] = (clf.classes_)[final_pred[s]]
    
final_accuracy = 0
for s in range(50):
    cm_final = confusion_matrix(y_test, final_pred[s])
    final_accuracy[s] += cm.diagonal().sum() / cm.sum()
final_accuracy /= 50

TypeError: 'int' object is not subscriptable

In [52]:
mean_accuracy_single_model = {key: np.mean(values) for key, values in result['accuracy'].items()}
mean_accuracy_single_model

{'GROUND TRUTH': 0.577391304347826,
 'EXPERT 1': 0.5599999999999999,
 'EXPERT 2': 0.5721739130434782,
 'EXPERT 3': 0.568695652173913,
 'EXPERT 4': 0.606086956521739,
 'BEGINNER 1': 0.48782608695652163,
 'BEGINNER 2': 0.5173913043478261,
 'BEGINNER 3': 0.5782608695652174}

In [53]:
final_accuracy

0.6521739130434776

In [31]:
final_pred_prob

AttributeError: 'list' object has no attribute 'shape'

In [23]:
result['probability']['EXPERT 1'][0]

array([[0.62831845, 0.14646844, 0.22521312],
       [0.56833124, 0.13186889, 0.29979988],
       [0.49170373, 0.24928871, 0.25900756],
       [0.44770505, 0.19281823, 0.35947671],
       [0.44272695, 0.4031832 , 0.15408984],
       [0.4767194 , 0.11158416, 0.41169644],
       [0.39534642, 0.43481397, 0.16983961],
       [0.61399036, 0.14895828, 0.23705136],
       [0.33357889, 0.39541352, 0.27100759],
       [0.57931902, 0.19985858, 0.2208224 ],
       [0.5685916 , 0.20370907, 0.22769933],
       [0.59841154, 0.24863935, 0.1529491 ],
       [0.62152496, 0.09444698, 0.28402806],
       [0.67133056, 0.10139969, 0.22726975],
       [0.58330176, 0.17647097, 0.24022727],
       [0.57650491, 0.09547593, 0.32801916],
       [0.65426406, 0.13782603, 0.20790991],
       [0.55696966, 0.15337859, 0.28965175],
       [0.66828384, 0.14991439, 0.18180177],
       [0.42586157, 0.39046548, 0.18367294],
       [0.43963409, 0.3351478 , 0.2252181 ],
       [0.62361887, 0.08783531, 0.28854582],
       [0.