## import libraries

In [5]:
import pandas as pd
import os
from collections import Counter
import numpy as np
from google.cloud import automl
from functools import partial

## data prep for automl

In [2]:
# Load the source CSV; the export loop further down reads its 'target' and
# 'Exam Result' columns to write one text file per row.
data = pd.read_csv('thyroid_ltd.csv')

In [3]:
data.shape

(652, 2)

In [8]:

# Write every report to its own text file, grouped into one folder per class
# ('single' / 'multiple') under data/dataset/.
for idx, row in data.iterrows():
    # Compare the label as text: if pandas parsed 'target' as integers, a bare
    # `== '1'` would never match and every report would land in 'multiple'.
    dirname = 'single' if str(row['target']).strip() == '1' else 'multiple'
    filename = "data/dataset/" + dirname + '/example_' + str(idx) + ".txt"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # The `with` block closes the file, so no explicit close() is needed.
    # UTF-8 is requested explicitly so the output does not depend on the
    # platform's default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(row['Exam Result'])

    #print (idx)
    #print(filename)

## predict using automl model

In [99]:

# Identifiers of the AutoML model that is queried below.
project_id = '666716883415'
model_id = 'TCN2248943282476285952'

# NOTE(review): `test` was not defined anywhere in this notebook, so this cell
# raised NameError on a fresh kernel. It is loaded here from the labelled test
# file that the evaluation section later aligns row-for-row with these
# predictions -- confirm this is the file the model was originally scored on.
test = pd.read_csv('thyroid_test_data.csv')

# Use the first test report as a smoke test for the prediction call.
content = test['Exam Result'].iloc[0]

prediction_client = automl.PredictionServiceClient()

# Get the full path of the model.
model_full_id = automl.AutoMlClient.model_path(project_id, "us-central1", model_id)

# Supported mime_types: 'text/plain', 'text/html'
# https://cloud.google.com/automl/docs/reference/rpc/google.cloud.automl.v1#textsnippet
text_snippet = automl.TextSnippet(content=content, mime_type="text/plain")
payload = automl.ExamplePayload(text_snippet=text_snippet)

response = prediction_client.predict(name=model_full_id, payload=payload)

# The response carries one payload entry per class, each with its own score.
for annotation_payload in response.payload:
    print(f"Predicted class name: {annotation_payload.display_name}")
    print(f"Predicted class score: {annotation_payload.classification.score}")

Predicted class name: single
Predicted class score: 0.9907870888710022
Predicted class name: multiple
Predicted class score: 0.009212881326675415


The cell below loops through every report in the test set, sends each one to the model, and collects the responses in a list.

In [107]:
# Query the model once per test report, keeping the raw responses in the same
# order as the rows of `test`.
responses = []
for report_text in test['Exam Result']:
    payload = automl.ExamplePayload(
        text_snippet=automl.TextSnippet(content=report_text, mime_type="text/plain")
    )
    responses.append(prediction_client.predict(name=model_full_id, payload=payload))

In [108]:
len(responses)

48

In [120]:
# Just have a look at the format of a single response
responses[0]

payload {
  annotation_spec_id: "6973062061425164288"
  classification {
    score: 0.9907871
  }
  display_name: "single"
}
payload {
  annotation_spec_id: "1208454538390929408"
  classification {
    score: 0.009212881
  }
  display_name: "multiple"
}

In [115]:
# Pre-allocate an empty frame: one row per response, one column per class.
results_df = pd.DataFrame(
    index=range(len(responses)),
    columns=['single', 'multiple'],
)

In [117]:
# Loop through the responses and copy each class score into the cell whose
# column matches the class name.
# A single .loc[row, column] assignment is used on purpose: chained indexing
# (`.iloc[i][name] = ...`) assigns into an intermediate row object and is not
# guaranteed to write through to the DataFrame.
for i, res in enumerate(responses):
    for payload in res.payload:
        results_df.loc[i, payload.display_name] = payload.classification.score

In [118]:
results_df.head()

Unnamed: 0,single,multiple
0,0.990787,0.009213
1,0.990224,0.009776
2,0.996347,0.003653
3,0.450748,0.549252
4,0.997189,0.002812


In [125]:
def get_predictions(df, thresh = 0.5):
    """Turn one row of class scores into a predicted label.

    Meant for row-wise use, e.g. ``results_df.apply(get_predictions, axis=1)``.

    Parameters
    ----------
    df : row-like object exposing a ``single`` attribute
        Despite the name this is a single row (such as the Series that
        ``DataFrame.apply(..., axis=1)`` passes in), not a whole DataFrame.
    thresh : float, default 0.5
        Minimum 'single' score required to predict the 'single' class.

    Returns
    -------
    str
        ``'1'`` when the 'single' score is at least ``thresh``, otherwise
        ``'>1'``. A NaN score compares as False and therefore yields ``'>1'``.
    """
    # Only the 'single' score drives the decision, so compare it directly
    # instead of building an intermediate array and a 0/1 code first.
    return '1' if df.single >= thresh else '>1'

In [126]:
# Label every row using the function's default threshold of 0.5.
results_df['predictions_0.5'] = results_df.apply(get_predictions, axis = 1)

In [127]:
results_df.head()

Unnamed: 0,single,multiple,predictions_0.5
0,0.990787,0.009213,1
1,0.990224,0.009776,1
2,0.996347,0.003653,1
3,0.450748,0.549252,>1
4,0.997189,0.002812,1


In [130]:
Counter(results_df['predictions_0.5'])

Counter({'1': 44, '>1': 4})

In [131]:
# Candidate cut-offs from 0.55 to 0.95 in steps of 0.05. Dividing whole
# percentages by 100 yields the same floats as the literals 0.55, 0.6, ...
thresholds = [pct / 100 for pct in range(55, 100, 5)]

In [133]:

# Example only: bind a fixed threshold so the function can be passed to
# DataFrame.apply. The threshold loop below rebinds `get_pred` per threshold.
get_pred = partial(get_predictions, thresh = 0.6)

In [134]:
get_pred

functools.partial(<function get_predictions at 0x000001633549BDC0>, thresh=0.6)

In [136]:
# Add one prediction column per threshold, named 'predictions_<threshold>'.
# DataFrame.apply forwards extra keyword arguments to the applied function,
# so the threshold can be passed straight through.
for thresh in thresholds:
    results_df[f'predictions_{thresh}'] = results_df.apply(
        get_predictions, axis=1, thresh=thresh
    )

In [138]:
results_df.head()

Unnamed: 0,single,multiple,predictions_0.5,predictions_0.55,predictions_0.6,predictions_0.65,predictions_0.7,predictions_0.75,predictions_0.8,predictions_0.85,predictions_0.9,predictions_0.95
0,0.990787,0.009213,1,1,1,1,1,1,1,1,1,1
1,0.990224,0.009776,1,1,1,1,1,1,1,1,1,1
2,0.996347,0.003653,1,1,1,1,1,1,1,1,1,1
3,0.450748,0.549252,>1,>1,>1,>1,>1,>1,>1,>1,>1,>1
4,0.997189,0.002812,1,1,1,1,1,1,1,1,1,1


In [139]:
# Print the class balance of every column after the two score columns.
for _, column in results_df.iloc[:, 2:].items():
    print(Counter(column))

Counter({'1': 44, '>1': 4})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 42, '>1': 6})
Counter({'1': 41, '>1': 7})
Counter({'1': 37, '>1': 11})


In [140]:
# Persist the scores and per-threshold labels. The index is not written, so
# row order is the only link back to the reports that were scored.
results_df.to_csv('test_predictions.csv', index = False)

## evaluate results

### reload predictions

In [2]:
# Reload the saved predictions so the evaluation can run without calling the
# model again.
preds = pd.read_csv('test_predictions.csv')
preds.head()

Unnamed: 0,single,multiple,predictions_0.5,predictions_0.55,predictions_0.6,predictions_0.65,predictions_0.7,predictions_0.75,predictions_0.8,predictions_0.85,predictions_0.9,predictions_0.95
0,0.990787,0.009213,1,1,1,1,1,1,1,1,1,1
1,0.990224,0.009776,1,1,1,1,1,1,1,1,1,1
2,0.996347,0.003653,1,1,1,1,1,1,1,1,1,1
3,0.450748,0.549252,>1,>1,>1,>1,>1,>1,>1,>1,>1,>1
4,0.997189,0.002812,1,1,1,1,1,1,1,1,1,1


### load labelled test set

In [3]:
# Load the test reports together with their labels (the 'labels' column is
# the ground truth used for the accuracy figures below).
test_df = pd.read_csv('thyroid_test_data.csv')
test_df.head()

Unnamed: 0,Accession,MRN,Sign off Rad,Exam Result,Date,labels
0,16851023.0,N000861996,FITZGERALD,FNA BIOPSY OF THYROID\r\n\r\nHISTORY: Request ...,2021-02-23,1
1,16851107.0,N000444488,FITZGERALD,FNA BIOPSY OF THYROID\r\n\r\nHISTORY: Request ...,2021-02-23,1
2,16851388.0,N000923373,FITZGERALD,FNA BIOPSY OF THYROID\r\n\r\nHISTORY: Request ...,2021-02-23,1
3,16854532.0,N000935827,CARVALHO,FINE-NEEDLE ASPIRATION THYROID NODULE\r\n\r\nP...,2021-02-24,2
4,16854824.0,N000504684,CARVALHO,FINE-NEEDLE ASPIRATION THYROID NODULE\r\n\r\nP...,2021-02-24,1


In [6]:
Counter(test_df.labels)

Counter({'1': 42, '2': 4, nan: 1, '>2': 1})

In [12]:
# Report text of every test row that has no label.
missing_label = test_df.loc[test_df.labels.isnull(), 'Exam Result']

In [14]:
# Print each unlabelled report in full to see why it has no label.
for report_text in missing_label:
    print(report_text)

******** ADDENDUM #1 ********
CYTOLOGY REPORT:

Left breast: FN21-290

LEFT BREAST 2:00 3 CM FN:
CYST CONTENTS.
Microscopic examination shows abundant proteinaceous debris admixed with scattered macrophages, few cyst lining cells and benign ductal cells. Cytologically malignant cells are not seen.

Right breast: FN21-290

RIGHT BREAST 9:00 at nipple:
CYST CONTENTS.
Microscopic examination shows abundant proteinaceous debris admixed with macrophages and apocrine cells. Cytologically malignant cells are not seen.

Summary: Concordant with imaging findings. Benign-appearing bilateral breast cysts.

BI-RADS 2

******** ORIGINAL REPORT ********
Comparison: 8/12/2020 and prior.

Bilateral simple breast cysts cysts are seen. The largest on the right breast measures 4.1 x 0.8 x 2.7 cm at the 9:00 radian, previously 2.9 x 1.5 x 5.8 cm.

The largest on the left measures 3.3 x 1.4 x 2.7 cm at the 2:00 radian, previously 3.8 x 1.2 x 3.2 cm.

An informed consent was obtained from the patient for as

In [16]:
# Attach the predictions to the labelled reports by joining on the row index;
# this relies on both frames listing the reports in the same order.
combo = test_df.merge(preds, left_index=True, right_index=True)

In [17]:
combo.head()

Unnamed: 0,Accession,MRN,Sign off Rad,Exam Result,Date,labels,single,multiple,predictions_0.5,predictions_0.55,predictions_0.6,predictions_0.65,predictions_0.7,predictions_0.75,predictions_0.8,predictions_0.85,predictions_0.9,predictions_0.95
0,16851023.0,N000861996,FITZGERALD,FNA BIOPSY OF THYROID\r\n\r\nHISTORY: Request ...,2021-02-23,1,0.990787,0.009213,1,1,1,1,1,1,1,1,1,1
1,16851107.0,N000444488,FITZGERALD,FNA BIOPSY OF THYROID\r\n\r\nHISTORY: Request ...,2021-02-23,1,0.990224,0.009776,1,1,1,1,1,1,1,1,1,1
2,16851388.0,N000923373,FITZGERALD,FNA BIOPSY OF THYROID\r\n\r\nHISTORY: Request ...,2021-02-23,1,0.996347,0.003653,1,1,1,1,1,1,1,1,1,1
3,16854532.0,N000935827,CARVALHO,FINE-NEEDLE ASPIRATION THYROID NODULE\r\n\r\nP...,2021-02-24,2,0.450748,0.549252,>1,>1,>1,>1,>1,>1,>1,>1,>1,>1
4,16854824.0,N000504684,CARVALHO,FINE-NEEDLE ASPIRATION THYROID NODULE\r\n\r\nP...,2021-02-24,1,0.997189,0.002812,1,1,1,1,1,1,1,1,1,1


In [18]:
len(test_df), len(combo), len(preds)

(48, 48, 48)

In [19]:
Counter(combo.labels)

Counter({'1': 42, '2': 4, nan: 1, '>2': 1})

In [20]:
# Discard only the rows that have no ground-truth label. A bare dropna() would
# also drop rows with a missing value in any unrelated column (e.g. Accession),
# silently shrinking the evaluation set.
combo = combo.dropna(subset=['labels'])

In [22]:
# Collapse the '2' and '>2' labels into the single '>1' class the predictions
# use. The column is reassigned explicitly: an in-place replace on
# `combo.labels` acts on an intermediate Series and is not guaranteed to
# update `combo` itself.
combo = combo.assign(labels=combo['labels'].replace({'2': '>1', '>2': '>1'}))

In [23]:
Counter(combo.labels)

Counter({'1': 42, '>1': 5})

In [24]:
combo.columns[8:]

Index(['predictions_0.5', 'predictions_0.55', 'predictions_0.6',
       'predictions_0.65', 'predictions_0.7', 'predictions_0.75',
       'predictions_0.8', 'predictions_0.85', 'predictions_0.9',
       'predictions_0.95'],
      dtype='object')

In [34]:
# Accuracy of each per-threshold prediction column against the true labels.
# Columns are selected and parsed by their 'predictions_' prefix rather than
# by position, so adding or reordering other columns cannot shift the slice.
prediction_prefix = 'predictions_'
accs = []
for col in combo.columns:
    if not col.startswith(prediction_prefix):
        continue
    thresh = col[len(prediction_prefix):]
    # Fraction of rows where the predicted label equals the true label.
    acc = (combo.labels == combo[col]).mean()
    accs.append((f'threshold: {thresh}', f'accuracy: {acc}'))

accs

[('threshold: 0.5', 'accuracy: 0.9574468085106383'),
 ('threshold: 0.55', 'accuracy: 0.9787234042553191'),
 ('threshold: 0.6', 'accuracy: 0.9787234042553191'),
 ('threshold: 0.65', 'accuracy: 0.9787234042553191'),
 ('threshold: 0.7', 'accuracy: 0.9787234042553191'),
 ('threshold: 0.75', 'accuracy: 0.9787234042553191'),
 ('threshold: 0.8', 'accuracy: 0.9787234042553191'),
 ('threshold: 0.85', 'accuracy: 0.9574468085106383'),
 ('threshold: 0.9', 'accuracy: 0.9361702127659575'),
 ('threshold: 0.95', 'accuracy: 0.851063829787234')]