## Data prep for AutoML

In [1]:
import pandas as pd

In [2]:
# Load the labelled reports; later cells read the 'target' and 'Exam Result' columns.
data = pd.read_csv('thyroid_ltd.csv')

In [3]:
# Sanity check: (number of rows, number of columns) of the loaded frame.
data.shape

(652, 2)

In [8]:
import os

# Write each report to its own text file, grouped into one folder per label:
#   data/dataset/single/example_<idx>.txt    when target is '1'
#   data/dataset/multiple/example_<idx>.txt  otherwise
for idx, row in data.iterrows():
    # Compare as text so the check still works if pandas parsed `target`
    # as integers instead of strings.
    dirname = 'single' if str(row['target']).strip() == '1' else 'multiple'
    filename = f"data/dataset/{dirname}/example_{idx}.txt"
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    # The `with` block closes the file on exit, so no explicit close() is
    # needed. Explicit UTF-8 keeps the output independent of the platform's
    # default encoding.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(row['Exam Result'])

## Predict using the AutoML model

In [99]:
from google.cloud import automl

# Project and model identifiers used to address the AutoML model.
project_id = '666716883415'
model_id = 'TCN2248943282476285952'


# Use the first test report as a one-off example.
# NOTE(review): `test` is not defined in any cell visible here -- confirm where it is loaded.
content = test['Exam Result'].iloc[0]

prediction_client = automl.PredictionServiceClient()

# Build the fully-qualified model path (region: us-central1).
model_full_id = automl.AutoMlClient.model_path(project_id, "us-central1", model_id)

# Wrap the raw text in the request payload.
# Supported mime_types: 'text/plain', 'text/html'
# https://cloud.google.com/automl/docs/reference/rpc/google.cloud.automl.v1#textsnippet
text_snippet = automl.TextSnippet(content=content, mime_type="text/plain")
payload = automl.ExamplePayload(text_snippet=text_snippet)

response = prediction_client.predict(name=model_full_id, payload=payload)

# Each entry in the response carries one class label and its score.
for annotation_payload in response.payload:
    print(f"Predicted class name: {annotation_payload.display_name}")
    print(f"Predicted class score: {annotation_payload.classification.score}")

Predicted class name: single
Predicted class score: 0.9907870888710022
Predicted class name: multiple
Predicted class score: 0.009212881326675415


Loop through all the reports in the test set and collect the model's responses in a list.

In [107]:
# One prediction request per test report; responses are kept in report order.
responses = [
    prediction_client.predict(
        name=model_full_id,
        payload=automl.ExamplePayload(
            text_snippet=automl.TextSnippet(content=rep, mime_type="text/plain")
        ),
    )
    for rep in test['Exam Result']
]

In [108]:
# One response was appended per test report, so this is the number of reports scored.
len(responses)

48

In [120]:
# Inspect the format of a single response: one payload entry per class label.
responses[0]

payload {
  annotation_spec_id: "6973062061425164288"
  classification {
    score: 0.9907871
  }
  display_name: "single"
}
payload {
  annotation_spec_id: "1208454538390929408"
  classification {
    score: 0.009212881
  }
  display_name: "multiple"
}

In [115]:
# Empty frame: one row per response, one column per class label returned by the model.
results_df = pd.DataFrame(index=range(len(responses)), columns=['single', 'multiple'])

In [117]:
# Fill each row with the scores from the matching response; the class label
# (display_name) selects the column.
# Use a single .at[row, column] assignment so the value is written into
# results_df itself. Chained indexing (.iloc[i][col] = ...) assigns to an
# intermediate row object and is not guaranteed to update the frame
# (it is a silent no-op under pandas Copy-on-Write).
for i, res in enumerate(responses):
    for payload in res.payload:
        results_df.at[i, payload.display_name] = payload.classification.score

In [118]:
# Preview the per-class scores written into the frame.
results_df.head()

Unnamed: 0,single,multiple
0,0.990787,0.009213
1,0.990224,0.009776
2,0.996347,0.003653
3,0.450748,0.549252
4,0.997189,0.002812


In [124]:
import numpy as np

In [125]:
def get_predictions(df, thresh = 0.5):
    """Convert one row of class scores into a predicted label.

    Intended for ``DataFrame.apply(..., axis=1)``, which passes one row at a
    time.

    Args:
        df: A single row (any object exposing a ``single`` attribute, e.g. a
            pandas Series with a ``single`` entry). Only the ``single`` score
            is consulted.
        thresh: Minimum ``single`` score (inclusive) needed to return ``'1'``.

    Returns:
        ``'1'`` when ``df.single >= thresh``, otherwise ``'>1'``. A missing
        (NaN) score compares as False and therefore yields ``'>1'``.
    """
    # Compare the score directly: no per-row array or intermediate 0/1 code is needed.
    return '1' if df.single >= thresh else '>1'

In [126]:
# Label every row at the default 0.5 threshold (axis=1 passes one row at a time).
results_df['predictions_0.5'] = results_df.apply(get_predictions, axis = 1)

In [127]:
# Preview the frame with the new predictions_0.5 column.
results_df.head()

Unnamed: 0,single,multiple,predictions_0.5
0,0.990787,0.009213,1
1,0.990224,0.009776,1
2,0.996347,0.003653,1
3,0.450748,0.549252,>1
4,0.997189,0.002812,1


In [130]:
from collections import Counter
# Count how many reports received each label at the 0.5 threshold.
Counter(results_df['predictions_0.5'])

Counter({'1': 44, '>1': 4})

In [131]:
# Additional cut-offs for the `single` score; each one gets its own predictions column below.
thresholds = [0.55, 0.6, 0.65, 0.7, 0.75, 0.8, 0.85, 0.9, 0.95]

In [133]:
from functools import partial
# Example: bind thresh=0.6 so the function can be handed to apply() as a one-argument callable.
# (get_pred is rebound for every threshold in the loop further down.)
get_pred = partial(get_predictions, thresh = 0.6)

In [134]:
# Show the partial object to confirm which threshold is bound.
get_pred

functools.partial(<function get_predictions at 0x000001633549BDC0>, thresh=0.6)

In [136]:
# Add one label column per threshold, named predictions_<threshold>.
for thresh in thresholds:
    # Bind the current cut-off so apply() can call the function with just the row.
    get_pred = partial(get_predictions, thresh=thresh)
    col_name = f'predictions_{thresh}'
    results_df[col_name] = results_df.apply(get_pred, axis=1)

In [138]:
# Preview the frame with one predictions_<threshold> column per cut-off.
results_df.head()

Unnamed: 0,single,multiple,predictions_0.5,predictions_0.55,predictions_0.6,predictions_0.65,predictions_0.7,predictions_0.75,predictions_0.8,predictions_0.85,predictions_0.9,predictions_0.95
0,0.990787,0.009213,1,1,1,1,1,1,1,1,1,1
1,0.990224,0.009776,1,1,1,1,1,1,1,1,1,1
2,0.996347,0.003653,1,1,1,1,1,1,1,1,1,1
3,0.450748,0.549252,>1,>1,>1,>1,>1,>1,>1,>1,>1,>1
4,0.997189,0.002812,1,1,1,1,1,1,1,1,1,1


In [139]:
# Label counts for every prediction column; the first two columns hold the raw scores.
for col in results_df.columns[2:]:
    print(Counter(results_df[col]))

Counter({'1': 44, '>1': 4})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 43, '>1': 5})
Counter({'1': 42, '>1': 6})
Counter({'1': 41, '>1': 7})
Counter({'1': 37, '>1': 11})


In [140]:
# Save the scores and per-threshold labels; index=False leaves the row index out of the file.
results_df.to_csv('test_predictions.csv', index = False)