# Download and Predict

This notebook downloads the old Delphi models and the corresponding D3M datasets from S3
and calculates the score that each old model obtains when run on the TEST partition of
its dataset.

### Configuration

In [1]:
import io
import json
import os
import shutil
import tarfile
import zlib

import boto3
import pandas as pd
from sklearn.externals.joblib.numpy_pickle import NumpyUnpickler

# Shared S3 resource handle used for every download in this notebook.
s3 = boto3.resource('s3')

# Bucket and key prefix under which the result CSVs and the pickled models
# are stored (see the CSV download loop and download_model).
atm_bucket_name = 'atm-data-store'
atm_path = 'gp+bandit-search/'

# Bucket and key prefix under which the D3M dataset archives (<name>.tar.gz)
# are stored (see download_d3m_dataset).
d3m_bucket_name = 'd3m-data-dai'
d3m_path = 'datasets/'

atm_bucket = s3.Bucket(atm_bucket_name)
d3m_bucket = s3.Bucket(d3m_bucket_name)

### Download Delphi CSVs

In [2]:
# download_file() writes to the given local path and does not create missing
# directories, so make sure the target folder exists on a fresh checkout.
if not os.path.isdir('csvs'):
    os.makedirs('csvs')

# NOTE(review): 'd3m_atm_overlap' is commented out of this list, yet
# csvs/d3m_atm_overlap.csv is read right after the downloads, so that file
# has to be present locally already.
for table in ('datasets', 'dataruns', 'classifiers'):   #, 'd3m_atm_overlap'):
    csv_name = 'csvs/{}.csv'.format(table)
    print('Downloading file {} from S3'.format(csv_name))
    atm_bucket.download_file(Key=atm_path + csv_name, Filename=csv_name)

Downloading file csvs/datasets.csv from S3
Downloading file csvs/dataruns.csv from S3
Downloading file csvs/classifiers.csv from S3


In [3]:
# Load the tables as DataFrames.
# NOTE(review): d3m_atm_overlap.csv is not fetched by the download loop (it is
# commented out there), so it must already exist in csvs/.
# `overlap` is later read through its 'name' and 'name_d3m' columns.
overlap = pd.read_csv('csvs/d3m_atm_overlap.csv')
datasets = pd.read_csv('csvs/datasets.csv')
dataruns = pd.read_csv('csvs/dataruns.csv')
classifiers = pd.read_csv('csvs/classifiers.csv')

In [4]:
def get_row(df, column, value):
    """Return the single row of ``df`` whose ``column`` equals ``value``.

    Args:
        df: DataFrame to search.
        column: name of the column to compare.
        value: value the column has to be equal to.

    Returns:
        The matching row as a pandas Series.

    Raises:
        IndexError: if no row matches.
        Exception: if more than one row matches.
    """
    rows = df[df[column] == value]

    if rows.empty:
        # Same exception type ``rows.iloc[0]`` would raise on an empty frame,
        # but with a message that says what was being looked up.
        raise IndexError("No row found with {} == {!r}".format(column, value))

    if len(rows) > 1:
        raise Exception("More than one row found")

    return rows.iloc[0]

# Sanity check: look up one known dataset by name and display its row.
get_row(datasets, 'name', 'Australian_1')

dataset_id                                          4
name                                     Australian_1
train_path      data/processed/Australian_1_train.csv
test_path        data/processed/Australian_1_test.csv
class_column                                        0
Name: 3, dtype: object

In [5]:
# Dataset whose best classifier is looked up at the end of this cell.
dataset_name = 'Australian_1'

def get_classifier(dataset_name):
    """Return the best-scoring classifier row recorded for a dataset.

    The dataset name is resolved to its ``dataset_id``, that id to its single
    datarun, and among the classifiers of that datarun the one with the
    highest ``test_judgment_metric`` is returned as a pandas Series.
    """
    dataset_id = get_row(datasets, 'name', dataset_name).dataset_id
    datarun_id = get_row(dataruns, 'dataset_id', dataset_id).datarun_id

    candidates = classifiers.loc[classifiers.datarun_id == datarun_id]
    ranked = candidates.sort_values('test_judgment_metric', ascending=False)

    return ranked.iloc[0]

# Best classifier row for the example dataset.
classifier = get_classifier(dataset_name)

In [6]:
def download_model(model_location):
    """Fetch a stored model from the ATM bucket and unpickle it.

    ``model_location`` is the object key relative to ``atm_path``. The object
    body is zlib-decompressed and then loaded with joblib's ``NumpyUnpickler``.

    NOTE(security): unpickling can execute arbitrary code, so this must only
    be used on objects from a trusted bucket.
    """
    compressed = atm_bucket.Object(atm_path + model_location).get()['Body'].read()
    stream = io.BytesIO(zlib.decompress(compressed))
    return NumpyUnpickler('', stream).load()

# Load the trained model that belongs to the selected classifier row.
model = download_model(classifier.model_location)

gnumpy: failed to import cudamat. Using npmat instead. No GPU will be used.




In [7]:
def download_d3m_dataset(d3m_dataset):
    """Download ``<d3m_dataset>.tar.gz`` from the D3M bucket and unpack it.

    The archive is saved into the current working directory, extracted there
    and then deleted. The archive is removed even when extraction fails, so a
    corrupt download does not leave a stray file behind.

    Args:
        d3m_dataset: name of the dataset; also the base name of the archive.
    """
    archive = d3m_dataset + '.tar.gz'
    key = d3m_path + archive
    d3m_bucket.download_file(Key=key, Filename=archive)

    try:
        # NOTE(security): extractall() trusts the member paths stored in the
        # archive; only use it on archives from a trusted bucket.
        with tarfile.open(archive) as tar:
            tar.extractall()
    finally:
        # Cleanup
        os.remove(archive)

def get_d3m_data(dataset_name):
    """Download a D3M dataset and return its TEST features, targets and metric.

    The D3M name is taken from the ``name_d3m`` column of the ``overlap``
    row whose ``name`` equals ``dataset_name``. The extracted dataset
    directory is deleted before returning, including when reading or parsing
    fails part-way through.

    Args:
        dataset_name: dataset name as listed in the ``overlap`` table.

    Returns:
        tuple: ``(X, y, metric)`` where ``X`` holds the values of the TEST
        ``learningData.csv`` without the target column (the first CSV column
        is used as the index), ``y`` the flattened values of
        ``SCORE/targets.csv`` and ``metric`` the name of the first
        performance metric listed in the problem document.
    """
    d3m_dataset = get_row(overlap, 'name', dataset_name).name_d3m

    try:
        download_d3m_dataset(d3m_dataset)

        problem_doc_path = d3m_dataset + '/SCORE/problem_TEST/problemDoc.json'
        with open(problem_doc_path) as f:
            problem_doc = json.load(f)

        learning_data_path = d3m_dataset + '/SCORE/dataset_TEST/tables/learningData.csv'
        X = pd.read_csv(learning_data_path, index_col=0)
        del X[problem_doc['inputs']['data'][0]['targets'][0]['colName']]

        y = pd.read_csv(d3m_dataset + '/SCORE/targets.csv', index_col=0)

        metric = problem_doc['inputs']['performanceMetrics'][0]['metric']

    finally:
        # Cleanup. Guarded so that a failed download/extraction, which may not
        # have created the directory, does not mask the original error.
        if os.path.isdir(d3m_dataset):
            shutil.rmtree(d3m_dataset)

    return X.values, y.values.ravel(), metric

# TEST features, true targets and metric name for the example dataset.
X, y, metric = get_d3m_data('Australian_1')

In [8]:
# Predict on the TEST features with the downloaded model.
y_pred = model.predict(X, input_type='vector')

In [9]:
# NOTE(review): `metrics` is not a standard-library module - presumably a
# project-local file next to this notebook; consider importing it in the
# first cell together with the other imports.
from metrics import METRICS_DICT

# Look up the scoring function by the lower-cased metric name and apply it to
# the true targets and the predictions.
METRICS_DICT[metric.lower()](y, y_pred)

  'precision', 'predicted', average, warn_for)
  'recall', 'true', average, warn_for)


0.2703962703962704

In [10]:
def score_model(dataset_name):
    """Run the stored model of ``dataset_name`` on its D3M TEST data.

    Returns:
        tuple: ``(y, y_pred, metric, atm_score)`` - the true targets, the
        model's predictions, the D3M metric name and the
        ``test_judgment_metric`` recorded for the classifier. The metric
        itself is not evaluated here.
    """
    print('Scoring dataset {}'.format(dataset_name))

    best = get_classifier(dataset_name)
    trained_model = download_model(best.model_location)

    features, targets, metric_name = get_d3m_data(dataset_name)
    predictions = trained_model.predict(features, input_type='vector')

    return targets, predictions, metric_name, best.test_judgment_metric

# Try the helper on the first dataset listed in the overlap table.
y, y_pred, metric, atm_score = score_model(overlap.iloc[0]['name'])

Scoring dataset Australian_1


In [11]:
# Per-dataset results keyed by dataset name. Every entry holds 'y', 'metric',
# 'atm_score' and 'model', plus a 'status' of 'SCORED', 'SCORE_ERROR' or
# 'PREDICTION_ERROR' and, depending on the outcome, 'y_pred', 'score' or 'X'.
values = dict()
for _, row in overlap.iterrows():
    name = row['name']
    print('Scoring dataset {}'.format(name))
    
    # NOTE(review): these calls sit outside the try blocks below, so a failed
    # lookup or download aborts the whole loop instead of being recorded.
    classifier = get_classifier(name)
    model = download_model(classifier.model_location)
    atm_score = classifier.test_judgment_metric
    X, y, metric = get_d3m_data(name)
    
    # Registered before predicting so datasets that fail later still get an entry.
    dataset_values = {
        'y': y,
        'metric': metric,
        'atm_score': atm_score,
        'model': model
    }
    values[name] = dataset_values
    
    try:
        # y, y_pred, metric, atm_score = score_model(name)
        
        y_pred = model.predict(X, input_type='vector')
        
        dataset_values['y_pred'] = y_pred
        
    except Exception as e:
        # Keep the features of failing datasets so the error can be inspected later.
        print('    PREDICTION_ERROR: {}'.format(e))
        dataset_values['X'] = X
        dataset_values['status'] = 'PREDICTION_ERROR'
    
    else:
        try:
            # NOTE(review): the recorded output shows "Mix of label input types"
            # failures; y holds the raw target labels while y_pred holds numeric
            # values. The disabled mapping below looks like an attempt to align
            # them - `labels` is not defined in the visible cells.
            #if name in labels:
            #    y = pd.Series(y).apply(labels[name].get).values
                
            score = METRICS_DICT[metric.lower()](y, y_pred)
            
            dataset_values['score'] = score
            dataset_values['status'] = 'SCORED'
            print('    SCORE: {}'.format(score))
            
        except Exception as e:
            print('    SCORE_ERROR: {}'.format(e))
            dataset_values['status'] = 'SCORE_ERROR'

Scoring dataset Australian_1
    SCORE: 0.270396270396
Scoring dataset CostaMadre1_1
    SCORE_ERROR: Mix of label input types (string and number)
Scoring dataset MegaWatt1_1
    SCORE_ERROR: Mix of label input types (string and number)
Scoring dataset SPECT_1




    PREDICTION_ERROR: operands could not be broadcast together with shapes (81,66) (22,) (81,66) 
Scoring dataset ailerons_1
    SCORE: 0.583480143932
Scoring dataset autoPrice_1
    SCORE: 160622064.63
Scoring dataset backache_1
    PREDICTION_ERROR: operands could not be broadcast together with shapes (100,32) (31,) (100,32) 
Scoring dataset badges2_1
    PREDICTION_ERROR: operands could not be broadcast together with shapes (100,11) (10,) (100,11) 
Scoring dataset balloon_1


*nolearn.lasagne* for a more modern neural net toolkit.

  """)


    PREDICTION_ERROR: arrays not aligned for dot product. a dot product was requested of arrays with shapes (1, 2) and (1, 299)
Scoring dataset banana_1
    SCORE: 0.027027027027
Scoring dataset bank32nh_1
    SCORE: 0.221107630974
Scoring dataset bodyfat_1
    SCORE: 414.9618
Scoring dataset boston_1
    SCORE: 511.564356436
Scoring dataset collins_1
    PREDICTION_ERROR: could not convert string to float: R03.TXT
Scoring dataset diabetes_1
    SCORE_ERROR: Mix of label input types (string and number)
Scoring dataset ecoli_1
    SCORE_ERROR: Mix of label input types (string and number)
Scoring dataset elevators_1
    SCORE: 0.258988973787
Scoring dataset flags_1
    PREDICTION_ERROR: could not convert string to float: green
Scoring dataset glass_1
    SCORE_ERROR: Mix of label input types (string and number)
Scoring dataset haberman_1
    SCORE: 0.425287356322
Scoring dataset housing_1
    SCORE_ERROR: Mix of label input types (string and number)
Scoring dataset ilpd_1
    PREDICTION_

In [15]:
# Transpose so that each dataset becomes a row and each recorded key a column.
dfv = pd.DataFrame(values).T

In [26]:
# Preview the first few datasets.
dfv.head()

Unnamed: 0,X,atm_score,metric,model,score,status,y,y_pred
Australian_1,,0.903553,f1Macro,<delphi.model.Model object at 0x7f4ca415ab90>,0.270396,SCORED,"[-1, 1, 1, -1, -1, -1, 1, -1, -1, -1, -1, -1, ...","[0.0, 1.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, ..."
CostaMadre1_1,,0.545455,f1Macro,<delphi.model.Model object at 0x7f4c52bffa10>,,SCORE_ERROR,"[N, N, N, N, Y, N, N, N, N, Y, N, Y, N, Y, N, ...","[0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, ..."
MegaWatt1_1,,0.4,f1Macro,<delphi.model.Model object at 0x7f4ca415ac50>,,SCORE_ERROR,"[N, Y, N, N, N, Y, N, N, N, N, N, N, N, Y, N, ...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
SPECT_1,"[[59, 52, 70, 67, 73, 66, 72, 61, 58, 52, 72, ...",0.931298,f1,<delphi.model.Model object at 0x7f4c50917e50>,,PREDICTION_ERROR,"[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, ...",
ailerons_1,,0.901053,meanSquaredError,<delphi.model.Model object at 0x7f4c508ccc90>,0.58348,SCORED,"[-0.0009, -0.0009, -0.0009, -0.001, -0.0009, -...","[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 1.0, ..."


In [31]:
# Distinct true target values of each dataset.
dfv['y_classes'] = dfv.y.apply(pd.unique)

In [32]:
# Number of distinct true target values of each dataset.
dfv['n_classes'] = dfv.y_classes.apply(len)

In [51]:
# Number of distinct predicted values per dataset. Rows without predictions
# hold a float (NaN) in y_pred and are counted as 0.
dfv['pred_classes'] = dfv.y_pred.apply(
    lambda pred: 0 if isinstance(pred, float) else len(pd.unique(pred))
)

In [52]:
# Keep only the columns needed for the overview table (drops models and raw arrays).
summary = dfv[['status', 'score', 'metric', 'n_classes', 'pred_classes', 'y_classes']]

In [53]:
# Persist the overview next to the notebook.
summary.to_csv('summary.csv')

In [54]:
# Use the fully-qualified option name: the abbreviated 'max_rows' pattern is
# ambiguous on newer pandas releases (it also matches 'styler.render.max_rows')
# and raises an OptionError there.
pd.set_option('display.max_rows', 100)

In [55]:
# Display the overview table.
summary

Unnamed: 0,status,score,metric,n_classes,pred_classes,y_classes
Australian_1,SCORED,0.270396,f1Macro,2,2,"[-1, 1]"
CostaMadre1_1,SCORE_ERROR,,f1Macro,2,2,"[N, Y]"
MegaWatt1_1,SCORE_ERROR,,f1Macro,2,2,"[N, Y]"
SPECT_1,PREDICTION_ERROR,,f1,2,0,"[1, 0]"
ailerons_1,SCORED,0.58348,meanSquaredError,26,2,"[-0.0009, -0.001, -0.0006, -0.0008, -0.0007, -..."
autoPrice_1,SCORED,160622000.0,meanSquaredError,97,2,"[13950.0, 17710.0, 16430.0, 16925.0, 6295.0, 6..."
backache_1,PREDICTION_ERROR,,f1Macro,2,0,"[0, 1]"
badges2_1,PREDICTION_ERROR,,f1Macro,2,0,"[-, +]"
balloon_1,PREDICTION_ERROR,,meanSquaredError,177,0,"[0.09, 0.056, 0.073, 0.049, 0.011, 0.029, 0.02..."
banana_1,SCORED,0.027027,f1Macro,2,2,"[2, 1]"
