In [1]:
import re

In [2]:
import boto3

# S3 resource handle; every bucket used in this notebook is created from it.
s3 = boto3.resource('s3')

In [3]:
# Name of the S3 bucket the ATM dataset listing is read from.
bucket_name = 'atm-data-store'
# Key prefix inside that bucket; remote paths are built as base_path + '/...'.
base_path = 'gp+bandit-search'

In [4]:
# Bucket handle for `bucket_name`; `open_remote` below reads this global.
bucket = s3.Bucket(bucket_name)

In [5]:
def read_csv(bucket, path, *args, **kwargs):
    """Download a CSV object from an S3 bucket and parse it with pandas.

    Args:
        bucket: Bucket-like object exposing ``Object(key).get()``.
        path: Key of the CSV object inside the bucket.
        *args: Extra positional arguments forwarded to ``pd.read_csv``.
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        The result of ``pd.read_csv`` applied to the downloaded bytes.
    """
    # The message reads "file ... bucket ...", so the key goes first. Show the
    # bucket's name when it has one rather than the repr of the resource.
    bucket_label = getattr(bucket, 'name', bucket)
    print('Downloading file {} from S3 bucket {}'.format(path, bucket_label))
    body = bucket.Object(path).get()['Body'].read()
    with io.BytesIO(body) as buf:
        return pd.read_csv(buf, *args, **kwargs)

def to_csv(df, bucket, key, *args, **kwargs):
    """Serialize a DataFrame to CSV and upload it to an S3 bucket.

    Args:
        df: Object exposing ``to_csv(buffer, ...)`` (a pandas DataFrame).
        bucket: Bucket-like object exposing ``Object(key).put(Body=...)``.
        key: Destination key inside the bucket.
        *args: Extra positional arguments forwarded to ``df.to_csv``.
        **kwargs: Extra keyword arguments forwarded to ``df.to_csv``.
    """
    # The message reads "file ... bucket ...", so the key goes first. Show the
    # bucket's name when it has one rather than the repr of the resource.
    bucket_label = getattr(bucket, 'name', bucket)
    print('Uploading file {} to S3 bucket {}'.format(key, bucket_label))
    with io.StringIO() as buf:
        df.to_csv(buf, *args, **kwargs)
        # Upload the whole CSV text in a single put.
        bucket.Object(key).put(Body=buf.getvalue())

### Get ATM Datasets

#### Download

In [6]:
import io
import zlib

import pandas as pd

def open_remote(path, compressed=False):
    """Fetch an object from the notebook-level ``bucket`` as an in-memory file.

    Args:
        path: Key of the object inside ``bucket``.
        compressed: When true, the downloaded bytes are passed through
            ``zlib.decompress`` before being wrapped.

    Returns:
        An ``io.BytesIO`` positioned at the start of the (decompressed) content.
    """
    payload = bucket.Object(path).get()['Body'].read()
    return io.BytesIO(zlib.decompress(payload) if compressed else payload)

# Read the ATM dataset listing stored under `base_path` into a DataFrame.
with open_remote(base_path + '/csvs/datasets.csv') as f:
    atm = pd.read_csv(f)

In [7]:
# (rows, columns) of the ATM listing as loaded, before any cleanup.
atm.shape

(420, 5)

#### Cleanup

In [8]:
# Raw string so that `\d` reaches the regex engine without relying on Python
# passing an invalid string escape through (which emits a warning).
# NOTE(review): the pattern is unanchored, so it removes every underscore and
# any digits right after it, not only a trailing "_<n>" suffix; for example
# "AP_Endometrium_Prostate_1" becomes "APEndometriumProstate". Confirm whether
# r'_\d+$' was intended; left unchanged here because the overlap results
# below depend on the current behavior.
RE_ATM_SUFFIX = re.compile(r'_\d*')

# Build the "clean" name by deleting every RE_ATM_SUFFIX match from `name`.
atm['clean'] = atm.name.replace(RE_ATM_SUFFIX, '', regex=True)

# Drop datasets that end up sharing the "clean" name.
# keep=False discards every member of a duplicated group, not just the extras.
atm.drop_duplicates(subset=['clean'], inplace=True, keep=False)
atm.shape

(274, 6)

In [9]:
# Preview the cleaned ATM listing, including the new `clean` column.
atm.head()

Unnamed: 0,dataset_id,name,train_path,test_path,class_column,clean
0,1,2dplanes_1,data/processed/2dplanes_1_train.csv,data/processed/2dplanes_1_test.csv,0,2dplanes
1,2,AP_Endometrium_Prostate_1,data/processed/AP_Endometrium_Prostate_1_train...,data/processed/AP_Endometrium_Prostate_1_test.csv,0,APEndometriumProstate
2,3,Amazon_employee_access_1,data/processed/Amazon_employee_access_1_train.csv,data/processed/Amazon_employee_access_1_test.csv,0,Amazonemployeeaccess
3,4,Australian_1,data/processed/Australian_1_train.csv,data/processed/Australian_1_test.csv,0,Australian
4,5,BNG(breast-w)_1,data/processed/BNG(breast-w)_1_train.csv,data/processed/BNG(breast-w)_1_test.csv,0,BNG(breast-w)


### Get D3M Datasets

#### Get list

In [10]:
d3m_bucket = s3.Bucket('d3m-data-dai')

# Keys of every object stored under the "datasets" prefix.
d3m_keys = [obj.key for obj in d3m_bucket.objects.filter(Prefix='datasets')]

# Raw string so the regex escapes (`\.`) are not treated as string escapes.
RE_DATASET = re.compile(r'^datasets/(.*)\.tar\.gz')

# Dataset name = the part of the key between "datasets/" and ".tar.gz".
# Keys that do not look like an archive (e.g. a bare "datasets/" placeholder)
# are skipped instead of raising AttributeError on a failed match.
d3m_datasets = [
    found.group(1)
    for found in map(RE_DATASET.match, d3m_keys)
    if found is not None
]

#### Cleanup

In [11]:
# Matches the leading "<prefix>_" of a D3M dataset name plus an optional
# "<digits>_" group right after it (e.g. "124_153_" or "uu4_").
# Raw string so that `\d` reaches the regex engine without relying on Python
# passing an invalid string escape through (which emits a warning).
RE_D3M_PREFIX = re.compile(r'^[^_]+_(\d*_)?')

In [12]:
# One row per D3M dataset name collected above.
d3m = pd.DataFrame({'name': d3m_datasets})
# Build the "clean" name by deleting the RE_D3M_PREFIX match from `name`.
d3m['clean'] = d3m.name.replace(RE_D3M_PREFIX, '', regex=True)

# Drop datasets that end up sharing the "clean" name.
# keep=False discards every member of a duplicated group, not just the extras.
d3m.drop_duplicates(subset=['clean'], keep=False, inplace=True)

In [13]:
# Preview the D3M names next to their cleaned form.
d3m.head()

Unnamed: 0,name,clean
0,124_120_mnist,mnist
1,124_138_cifar100,cifar100
2,124_153_svhn_cropped,svhn_cropped
3,124_174_cifar10,cifar10
4,124_178_coil100,coil100


### Overlap

In [14]:
# ATM rows whose clean name also appears among the D3M clean names.
match = atm[atm.clean.isin(d3m.clean)].copy()
# Join on the clean name; D3M columns that collide with ATM ones (here `name`)
# get the "_d3m" suffix, ATM columns keep their original names.
merged = match.merge(d3m, on='clean', suffixes=('', '_d3m'))
merged.shape

(66, 7)

In [15]:
# Preview the overlap: ATM columns plus the matching D3M dataset name.
merged.head()

Unnamed: 0,dataset_id,name,train_path,test_path,class_column,clean,name_d3m
0,4,Australian_1,data/processed/Australian_1_train.csv,data/processed/Australian_1_test.csv,0,Australian,LL0_40509_Australian
1,12,CostaMadre1_1,data/processed/CostaMadre1_1_train.csv,data/processed/CostaMadre1_1_test.csv,0,CostaMadre1,LL0_1446_CostaMadre1
2,16,MegaWatt1_1,data/processed/MegaWatt1_1_train.csv,data/processed/MegaWatt1_1_test.csv,0,MegaWatt1,LL0_1442_MegaWatt1
3,26,SPECT_1,data/processed/SPECT_1_train.csv,data/processed/SPECT_1_test.csv,0,SPECT,uu4_SPECT
4,33,ailerons_1,data/processed/ailerons_1_train.csv,data/processed/ailerons_1_test.csv,0,ailerons,LL0_296_ailerons


In [16]:
# Write the overlap table to a local CSV file, without the index column.
# NOTE(review): this writes to the local `csvs/` directory, not to S3; the
# `to_csv(df, bucket, key)` helper defined earlier is never used. Confirm
# which destination is intended.
merged.to_csv('csvs/d3m_atm_overlap.csv', index=False)