Merge pull request #81 from HDI-Project/bcyphers/python3

Upgrade to Python 3
HDI-Project · Feb 13, 2018 · 959bdc2 · 959bdc2
2 parents 9b201ec + 4bf64fb
commit 959bdc2
Show file tree

Hide file tree

Showing 17 changed files with 81 additions and 55 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -18,6 +18,6 @@ jobs:
       - checkout
       - run: apt-get -qq update
       - run: apt-get -qq -y install git mysql-client libmysqlclient-dev
-      - run: pyenv local 2.7.13 # 3.5.2 3.6.0
+      - run: pyenv local 2.7.13 3.5.2 3.6.0
       - run: make installdeps
       - run: make lint && tox && codecov
diff --git a/README.md b/README.md
@@ -50,7 +50,7 @@ This section describes the quickest way to get started with ATM on a modern mach
    ```
 
 3. **Install python dependencies**
-   - python=2.7
+  ATM is tested with Python 2.7+ and Python 3.5+.
    ```
    $ virtualenv venv
    $ . venv/bin/activate

diff --git a/atm/__init__.py b/atm/__init__.py
@@ -2,6 +2,7 @@
 A multi-user, multi-data AutoML framework.
 """
 from __future__ import absolute_import
+from __future__ import unicode_literals
 import logging
 import os
 

diff --git a/atm/config.py b/atm/config.py
@@ -1,11 +1,12 @@
-from __future__ import absolute_import
+from __future__ import absolute_import, unicode_literals
 
 import logging
 import os
 import re
 import socket
 import sys
 from argparse import ArgumentError, ArgumentTypeError, RawTextHelpFormatter
+from builtins import map, object, str
 
 import yaml
 
@@ -233,7 +234,7 @@ def add_arguments_logging(parser):
                         help='If set, compute full ROC and PR curves and '
                         'per-label metrics for each classifier')
 
-    log_levels = map(str.lower, LOG_LEVELS.keys())
+    log_levels = list(map(str.lower, list(LOG_LEVELS.keys())))
     parser.add_argument('--log-level-file', choices=log_levels,
                         help='minimum log level to write to the log file')
     # if this is being called from the command line, print more information to
@@ -516,7 +517,7 @@ def load_config(sql_path=None, run_path=None, aws_path=None, log_path=None, **kw
     # kwargs are most likely generated by argparse.
     # Any unspecified argparse arguments will be None, so ignore those. We only
     # care about arguments explicitly specified by the user.
-    kwargs = {k: v for k, v in kwargs.items() if v is not None}
+    kwargs = {k: v for k, v in list(kwargs.items()) if v is not None}
 
     # check the keyword args for config paths
     sql_path = sql_path or kwargs.get('sql_config')
@@ -541,13 +542,13 @@ def load_config(sql_path=None, run_path=None, aws_path=None, log_path=None, **kw
             log_args = yaml.load(f)
 
     # Use keyword args to override yaml config values
-    sql_args.update({k.replace('sql_', ''): v for k, v in kwargs.items()
+    sql_args.update({k.replace('sql_', ''): v for k, v in list(kwargs.items())
                      if 'sql_' in k})
-    aws_args.update({k.replace('aws_', ''): v for k, v in kwargs.items()
+    aws_args.update({k.replace('aws_', ''): v for k, v in list(kwargs.items())
                      if 'aws_' in k})
-    run_args.update({k: v for k, v in kwargs.items() if k in
+    run_args.update({k: v for k, v in list(kwargs.items()) if k in
                      RunConfig.PARAMETERS})
-    log_args.update({k: v for k, v in kwargs.items() if k in
+    log_args.update({k: v for k, v in list(kwargs.items()) if k in
                      LogConfig.PARAMETERS})
 
     # It's ok if there are some extra arguments that get passed in here; only

diff --git a/atm/constants.py b/atm/constants.py
@@ -1,7 +1,8 @@
-from __future__ import absolute_import
+from __future__ import absolute_import, unicode_literals
 
 import logging
 import os
+from builtins import object
 
 from . import PROJECT_ROOT
 
@@ -84,32 +85,32 @@
 }
 
 
-class ClassifierStatus:
+class ClassifierStatus(object):
     RUNNING = 'running'
     ERRORED = 'errored'
     COMPLETE = 'complete'
 
 
-class RunStatus:
+class RunStatus(object):
     PENDING = 'pending'
     RUNNING = 'running'
     COMPLETE = 'complete'
 
 
-class PartitionStatus:
+class PartitionStatus(object):
     INCOMPLETE = 'incomplete'
     GRIDDING_DONE = 'gridding_done'
     ERRORED = 'errored'
 
 
-class FileType:
+class FileType(object):
     LOCAL = 'local'
     S3 = 's3'
     HTTP = 'http'
 
 
 # these are the strings that are used to index into results dictionaries
-class Metrics:
+class Metrics(object):
     ACCURACY = 'accuracy'
     RANK_ACCURACY = 'rank_accuracy'
     COHEN_KAPPA = 'cohen_kappa'

diff --git a/atm/database.py b/atm/database.py
@@ -1,7 +1,8 @@
-from __future__ import absolute_import
+from __future__ import absolute_import, unicode_literals
 
 import json
 import pickle
+from builtins import object
 from datetime import datetime
 from operator import attrgetter
 
@@ -278,7 +279,7 @@ def mu_sigma_judgment_metric(self):
 
             def __repr__(self):
                 params = ', '.join(['%s: %s' % i for i in
-                                    self.hyperparameter_values.items()])
+                                    list(self.hyperparameter_values.items())])
                 return "<id=%d, params=(%s)>" % (self.id, params)
 
         Datarun.classifiers = relationship('Classifier',
@@ -334,7 +335,7 @@ def from_csv(self, path):
 
             for _, r in df.iterrows():
                 # replace NaN and NaT with None
-                for k, v in r.items():
+                for k, v in list(r.items()):
                     if pd.isnull(v):
                         r[k] = None
 

diff --git a/atm/encoder.py b/atm/encoder.py
@@ -1,5 +1,10 @@
+from __future__ import division, unicode_literals
+
+from builtins import object
+
 import numpy as np
 import pandas as pd
+from past.utils import old_div
 from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 
 
@@ -21,7 +26,7 @@ def __init__(self, class_column, train_path, test_path=None):
         for c in data.columns:
             if data[c].dtype == 'object':
                 total_features += len(np.unique(data[c])) - 1
-        majority_percentage = float(max(counts)) / float(sum(counts))
+        majority_percentage = old_div(float(max(counts)), float(sum(counts)))
 
         self.n_examples = data.shape[0]
         self.d_features = total_features
@@ -97,7 +102,7 @@ def transform(self, data):
         features = data[self.feature_columns]
 
         # encode each categorical feature as an integer
-        for column, encoder in self.column_encoders.items():
+        for column, encoder in list(self.column_encoders.items()):
             features[column] = encoder.transform(features[column])
 
         # one-hot encode the categorical features

diff --git a/atm/enter_data.py b/atm/enter_data.py
@@ -1,9 +1,12 @@
-from __future__ import absolute_import
+from __future__ import absolute_import, division, unicode_literals
 
 import logging
 import os
+from builtins import map
 from datetime import datetime, timedelta
 
+from past.utils import old_div
+
 from .config import *
 from .constants import *
 from .database import Database
@@ -45,7 +48,7 @@ def create_dataset(db, run_config, aws_config=None):
                                 k_classes=meta.k_classes,
                                 d_features=meta.d_features,
                                 majority=meta.majority,
-                                size_kb=meta.size / 1000)
+                                size_kb=old_div(meta.size, 1000))
     return dataset
 
 
@@ -128,7 +131,7 @@ def enter_data(sql_config, run_config, aws_config=None,
         datarun = create_datarun(db, dataset, run_config)
 
     logger.debug('saving hyperpartions...')
-    for method, parts in method_parts.items():
+    for method, parts in list(method_parts.items()):
         for part in parts:
             # if necessary, create a new datarun for each hyperpartition.
             # This setting is useful for debugging.

diff --git a/atm/method.py b/atm/method.py
@@ -1,8 +1,8 @@
-from __future__ import absolute_import
+from __future__ import absolute_import, unicode_literals
 
 import json
 from builtins import str as newstr
-from builtins import object
+from builtins import object, range
 from os.path import join
 
 from .constants import METHOD_PATH, METHODS_MAP
@@ -158,14 +158,14 @@ def __init__(self, method):
 
         # create hyperparameters from the parameter config
         self.parameters = {}
-        for k, v in config['hyperparameters'].items():
+        for k, v in list(config['hyperparameters'].items()):
             param_type = HYPERPARAMETER_TYPES[v['type']]
             self.parameters[k] = param_type(name=k, **v)
 
         # List hyperparameters are special. These are replaced in the
         # CPT with a size hyperparameter and sets of element hyperparameters
         # conditioned on the size.
-        for name, param in self.parameters.items():
+        for name, param in list(self.parameters.items()):
             if type(param) == List:
                 elements, conditions = param.get_elements()
                 for e in elements:
@@ -182,8 +182,8 @@ def __init__(self, method):
                     self.root_params.remove(param.name)
 
                 # if this is a conditional param, replace it there instead
-                for var, cond in self.conditions.items():
-                    for val, deps in cond.items():
+                for var, cond in list(self.conditions.items()):
+                    for val, deps in list(cond.items()):
                         if param.name in deps:
                             deps.append(param.length.name)
                             deps.remove(param.name)

diff --git a/atm/metrics.py b/atm/metrics.py
@@ -1,7 +1,10 @@
-from __future__ import absolute_import
+from __future__ import absolute_import, division, unicode_literals
+
+from builtins import range
 
 import numpy as np
 import pandas as pd
+from past.utils import old_div
 from sklearn.metrics import (accuracy_score, average_precision_score,
                              cohen_kappa_score, f1_score, matthews_corrcoef,
                              precision_recall_curve, roc_auc_score, roc_curve)
@@ -34,7 +37,7 @@ def rank_n_accuracy(y_true, y_prob_mat, n=0.33):
         if y_true[i] in rankings[i, :]:
             correct_sample_count += 1
 
-    return correct_sample_count / num_samples
+    return old_div(correct_sample_count, num_samples)
 
 
 def get_per_class_matrix(y, classes=None):
@@ -94,7 +97,7 @@ def get_metrics_binary(y_true, y_pred, y_pred_probs, include_curves=False):
     any_probs_nan = np.any(np.isnan(y_pred_probs))
     if not any_probs_nan:
         # AP can be computed even if all labels are the same
-        y_true_bin = get_per_class_matrix(y_true, range(2))
+        y_true_bin = get_per_class_matrix(y_true, list(range(2)))
         results[Metrics.AP] = average_precision_score(y_true_bin, y_pred_probs)
 
         if not all_labels_same:

diff --git a/atm/model.py b/atm/model.py
@@ -3,16 +3,18 @@
    :synopsis: Model around classification method.
 
 """
-from __future__ import absolute_import
+from __future__ import absolute_import, division, unicode_literals
 
 import logging
 import re
 import time
+from builtins import object
 from collections import defaultdict
 from importlib import import_module
 
 import numpy as np
 import pandas as pd
+from past.utils import old_div
 from sklearn import decomposition
 from sklearn.gaussian_process.kernels import (RBF, ConstantKernel,
                                               ExpSineSquared, Matern,
@@ -98,9 +100,9 @@ def make_pipeline(self):
         steps = []
 
         # create a classifier with specified parameters
-        hyperparameters = {k: v for k, v in self.params.iteritems()
+        hyperparameters = {k: v for k, v in list(self.params.items())
                            if k not in Model.ATM_KEYS}
-        atm_params = {k: v for k, v in self.params.iteritems()
+        atm_params = {k: v for k, v in list(self.params.items())
                       if k in Model.ATM_KEYS}
 
         # do special conversions
@@ -157,7 +159,7 @@ def test_final_model(self, X, y):
         # time the prediction
         start_time = time.time()
         total = time.time() - start_time
-        self.avg_predict_time = total / float(len(y))
+        self.avg_predict_time = old_div(total, float(len(y)))
 
         # TODO: this is hacky. See https://github.com/HDI-Project/ATM/issues/48
         binary = self.num_classes == 2
@@ -246,7 +248,7 @@ def special_conversions(self, params):
         # create list parameters
         lists = defaultdict(list)
         element_regex = re.compile('(.*)\[(\d)\]')
-        for name, param in params.items():
+        for name, param in list(params.items()):
             # look for variables of the form "param_name[1]"
             match = element_regex.match(name)
             if match:
@@ -259,7 +261,7 @@ def special_conversions(self, params):
                 # drop the element parameter from our list
                 del params[name]
 
-        for lname, items in lists.items():
+        for lname, items in list(lists.items()):
             # drop the list size parameter
             del params['len(%s)' % lname]
 

diff --git a/atm/tests/unit_tests/test_method.py b/atm/tests/unit_tests/test_method.py
@@ -32,7 +32,7 @@ def test_enumerate():
     hps = Method(config_path).get_hyperpartitions()
 
     assert len(hps) == 12
-    assert all('a' in zip(*hp.categoricals)[0] for hp in hps)
+    assert all('a' in list(zip(*hp.categoricals))[0] for hp in hps)
     assert all(('f', 0.5) in hp.constants for hp in hps)
     assert len([hp for hp in hps if hp.tunables
-                and 'b' in zip(*hp.tunables)[0]]) == 1
+                and 'b' in list(zip(*hp.tunables))[0]]) == 1