<a href="https://colab.research.google.com/github/HSE-LAMBDA/mldm-2019/blob/master/day-2/contest/NoFreeLunch_competition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import os
import numpy as np
from sklearn.model_selection import train_test_split
from multiprocessing import Process
import time
import pickle
import importlib
from copy import deepcopy

In [0]:
def check_sample(c1, c2, size):
  assert isinstance(c1, np.ndarray), "c1 is not an ndarray"
  assert isinstance(c2, np.ndarray), "c2 is not an ndarray"
  assert c1.ndim == c2.ndim == 2, "got non-2d array"
  assert c1.shape[0] == c2.shape[0] == size, "generate returned an array of wrong length"
  assert c1.shape[1] == c2.shape[1] == 2, "generate returned an array with wrong number of features (expected 2), got: {} and {}".format(c1.shape[1], c2.shape[1])

def merge_ds(c0, c1):
  features = np.concatenate([c0, c1], axis=0)
  answers  = np.ones(shape=(len(c0) + len(c1), 1), dtype=float)
  answers[:len(c0)] = 0.
  result = np.concatenate([features, answers], axis=1)
  np.random.shuffle(result)
  return result

def run_test(dataset, model):
  dataset = dataset.copy()
  data_train, data_test = train_test_split(dataset, test_size=0.2)
  model.fit(data_train[:,:-1], data_train[:,-1])
  predictions = np.array(model.predict(data_test[:,:-1])).squeeze().astype(float)
  assert isinstance(predictions, np.ndarray), "your model predictions must be an ndarray"
  assert predictions.ndim == 1, "your model predictions must be a 1d vector"
  assert len(predictions) == len(data_test), "the length of predictions doesn't equal the lenght of input"

  accuracy = (predictions == data_test[:,-1]).mean()
  return accuracy

In [0]:
def run_test_wrapper(dataset, model, filename):
  try:
    result = run_test(dataset, model)
  except Exception as e:
    result = str(e)
  with open(filename, 'wb') as f:
    pickle.dump(result, f)

def run_test_limited(dataset, model, limit=10):
  filename = 'tmp.pickle'
  p = Process(target=run_test_wrapper, args=(dataset, model, filename))
  p.start()
  p.join(timeout=limit)
  p.terminate()
  time.sleep(1)
  try:
    with open(filename, 'rb') as f:
      result = pickle.load(f)
    os.remove(filename)
    return result
  except:
    pass

  return "Timeout"

In [0]:
TEST_LIMIT = 10
SAMPLE_HALF_SIZE = 400

class User:
  counter = 0
  def __init__(self, name, gen_code, model_code):
    User.counter += 1
    self.name = name
    self.gen_code = gen_code
    self.model_code = model_code
    self.valid = True
    self.scores = {}
    self.scores_T = {}
    self.error = None
    
    fname_gen = "tmp_script_gen_{}".format(User.counter)
    fname_gen_py = fname_gen + '.py'
    if os.path.isfile(fname_gen_py):
      os.remove(fname_gen_py)

    with open(fname_gen_py, 'w') as f:
      f.write(gen_code)

    fname_model = 'tmp_script_model_{}'.format(User.counter)
    fname_model_py = fname_model + '.py'
    if os.path.isfile(fname_model_py):
      os.remove(fname_model_py)
    with open(fname_model_py, 'w') as f:
      f.write(model_code)
      
    try:
      self.m_gen   = importlib.import_module(fname_gen)
      self.m_model = importlib.import_module(fname_model)
      self.generate = self.m_gen.generate
      self.model    = self.m_model.model
      if isinstance(self.model, type):
        self.model = self.model()
      
      c0 = self.generate(SAMPLE_HALF_SIZE, 0).astype(float)
      c1 = self.generate(SAMPLE_HALF_SIZE, 1).astype(float)
      check_sample(c0, c1, SAMPLE_HALF_SIZE)
      self.dataset = merge_ds(c0, c1)

    except Exception as e:
      self.error = str(e)
      self.generate = None
      self.model = None
      self.dataset = None
      self.valid = False

    if self.valid:
      for i in range(4):
        test_result = run_test_limited(self.dataset, self.get_model(), TEST_LIMIT)
        if isinstance(test_result, str):
          self.error = test_result
          self.valid = False
        elif test_result < 0.55:
          self.error = 'the model failed attempt #{} (score = {}) on its own dataset'.format(i + 1, test_result)
          self.valid = False
        else:
          self.error = None
          self.valid = True
          self.scores[self.name] = test_result
          self.scores_T[self.name] = test_result
          break

  def get_model(self):
    return deepcopy(self.model)

In [0]:
import requests
import pandas as pd
from io import StringIO

key = '1Vvv4I4n8p8N3HKk_gZqajD07MuU8hBRcnz27B22D_Xc'
format_str = 'https://docs.google.com/spreadsheets/d/{}/export?format=csv'

data = requests.get(format_str.format(key)).content
df = pd.read_csv(StringIO(data.decode()))

In [6]:
df

Unnamed: 0,Отметка времени,Your name,Data generating function,Model
0,17.09.2019 15:19:53,Baseline_test1,"import numpy as np\n\ndef generate(n, c):\n ...",from sklearn.neighbors import KNeighborsClassi...
1,17.09.2019 15:23:15,Baseline_test2,"def generate(n, c):\n if c == 0:\n ...",from sklearn.neighbors import KNeighborsClassi...
2,17.09.2019 15:24:53,Baseline_test3,"import numpy as np\n\ndef generate(n, c):\n ...",from sklearn.neighbors import KNeighborsClassi...
3,17.09.2019 17:32:16,Boris,"def generate(n, c):\n if c == 0:\n r...",from lightgbm import LGBMClassifier\nfrom skle...
4,17.09.2019 17:34:01,Boris (fixed),"import numpy as np\n\ndef generate(n, c):\n ...",from lightgbm import LGBMClassifier\nfrom skle...
5,17.09.2019 17:46:24,Ravil Khairullin,import random\nfrom operator import xor\ndef g...,model = LogisticRegression()
6,17.09.2019 17:47:32,Khairullin Ravil (2),import random\nfrom operator import xor\ndef g...,from sklearn.linear_model import LogisticRegre...
7,18.09.2019 9:55:37,Eric George Parakal,"def generate(n, c):\n \n if c == 0:\n ...",import xgboost as xgb\n\nxgb_model = xgb.XGBCl...
8,19.09.2019 15:05:23,Ruben Freire,"import numpy as np\ndef generate(n, c):\n ...",from sklearn.neighbors import KNeighborsClassi...
9,19.09.2019 22:51:38,yulia_gurova_s1,"import numpy as np\ndef generate(n,c):\n if c...",from sklearn.neighbors import KNeighborsClassi...


In [0]:
from tqdm import tqdm

In [0]:
def experiment():
  users = []

  for i, entry in df[['Your name', 'Data generating function', 'Model']].iterrows():
    if entry.isna().any(): continue
    name, code_gen, code_model = entry
    if any([name == u.name for u in users]):
      continue

    start_time = time.time()
    users.append(User(name, code_gen, code_model))
    elapsed = time.time() - start_time
    print("User", name, "tested in", elapsed, "seconds")


  for u1 in tqdm(users):
    if not u1.valid:
      continue

    for u2 in users:
      if not u2.valid:
        continue
      
      if u2.name in u1.scores:
        continue
      
      test_result = run_test_limited(u2.dataset, u1.get_model(), TEST_LIMIT)
      if test_result == 'Timeout':
        u1.scores  [u2.name] = -1.
        u2.scores_T[u1.name] = -1.
      else:
        u1.scores  [u2.name] = test_result
        u2.scores_T[u1.name] = test_result
    

  for u in users:
    print(u.name, u.valid, u.error)
    print("  scores:")
    for k, v in u.scores.items():
      try:
        str_v = '{:.3}'.format(v)
      except:
        str_v = str(v)
      print('    {:20} : {}'.format(k, str_v))

  scores = pd.DataFrame([(u.name,
                          np.mean(list(u.scores  .values())),
                          np.mean(list(u.scores_T.values()))) for u in users if u.valid],
                         columns=['username', 'mean_model_score', 'mean_dataset_loss'])
  return scores

In [0]:
import pickle

In [10]:
results = []
for i in range(10):
  print('Running experiment #', i)
  results.append(experiment())

with open("results.pkl", 'wb') as f:
  pickle.dump(results, f)

Running experiment # 0
User Baseline_test1 tested in 1.1803760528564453 seconds
User Baseline_test2 tested in 0.001486063003540039 seconds
User Baseline_test3 tested in 1.0316426753997803 seconds
User Boris tested in 0.05986833572387695 seconds
User Boris (fixed) tested in 1.1137495040893555 seconds
User Ravil Khairullin tested in 0.0020682811737060547 seconds
User Khairullin Ravil (2) tested in 0.0042040348052978516 seconds
User Eric George Parakal tested in 0.13057374954223633 seconds
User Ruben Freire tested in 0.0014677047729492188 seconds
User yulia_gurova_s1 tested in 0.0021152496337890625 seconds
User Ruben Freire (2) tested in 1.0354244709014893 seconds
User yulia_gurova_s2 tested in 1.025989055633545 seconds




User Anna Beketova tested in 4.120912075042725 seconds
User Anna Graessel tested in 1.0375969409942627 seconds
User Fouzi Takelait tested in 0.002881288528442383 seconds
User Eric_George_Parakal_2 tested in 1.2408881187438965 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 4.117706537246704 seconds


100%|██████████| 17/17 [00:43<00:00,  2.57s/it]

Baseline_test1 True None
  scores:
    Baseline_test1       : 0.556
    Baseline_test3       : 0.812
    Boris (fixed)        : 0.713
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.469
    Anna Graessel        : 0.956
    Eric_George_Parakal_2 : 0.887
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.762
    Baseline_test1       : 0.55
    Boris (fixed)        : 0.713
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.469
    Anna Graessel        : 0.956
    Eric_George_Parakal_2 : 0.887
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.631
    Baseline_test1       : 0.512
    Baseline_test3       : 0.838
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.481
    Anna Graessel        : 0.95
    Eric_George_Parakal_2 : 0.881
Ravil Khairullin False name 'LogisticRegression' is not defined
  scores:
Khairullin Ravil (2) False name




User Baseline_test1 tested in 4.111155271530151 seconds
User Baseline_test2 tested in 0.0014190673828125 seconds
User Baseline_test3 tested in 1.0286836624145508 seconds
User Boris tested in 0.0016589164733886719 seconds
User Boris (fixed) tested in 1.0849993228912354 seconds
User Ravil Khairullin tested in 0.0016138553619384766 seconds
User Khairullin Ravil (2) tested in 0.0040781497955322266 seconds
User Eric George Parakal tested in 0.001191854476928711 seconds
User Ruben Freire tested in 0.0009663105010986328 seconds
User yulia_gurova_s1 tested in 0.0016450881958007812 seconds
User Ruben Freire (2) tested in 1.0312278270721436 seconds
User yulia_gurova_s2 tested in 1.0242066383361816 seconds




User Anna Beketova tested in 4.112219333648682 seconds
User Anna Graessel tested in 1.0365049839019775 seconds
User Fouzi Takelait tested in 0.0026979446411132812 seconds
User Eric_George_Parakal_2 tested in 4.309026002883911 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 1.0391740798950195 seconds


100%|██████████| 17/17 [00:31<00:00,  2.38s/it]

Baseline_test1 False the model failed attempt #4 (score = 0.5125) on its own dataset
  scores:
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.769
    Boris (fixed)        : 0.656
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.469
    Anna Graessel        : 0.944
    Fouzi_v3             : 0.531
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.681
    Baseline_test3       : 0.794
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.512
    Anna Graessel        : 0.919
    Fouzi_v3             : 0.544
Ravil Khairullin False name 'LogisticRegression' is not defined
  scores:
Khairullin Ravil (2) False name 'np' is not defined
  scores:
Eric George Parakal False name 'X' is not defined
  scores:
Ruben Freire False unexpected indent (tmp_script_model_26.py, line 3)
  scores:
yulia_gurova_s1 False generate returned an array of wrong length





User Baseline_test1 tested in 1.035107135772705 seconds
User Baseline_test2 tested in 0.0024535655975341797 seconds
User Baseline_test3 tested in 1.0298926830291748 seconds
User Boris tested in 0.0015904903411865234 seconds
User Boris (fixed) tested in 1.0831568241119385 seconds
User Ravil Khairullin tested in 0.0028066635131835938 seconds
User Khairullin Ravil (2) tested in 0.0037446022033691406 seconds
User Eric George Parakal tested in 0.001434326171875 seconds
User Ruben Freire tested in 0.0013704299926757812 seconds
User yulia_gurova_s1 tested in 0.002272367477416992 seconds
User Ruben Freire (2) tested in 1.0386266708374023 seconds
User yulia_gurova_s2 tested in 1.0239841938018799 seconds




User Anna Beketova tested in 4.1205058097839355 seconds
User Anna Graessel tested in 1.0366899967193604 seconds
User Fouzi Takelait tested in 0.002036571502685547 seconds
User Eric_George_Parakal_2 tested in 1.0801610946655273 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 1.034132957458496 seconds


100%|██████████| 17/17 [00:58<00:00,  5.22s/it]

Baseline_test1 True None
  scores:
    Baseline_test1       : 0.588
    Baseline_test3       : 0.756
    Boris (fixed)        : 0.681
    Ruben Freire (2)     : 0.994
    yulia_gurova_s2      : 0.506
    Anna Graessel        : 0.963
    Eric_George_Parakal_2 : 0.825
    Fouzi_v3             : 0.487
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.794
    Baseline_test1       : 0.556
    Boris (fixed)        : 0.681
    Ruben Freire (2)     : 0.994
    yulia_gurova_s2      : 0.506
    Anna Graessel        : 0.963
    Eric_George_Parakal_2 : 0.825
    Fouzi_v3             : 0.487
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.694
    Baseline_test1       : 0.5
    Baseline_test3       : 0.713
    Ruben Freire (2)     : 0.963
    yulia_gurova_s2      : 0.575
    Anna Graessel        : 0.938
    Eric_George_Parakal_2 : 0.825
    Fouzi_v3             : 0.575





User Baseline_test1 tested in 1.0344176292419434 seconds
User Baseline_test2 tested in 0.0016405582427978516 seconds
User Baseline_test3 tested in 1.032435417175293 seconds
User Boris tested in 0.002153635025024414 seconds
User Boris (fixed) tested in 1.0825214385986328 seconds
User Ravil Khairullin tested in 0.0016350746154785156 seconds
User Khairullin Ravil (2) tested in 0.005871295928955078 seconds
User Eric George Parakal tested in 0.002533435821533203 seconds
User Ruben Freire tested in 0.0013115406036376953 seconds
User yulia_gurova_s1 tested in 0.0018939971923828125 seconds
User Ruben Freire (2) tested in 1.030503511428833 seconds
User yulia_gurova_s2 tested in 1.023808240890503 seconds




User Anna Beketova tested in 1.0376322269439697 seconds
User Anna Graessel tested in 1.0420730113983154 seconds
User Fouzi Takelait tested in 0.001960277557373047 seconds
User Eric_George_Parakal_2 tested in 1.08038330078125 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 4.1294310092926025 seconds


100%|██████████| 17/17 [00:58<00:00,  3.42s/it]

Baseline_test1 True None
  scores:
    Baseline_test1       : 0.619
    Baseline_test3       : 0.819
    Boris (fixed)        : 0.65
    Ruben Freire (2)     : 0.994
    yulia_gurova_s2      : 0.438
    Anna Beketova        : 0.619
    Anna Graessel        : 0.95
    Eric_George_Parakal_2 : 0.931
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.781
    Baseline_test1       : 0.562
    Boris (fixed)        : 0.65
    Ruben Freire (2)     : 0.994
    yulia_gurova_s2      : 0.438
    Anna Beketova        : 0.619
    Anna Graessel        : 0.95
    Eric_George_Parakal_2 : 0.931
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.644
    Baseline_test1       : 0.569
    Baseline_test3       : 0.844
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.487
    Anna Beketova        : 0.588
    Anna Graessel        : 0.919
    Eric_George_Parakal_2 : 0.894
Ravi




User Baseline_test1 tested in 1.0297071933746338 seconds
User Baseline_test2 tested in 0.0016040802001953125 seconds
User Baseline_test3 tested in 1.0297350883483887 seconds
User Boris tested in 0.0018353462219238281 seconds
User Boris (fixed) tested in 1.0868003368377686 seconds
User Ravil Khairullin tested in 0.0020432472229003906 seconds
User Khairullin Ravil (2) tested in 0.0038552284240722656 seconds
User Eric George Parakal tested in 0.001848459243774414 seconds
User Ruben Freire tested in 0.0012955665588378906 seconds
User yulia_gurova_s1 tested in 0.002606630325317383 seconds
User Ruben Freire (2) tested in 1.0357129573822021 seconds
User yulia_gurova_s2 tested in 1.026763677597046 seconds




User Anna Beketova tested in 1.0341613292694092 seconds
User Anna Graessel tested in 1.0382511615753174 seconds
User Fouzi Takelait tested in 0.0020487308502197266 seconds
User Eric_George_Parakal_2 tested in 1.083320140838623 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 1.036353588104248 seconds


100%|██████████| 17/17 [01:14<00:00,  6.82s/it]

Baseline_test1 True None
  scores:
    Baseline_test1       : 0.55
    Baseline_test3       : 0.825
    Boris (fixed)        : 0.619
    Ruben Freire (2)     : 0.981
    yulia_gurova_s2      : 0.506
    Anna Beketova        : 0.531
    Anna Graessel        : 0.912
    Eric_George_Parakal_2 : 0.869
    Fouzi_v3             : 0.531
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.831
    Baseline_test1       : 0.544
    Boris (fixed)        : 0.619
    Ruben Freire (2)     : 0.981
    yulia_gurova_s2      : 0.506
    Anna Beketova        : 0.531
    Anna Graessel        : 0.912
    Eric_George_Parakal_2 : 0.869
    Fouzi_v3             : 0.531
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.644
    Baseline_test1       : 0.575
    Baseline_test3       : 0.781
    Ruben Freire (2)     : 0.988
    yulia_gurova_s2      : 0.619
    Anna Beketova        : 0.619





User Baseline_test1 tested in 4.104851484298706 seconds
User Baseline_test2 tested in 0.0017015933990478516 seconds
User Baseline_test3 tested in 1.031114101409912 seconds
User Boris tested in 0.0017044544219970703 seconds
User Boris (fixed) tested in 1.0857863426208496 seconds
User Ravil Khairullin tested in 0.0017490386962890625 seconds
User Khairullin Ravil (2) tested in 0.00515294075012207 seconds
User Eric George Parakal tested in 0.0014045238494873047 seconds
User Ruben Freire tested in 0.0010852813720703125 seconds
User yulia_gurova_s1 tested in 0.0024862289428710938 seconds
User Ruben Freire (2) tested in 1.0308735370635986 seconds
User yulia_gurova_s2 tested in 1.024007797241211 seconds




User Anna Beketova tested in 1.0331242084503174 seconds
User Anna Graessel tested in 1.0377192497253418 seconds
User Fouzi Takelait tested in 0.0018150806427001953 seconds
User Eric_George_Parakal_2 tested in 1.0697135925292969 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 1.0347445011138916 seconds


100%|██████████| 17/17 [00:58<00:00,  5.66s/it]

Baseline_test1 False the model failed attempt #4 (score = 0.525) on its own dataset
  scores:
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.75
    Boris (fixed)        : 0.662
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.506
    Anna Beketova        : 0.581
    Anna Graessel        : 0.919
    Eric_George_Parakal_2 : 0.981
    Fouzi_v3             : 0.525
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.637
    Baseline_test3       : 0.769
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.463
    Anna Beketova        : 0.575
    Anna Graessel        : 0.919
    Eric_George_Parakal_2 : 0.988
    Fouzi_v3             : 0.588
Ravil Khairullin False name 'LogisticRegression' is not defined
  scores:
Khairullin Ravil (2) False name 'np' is not defined
  scores:
Eric George Parakal False name 'X' is not defined
  scores:
Ruben Freire 




User Baseline_test1 tested in 4.104686260223389 seconds
User Baseline_test2 tested in 0.001790761947631836 seconds
User Baseline_test3 tested in 1.032318353652954 seconds
User Boris tested in 0.0021393299102783203 seconds
User Boris (fixed) tested in 1.0855588912963867 seconds
User Ravil Khairullin tested in 0.0027866363525390625 seconds
User Khairullin Ravil (2) tested in 0.0038874149322509766 seconds
User Eric George Parakal tested in 0.001676797866821289 seconds
User Ruben Freire tested in 0.0011451244354248047 seconds
User yulia_gurova_s1 tested in 0.0018305778503417969 seconds
User Ruben Freire (2) tested in 1.0332505702972412 seconds
User yulia_gurova_s2 tested in 1.0241048336029053 seconds




User Anna Beketova tested in 4.114160776138306 seconds
User Anna Graessel tested in 1.0374765396118164 seconds
User Fouzi Takelait tested in 0.0018572807312011719 seconds
User Eric_George_Parakal_2 tested in 1.0809838771820068 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 1.0364127159118652 seconds


100%|██████████| 17/17 [00:43<00:00,  4.10s/it]

Baseline_test1 False the model failed attempt #4 (score = 0.53125) on its own dataset
  scores:
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.787
    Boris (fixed)        : 0.688
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.481
    Anna Graessel        : 0.912
    Eric_George_Parakal_2 : 0.812
    Fouzi_v3             : 0.537
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.694
    Baseline_test3       : 0.706
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.588
    Anna Graessel        : 0.9
    Eric_George_Parakal_2 : 0.819
    Fouzi_v3             : 0.525
Ravil Khairullin False name 'LogisticRegression' is not defined
  scores:
Khairullin Ravil (2) False name 'np' is not defined
  scores:
Eric George Parakal False name 'X' is not defined
  scores:
Ruben Freire False unexpected indent (tmp_script_model_111.py, line 3)
  score




User Baseline_test1 tested in 4.105912685394287 seconds
User Baseline_test2 tested in 0.0027742385864257812 seconds
User Baseline_test3 tested in 1.0310564041137695 seconds
User Boris tested in 0.0019328594207763672 seconds
User Boris (fixed) tested in 1.0862724781036377 seconds
User Ravil Khairullin tested in 0.0019364356994628906 seconds
User Khairullin Ravil (2) tested in 0.0034444332122802734 seconds
User Eric George Parakal tested in 0.0014302730560302734 seconds
User Ruben Freire tested in 0.0012056827545166016 seconds
User yulia_gurova_s1 tested in 0.0018317699432373047 seconds
User Ruben Freire (2) tested in 1.030670404434204 seconds
User yulia_gurova_s2 tested in 1.023280143737793 seconds




User Anna Beketova tested in 1.0374860763549805 seconds
User Anna Graessel tested in 1.0401389598846436 seconds
User Fouzi Takelait tested in 0.0020456314086914062 seconds
User Eric_George_Parakal_2 tested in 1.0829663276672363 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 1.0397520065307617 seconds


100%|██████████| 17/17 [00:58<00:00,  5.66s/it]

Baseline_test1 False the model failed attempt #4 (score = 0.5125) on its own dataset
  scores:
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.8
    Boris (fixed)        : 0.65
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.438
    Anna Beketova        : 0.55
    Anna Graessel        : 0.9
    Eric_George_Parakal_2 : 0.812
    Fouzi_v3             : 0.55
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.706
    Baseline_test3       : 0.806
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.512
    Anna Beketova        : 0.55
    Anna Graessel        : 0.875
    Eric_George_Parakal_2 : 0.819
    Fouzi_v3             : 0.531
Ravil Khairullin False name 'LogisticRegression' is not defined
  scores:
Khairullin Ravil (2) False name 'np' is not defined
  scores:
Eric George Parakal False name 'X' is not defined
  scores:
Ruben Freire False 




User Baseline_test1 tested in 1.0339620113372803 seconds
User Baseline_test2 tested in 0.0018379688262939453 seconds
User Baseline_test3 tested in 1.031999111175537 seconds
User Boris tested in 0.002545595169067383 seconds
User Boris (fixed) tested in 1.0872302055358887 seconds
User Ravil Khairullin tested in 0.0032341480255126953 seconds
User Khairullin Ravil (2) tested in 0.004928112030029297 seconds
User Eric George Parakal tested in 0.001954793930053711 seconds
User Ruben Freire tested in 0.0012352466583251953 seconds
User yulia_gurova_s1 tested in 0.0018820762634277344 seconds
User Ruben Freire (2) tested in 1.0328888893127441 seconds
User yulia_gurova_s2 tested in 1.0269365310668945 seconds




User Anna Beketova tested in 4.118738889694214 seconds
User Anna Graessel tested in 1.0409541130065918 seconds
User Fouzi Takelait tested in 0.002760648727416992 seconds
User Eric_George_Parakal_2 tested in 1.0815792083740234 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 1.0402765274047852 seconds


100%|██████████| 17/17 [00:58<00:00,  5.21s/it]

Baseline_test1 True None
  scores:
    Baseline_test1       : 0.594
    Baseline_test3       : 0.825
    Boris (fixed)        : 0.731
    Ruben Freire (2)     : 0.994
    yulia_gurova_s2      : 0.519
    Anna Graessel        : 0.925
    Eric_George_Parakal_2 : 0.738
    Fouzi_v3             : 0.512
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.775
    Baseline_test1       : 0.606
    Boris (fixed)        : 0.731
    Ruben Freire (2)     : 0.994
    yulia_gurova_s2      : 0.519
    Anna Graessel        : 0.925
    Eric_George_Parakal_2 : 0.738
    Fouzi_v3             : 0.512
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.725
    Baseline_test1       : 0.619
    Baseline_test3       : 0.812
    Ruben Freire (2)     : 0.981
    yulia_gurova_s2      : 0.525
    Anna Graessel        : 0.906
    Eric_George_Parakal_2 : 0.756
    Fouzi_v3             : 0.5





User Baseline_test1 tested in 1.0318536758422852 seconds
User Baseline_test2 tested in 0.0017240047454833984 seconds
User Baseline_test3 tested in 1.0310313701629639 seconds
User Boris tested in 0.0024497509002685547 seconds
User Boris (fixed) tested in 1.082017183303833 seconds
User Ravil Khairullin tested in 0.0032465457916259766 seconds
User Khairullin Ravil (2) tested in 0.003970146179199219 seconds
User Eric George Parakal tested in 0.0014164447784423828 seconds
User Ruben Freire tested in 0.0013256072998046875 seconds
User yulia_gurova_s1 tested in 0.0018396377563476562 seconds
User Ruben Freire (2) tested in 1.0330677032470703 seconds
User yulia_gurova_s2 tested in 1.0223195552825928 seconds




User Anna Beketova tested in 4.11436128616333 seconds
User Anna Graessel tested in 1.036839246749878 seconds
User Fouzi Takelait tested in 0.0021746158599853516 seconds
User Eric_George_Parakal_2 tested in 1.0862007141113281 seconds


  0%|          | 0/17 [00:00<?, ?it/s]

User Fouzi_v3 tested in 4.124982595443726 seconds


100%|██████████| 17/17 [00:43<00:00,  2.57s/it]

Baseline_test1 True None
  scores:
    Baseline_test1       : 0.613
    Baseline_test3       : 0.806
    Boris (fixed)        : 0.694
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.531
    Anna Graessel        : 0.938
    Eric_George_Parakal_2 : 0.731
Baseline_test2 False name 'np' is not defined
  scores:
Baseline_test3 True None
  scores:
    Baseline_test3       : 0.8
    Baseline_test1       : 0.55
    Boris (fixed)        : 0.694
    Ruben Freire (2)     : 1.0
    yulia_gurova_s2      : 0.531
    Anna Graessel        : 0.938
    Eric_George_Parakal_2 : 0.731
Boris False name 'np' is not defined
  scores:
Boris (fixed) True None
  scores:
    Boris (fixed)        : 0.675
    Baseline_test1       : 0.631
    Baseline_test3       : 0.812
    Ruben Freire (2)     : 0.994
    yulia_gurova_s2      : 0.525
    Anna Graessel        : 0.931
    Eric_George_Parakal_2 : 0.675
Ravil Khairullin False name 'LogisticRegression' is not defined
  scores:
Khairullin Ravil (2) False nam




In [41]:
with open('results.pkl', 'rb') as f:
  results = pickle.load(f)

for i, r in enumerate(results):
  r.columns = ["{}_{}".format(c, i) if c != 'username' else c for c in r.columns]

result = pd.concat([r.set_index('username') for r in results],
                   axis=1, sort=True, copy=True)

model_cols   = [c for c in result.columns if 'mean_model_score'  in c]
dataset_cols = [c for c in result.columns if 'mean_dataset_loss' in c]

result[model_cols  ].fillna(0.5, inplace=True)
result[dataset_cols].fillna(1.0, inplace=True)

result['total_model_score'  ] = result[model_cols  ].mean(axis=1)
result['total_dataset_score'] = result[dataset_cols].mean(axis=1)

final_scores = result[['total_model_score', 'total_dataset_score']]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  downcast=downcast, **kwargs)


In [43]:
final_scores.sort_values('total_model_score', ascending=False)

Unnamed: 0,total_model_score,total_dataset_score
yulia_gurova_s2,0.794993,0.569157
Anna Graessel,0.741528,0.898845
Baseline_test1,0.739879,0.572168
Boris (fixed),0.73907,0.670723
Baseline_test3,0.733258,0.793689
Anna Beketova,0.732639,0.572005
Fouzi_v3,0.730649,0.546802
Ruben Freire (2),0.729907,0.991687
Eric_George_Parakal_2,0.7187,0.838952


In [45]:
final_scores.sort_values('total_dataset_score', ascending=True)

Unnamed: 0,total_model_score,total_dataset_score
Fouzi_v3,0.730649,0.546802
yulia_gurova_s2,0.794993,0.569157
Anna Beketova,0.732639,0.572005
Baseline_test1,0.739879,0.572168
Boris (fixed),0.73907,0.670723
Baseline_test3,0.733258,0.793689
Eric_George_Parakal_2,0.7187,0.838952
Anna Graessel,0.741528,0.898845
Ruben Freire (2),0.729907,0.991687
