In [1]:
import os
import csv
import random
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import torch
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.preprocessing import StandardScaler
from keras.models import Sequential, Model
from keras.layers import Input, Dense, LSTM, Attention, Conv1D, Conv2D
from keras.metrics import MeanSquaredError
from keras.losses import MeanSquaredError
import keras
from scipy.stats.mstats import gmean
from keras.utils.vis_utils import plot_model



Weights, for the weighted prediction later on.

In [2]:
weights = json.load(open('/kaggle/input/cache-estimator-data/out_weights.json'))

Read the dataframe

In [3]:
def read_df(path, embeddings_path=None):
    # Read csv
    df = pd.read_csv(path)
    
    # Append filename and source_file column column
    filenames = []
    for i in df["id"]:
        nmb = i.split('_')[-1]
        filenames.append(i[:-len(nmb)-1])
    df["file_name"] = filenames
    filenames = list(set(filenames))
    df['source_file'] = df["file_name"].apply(lambda x: int(x[:3]))
    # Append embeddings
    if embeddings_path != None:
        embedding_df = pd.read_csv(embeddings_path)
        embedding_df['source_file'] = embedding_df['fname']
        embedding_df['embeddings'] = embedding_df['embeddings'].apply(lambda x: np.array([*map(float, x.strip('][').replace("'","").split(','))]))
        df = pd.merge(df, embedding_df).drop(columns=["fname"])
    # Calculate miss ratios into a seperate dataframe
    miss_ratios = pd.DataFrame()
    for i in ["768", "1024", "1536", "2048", "3072","4096", "6144", "8192"]:
        miss_ratios[i] = df['misses_'+i] / df['accesses_'+i]
        df = df.drop(columns=['misses_'+i, 'accesses_'+i])
        if 'l1d_accesses_'+i in df.columns:
            df = df.drop(columns=['l1d_accesses_'+i])
    return df, miss_ratios
rds, mrs = read_df('/kaggle/input/cache-estimator-data/dataset_lru_extra.csv', '/kaggle/input/cache-estimator-data/embeddings/flow-aware-f.csv')
print(rds.shape, mrs.shape)

(6163, 900) (6163, 8)


In [4]:
rds.head

<bound method NDFrame.head of                0           1           2          3          4          5  \
0     50816880.0  17323312.0  11868112.0  7701840.0  3853504.0  5287776.0   
1     17361520.0   6023600.0   4203744.0  2591488.0  1828656.0  1847648.0   
2     58283520.0  17249296.0  14592592.0  9466576.0  7204896.0  5614336.0   
3      3098304.0   3037808.0   2698720.0  2692176.0  1999568.0  1238496.0   
4      1623056.0   1618144.0   1501200.0  1353728.0  1014736.0   670656.0   
...          ...         ...         ...        ...        ...        ...   
6158   4869744.0   5968224.0   3301024.0  2821488.0  2218352.0  1938048.0   
6159   4967104.0   7056944.0   3530208.0  2928336.0  2367040.0  2018816.0   
6160  18063488.0  19199008.0   9437200.0  7269680.0  7833904.0  3198400.0   
6161  13498912.0  14343616.0   6633152.0  5098416.0  6450640.0  2275728.0   
6162  26404208.0  36893024.0  10084752.0  2060848.0  2511312.0  1297024.0   

              6          7          8        

# Train-Test split
Here we split the data into training and testing set. 

The train_test_split method is for the one model, where we separate some columns and try to predict them based on the others.

The train_test_split_sec method separates all data for the test_bench benchmark and moves it to the test set. 

In [5]:
test_ids = pd.DataFrame()
extra_df = []
def train_test_split(df, miss_ratios):
    global test_ids, extra_df
    train_rds = pd.DataFrame()
    train_mrs = pd.DataFrame()
    test_rds  = pd.DataFrame()
    test_mrs  = pd.DataFrame()
    train_cols = ['2048']
    test_cols = ['768', '1024', '1536', '3072', '4096', '6144', '8192']
    for i in train_cols:
        tmp_df = df.copy()
        tmp_df['column'] = float(i)
        train_rds = pd.concat([train_rds, tmp_df], ignore_index=True)
        new_tmp_df = miss_ratios[i]
        train_mrs = pd.concat([train_mrs, new_tmp_df], ignore_index=True)
    
    for test_col in test_cols:
        tmp_df = df.copy()
        tmp_df['column'] = float(test_col)
        test_rds = pd.concat([test_rds, tmp_df], ignore_index=True)
        new_tmp_df = miss_ratios[test_col]
        test_mrs = pd.concat([test_mrs, new_tmp_df], ignore_index=True)
    test_ids = test_rds[["id", "file_name", "source_file", "column"]]
        
    test_ids.to_csv(f'/kaggle/working/id_col_lru_other_4.csv')

    train_rds = train_rds.drop(columns=["id", "file_name", "source_file"])
    test_rds = test_rds.drop(columns=["id", "file_name", "source_file"])
    return train_rds, train_mrs, test_rds, test_mrs

train_rds, train_mrs, test_rds, test_mrs  = train_test_split(rds, mrs)
print(train_rds.shape, train_mrs.shape, test_rds.shape, test_mrs.shape, test_ids.shape)

(6163, 898) (6163, 1) (43141, 898) (43141, 1) (43141, 4)


In [6]:
test_ids = pd.DataFrame()
test_outputs = pd.DataFrame()
def train_test_split_sec(df, miss_ratios, test_bench):
    global test_ids, test_outputs
    train_rds = pd.DataFrame()
    train_mrs = pd.DataFrame()
    test_rds  = pd.DataFrame()
    test_mrs  = pd.DataFrame()
    train_cols = ['8192']
    test_cols = ['768', '1024', '1536', '2048', '3072', '4096', '6144', '8192']
    for i in train_cols:
        tmp_df = df[~df['source_file'].isin(test_bench)].copy()
        tmp_df['column'] = float(i)
        train_rds = pd.concat([train_rds, tmp_df], ignore_index=True)
        new_tmp_df = miss_ratios[~df['source_file'].isin(test_bench)][i]
        train_mrs = pd.concat([train_mrs, new_tmp_df], ignore_index=True)
    
    for test_col in test_cols:
        tmp_df = df[df['source_file'].isin(test_bench)].copy()
        tmp_df['column'] = float(test_col)
        test_rds = pd.concat([test_rds, tmp_df], ignore_index=True)
        new_tmp_df = miss_ratios[df['source_file'].isin(test_bench)][test_col]
        test_mrs = pd.concat([test_mrs, new_tmp_df], ignore_index=True)
    tmp_ids = test_rds[["id", "file_name", "source_file", "column"]]
    test_ids = pd.concat([test_ids, tmp_ids], ignore_index=True)
#     test_ids.to_csv(f'/kaggle/working/id_col_lru_other_6b.csv')
    test_outputs = pd.concat([test_outputs, test_mrs], ignore_index=True)
    train_rds = train_rds.drop(columns=["id", "file_name", "source_file"])
    test_rds = test_rds.drop(columns=["id", "file_name", "source_file"])
    return train_rds, train_mrs, test_rds, test_mrs

train_rds, train_mrs, test_rds, test_mrs  = train_test_split_sec(rds, mrs, [400])
print(train_rds.shape, train_mrs.shape, test_rds.shape, test_mrs.shape, test_ids.shape, test_outputs.shape)

(6160, 898) (6160, 1) (24, 898) (24, 1) (24, 4) (24, 1)


In [7]:
path = '/kaggle/input/cache-estimator-data/dataset_lru_extra.csv'                    # Path to the dataset
embeddings_path = '/kaggle/input/cache-estimator-data/embeddings/flow-aware-f.csv'   # Path to the embeddings
rds, mrs = read_df(path, embeddings_path)
# mrs = mrs.drop(columns = ['3072', '6144'])
train_rds, train_mrs, test_rds, test_mrs = train_test_split(rds, mrs)

x = []
for i in train_rds['embeddings']:
    x.append(i)
train_embeddings = np.array(x)
train_rds = train_rds.drop(columns=["embeddings"])

x = []
for i in test_rds['embeddings']:
    x.append(i)
test_embeddings = np.array(x)
test_rds = test_rds.drop(columns=["embeddings"])

train_sizes = train_rds['column']
test_sizes = test_rds['column']

train_rds = train_rds.drop(columns=["column"])
test_rds = test_rds.drop(columns=["column"])

embedding_scaler = StandardScaler()
embedding_scaler = embedding_scaler.fit(train_embeddings)
# train_embeddings = embedding_scaler.transform(train_embeddings)
# test_embeddings  = embedding_scaler.transform(test_embeddings)
scaler = StandardScaler()
scaler = scaler.fit(train_rds)
# train_rds = scaler.transform(train_rds)
# test_rds = scaler.transform(test_rds)

In [8]:
print(set(train_sizes))

{2048.0}


In [9]:
train_rds.shape, train_sizes.shape, train_embeddings.shape

((6163, 896), (6163,), (6163, 300))

In [10]:
test_rds.shape, test_sizes.shape, test_embeddings.shape

((43141, 896), (43141,), (43141, 300))

# Error functions
Functions to calculate the error

In [11]:
def calc_error(x, y):
    global test_ids
    j='err'
    
    test_ids[j] = (x - y)
    test_ids[j+'_abs'] = test_ids[j].abs()
    print(test_ids.groupby('file_name')[j].mean().abs().mean())

In [12]:
def calc_err(x, y, col_path):
    ids = col_path
    ids['out'] = (x - y)
#     ids['out_abs'] = ids['1024'].abs()
    
    tmp = []
    mean_df = pd.DataFrame()
    for i in set(ids["column"]):
        means = ids[ids["column"]==i].groupby('file_name')['out'].mean().to_frame()
        means['asp'] = [x.split('-')[0] for x in means.index]
        means['weights'] = [float(weights[x]) for x in means.index]
        #means['weights'] = means.groupby('asp')['weights'].apply(lambda x: x / x.sum())
        l = means.groupby('asp')['weights'].sum()
        means['sweights'] = [l[x] for x in means['asp']]
        means['weights'] = means['weights'] / means['sweights']
        means['out'] = means['out'] * means['weights']
        means = means.groupby('asp')['out'].mean().abs() # .apply(gmean) # apply(lambda x: x.max() - x.min())
        tmp.append(gmean(means))
#     print(i, gmean(means.abs()))
#     plt.figure()
#     plt.hist(means.abs(), np.linspace(0,1.0,101))
# #         plt.yticks([*range(0,int(plt.yticks()[0][-1]))])
#     plt.title('Histogram of predictions')
#     plt.ylabel('# benchmarks')
#     plt.xlabel('Error')
#     plt.show()
    del ids
    return *tmp, means.mean(), mean_squared_error(x, y)
tmp = []
# calc_err(l[-1], test_y, cp)

# Dense Neural Network

Creating the DNN network

In [13]:
train_y = train_mrs.to_numpy()
test_y = test_mrs.to_numpy()

# torch.save(torch.from_numpy(test_y), f'/kaggle/working/test_y_6b_cols.pt')

scale = 1

#x = np.array([int(i) for i in train_rds.columns])

if scale:
    embedding_scaler = StandardScaler()
    embedding_scaler = embedding_scaler.fit(train_embeddings)
    # train_embeddings = embedding_scaler.transform(train_embeddings)
    # test_embeddings  = embedding_scaler.transform(test_embeddings)
    scaler = StandardScaler()
    scaler = scaler.fit(train_rds)
    train_rdss = scaler.transform(train_rds)
    test_rdss = scaler.transform(test_rds)
    train_embeddingss = embedding_scaler.transform(train_embeddings)
    test_embeddingss  = embedding_scaler.transform(test_embeddings)
else:
#     scaler2 = StandardScaler()
#     scaler2 = scaler2.fit(train_rds.to_numpy() * x)
    train_rdss = train_rds.to_numpy() # * x # scaler2.transform(train_rds.to_numpy() * x)
    test_rdss = test_rds.to_numpy()   #* x # scaler2.transform(test_rds.to_numpy() * x)
    # print(test_rdss.shape)
    train_embeddingss = train_embeddings # embedding_scaler.transform(train_embeddings) # train_embeddings
    test_embeddingss  = test_embeddings # embedding_scaler.transform(test_embeddings) # 

train_X = np.reshape(train_rdss, (train_rds.shape[0], 896))
test_X = np.reshape(test_rdss, (test_rds.shape[0], 896))
train_emb = np.reshape(train_embeddingss, (train_embeddings.shape[0], 300))
test_emb = np.reshape(test_embeddingss, (test_embeddings.shape[0], 300))
train_s = train_sizes.to_numpy()/512
test_s = test_sizes.to_numpy()/512


print('train_X shape:',train_X.shape)
print('train_e shape:',train_emb.shape)
print('train_y shape:',train_y.shape)
print('~~~~~~~~')
print('test_X shape:',test_X.shape)
print('test_e shape:',test_emb.shape)
print('test_y shape:',test_y.shape)
print(train_emb[:5])
print(set(train_s))
print(set(test_s))

train_X shape: (6163, 896)
train_e shape: (6163, 300)
train_y shape: (6163, 1)
~~~~~~~~
test_X shape: (43141, 896)
test_e shape: (43141, 300)
test_y shape: (43141, 1)
[[ 0.03509821 -0.28592985 -0.19637353 ...  0.29874005  0.23721744
  -0.13483197]
 [ 0.03509821 -0.28592985 -0.19637353 ...  0.29874005  0.23721744
  -0.13483197]
 [ 0.03509821 -0.28592985 -0.19637353 ...  0.29874005  0.23721744
  -0.13483197]
 [ 0.21592059 -0.26209304 -0.2790132  ...  0.33850083  0.39146787
  -0.28169344]
 [ 0.21592059 -0.26209304 -0.2790132  ...  0.33850083  0.39146787
  -0.28169344]]
{4.0}
{1.5, 2.0, 3.0, 6.0, 8.0, 12.0, 16.0}


In [14]:
# Works  
def create_model(dnn_nodes = 1024):
    
    size_input = Input(shape=(1))
    
    embedding_inputs = Input(shape=(300))
    
    hist_inputs = Input(shape=(896))

    inp = tf.keras.layers.Concatenate()([hist_inputs, embedding_inputs, size_input])

    dense_layer1 = Dense(dnn_nodes, activation='relu')
    dense_output = dense_layer1(inp)
    
    dense_output = tf.keras.layers.Concatenate()([dense_output, size_input])

    dense_layer2 = Dense(dnn_nodes, activation='relu')
    dense_output = dense_layer2(dense_output)
    
    dense_output = tf.keras.layers.Concatenate()([dense_output, size_input])

    dense_layer3 = Dense(dnn_nodes, activation='relu')
    dense_output = dense_layer3(dense_output)
    
    dense_output = tf.keras.layers.Concatenate()([dense_output, size_input])

    dense_layer4 = Dense(dnn_nodes, activation='sigmoid')
    dense_output = dense_layer4(dense_output)
    
    dense_output = tf.keras.layers.Concatenate()([dense_output, size_input])

    output_layer = Dense(1, activation='sigmoid')
    total_output = output_layer(dense_output)
 
    model = Model([hist_inputs, size_input, embedding_inputs], total_output)
    # model.summary()
    model.compile(optimizer="adam", loss="mean_squared_error")
    y_pred = model([test_X, test_s, test_emb])
    print(y_pred.shape)
    print(mean_squared_error(y_pred, test_y))
    return model
f = create_model(256)
f.summary()

(43141, 1)
0.43836799378744257
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 896)]        0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 300)]        0           []                               
                                                                                                  
 input_1 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 concatenate (Concatenate)      (None, 1197)         0           ['input_3[0][0]',                
                                                               

Predicting an unknown column

In [15]:
mymodel = create_model(512)
# plot_model(mymodel, to_file='/kaggle/working/model_plot.png')

l = []
for i in range(50):
    mymodel.fit([train_X, train_s, train_emb], train_y, epochs=1, batch_size=64, shuffle=True, validation_data=([test_X, test_s, test_emb], test_y))
    l.append(mymodel([test_X, test_s, test_emb]))

(43141, 1)
0.3628756901618664


In [16]:
x = min(l, key=lambda x:mean_squared_error(x, test_y))
print(mean_squared_error(x, test_y))
np.save('lru_pred_known_1', x)
np.save('lru_test_known_1', test_y)

0.05482730738771377


Training method of cross-validation, isolating one benchmark's at a time and moving it to the test set.

In [17]:
test_ids = pd.DataFrame()
test_outputs = pd.DataFrame()
test_preds = pd.DataFrame()
for bench in rds['source_file'].unique():
    train_rds, train_mrs, test_rds, test_mrs  = train_test_split_sec(rds, mrs, [bench])
    train_y = train_mrs.to_numpy()
    test_y = test_mrs.to_numpy()

    x = []
    for i in train_rds['embeddings']:
        x.append(i)
    train_embeddings = np.array(x)
    train_rds = train_rds.drop(columns=["embeddings"])

    x = []
    for i in test_rds['embeddings']:
        x.append(i)
    test_embeddings = np.array(x)
    test_rds = test_rds.drop(columns=["embeddings"])

    train_sizes = train_rds['column']
    test_sizes = test_rds['column']

    train_rds = train_rds.drop(columns=["column"])
    test_rds = test_rds.drop(columns=["column"])

    embedding_scaler = StandardScaler()
    embedding_scaler = embedding_scaler.fit(train_embeddings)
    # train_embeddings = embedding_scaler.transform(train_embeddings)
    # test_embeddings  = embedding_scaler.transform(test_embeddings)
    scaler = StandardScaler()
    scaler = scaler.fit(train_rds)

    #x = np.array([int(i) for i in train_rds.columns])
    scale = 1

    if scale:
        train_rdss = scaler.transform(train_rds)
        test_rdss = scaler.transform(test_rds)
        train_embeddingss = embedding_scaler.transform(train_embeddings)
        test_embeddingss  = embedding_scaler.transform(test_embeddings)
    else:
    #     scaler2 = StandardScaler()
    #     scaler2 = scaler2.fit(train_rds.to_numpy() * x)
        train_rdss = train_rds.to_numpy() # * x # scaler2.transform(train_rds.to_numpy() * x)
        test_rdss = test_rds.to_numpy()   #* x # scaler2.transform(test_rds.to_numpy() * x)
        # print(test_rdss.shape)
        train_embeddingss = train_embeddings # embedding_scaler.transform(train_embeddings) # train_embeddings
        test_embeddingss  = test_embeddings # embedding_scaler.transform(test_embeddings) # 


    train_X = np.reshape(train_rdss, (train_rds.shape[0], 896))
    test_X = np.reshape(test_rdss, (test_rds.shape[0], 896))
    train_emb = np.reshape(train_embeddingss, (train_embeddings.shape[0], 300))
    test_emb = np.reshape(test_embeddingss, (test_embeddings.shape[0], 300))
    train_s = train_sizes.to_numpy()/512
    test_s = test_sizes.to_numpy()/512
    mymodel = create_model(512)
    l = []
    for i in range(50):
        mymodel.fit([train_X, train_s, train_emb], train_y, epochs=1, batch_size=64, shuffle=True, validation_data=([test_X, test_s, test_emb], test_y), verbose = i < 5)
        l.append(mymodel([test_X, test_s, test_emb]))
    x = min(l, key=lambda x:mean_squared_error(x, test_y))
    print('Best', bench, ':', mean_squared_error(x, test_y))
    d = pd.DataFrame(x)
    test_preds = pd.concat([test_preds, d], ignore_index=True)
np.save(f'/kaggle/working/dnn_names_3', test_ids.to_numpy())
np.save(f'/kaggle/working/dnn_test_y_3', test_outputs.to_numpy())
np.save(f'/kaggle/working/dnn_preds_3', test_preds.to_numpy())

(24, 1)
0.09418450924333917
Best 400 : 0.056239205524090945
(416, 1)
0.04042166851343248
Best 401 : 0.04602825283923197
(720, 1)
0.3534184316968012
Best 403 : 0.07269258318414902
(808, 1)
0.7913332303091661
Best 410 : 0.0029275426421590694
(8, 1)
0.012206381855088573
Best 416 : 0.1205834430845593
(6776, 1)
0.12891309692761338
Best 429 : 0.25358535661359793
(1240, 1)
0.6413890241805594
Best 433 : 0.009049030746854822
(88, 1)
0.08228909525872642
Best 434 : 0.03817418769457268
(72, 1)
0.09067339488221313
Best 435 : 0.100085389879108
(112, 1)
0.05264547160959483
Best 436 : 0.004218816072048869
(1000, 1)
0.04287871324663983
Best 437 : 0.008834327196944944
(56, 1)
0.11545010207192206
Best 444 : 0.05087029337186751
(88, 1)
0.11940350592772891
Best 445 : 0.05252235672872341
(32, 1)
0.22122455791876855
Best 447 : 0.18228904892482817
(1464, 1)
0.21019740537784556
Best 450 : 0.22917412802333753
(32, 1)
0.06993015948530909
Best 453 : 0.02058754245065987
(112, 1)
0.055375496116558155
Best 454 : 0.1

In [18]:
print(test_ids.shape, test_outputs.shape, test_preds.shape)

(49304, 4) (49304, 1) (49304, 1)


In [19]:
np.save(f'/kaggle/working/dnn_names_1', test_ids.to_numpy())
np.save(f'/kaggle/working/dnn_test_y_1', test_outputs.to_numpy())
np.save(f'/kaggle/working/dnn_preds_1', test_preds.to_numpy())

In [20]:
x = min(l, key=lambda x:mean_squared_error(x, test_y))
d = pd.DataFrame(x)

In [21]:
cp = pd.read_csv('/kaggle/working/id_col_lru_other_6b.csv')
tmp = []
for x in l:
    tmp.append(calc_err(x, test_y, cp))
print(test_y.shape)
x = min(l, key=lambda x:np.mean(calc_err(x, test_y, cp)[:-2]))
torch.save(x, f'/kaggle/working/lru_changing_6b_cols.pt')
print(calc_err(x, test_y, cp))
print(min(tmp, key=lambda x : np.mean(x[:-2])))

FileNotFoundError: [Errno 2] No such file or directory: '/kaggle/working/id_col_lru_other_6b.csv'

In [None]:
def calc_err(x, y, col_path):
    ids = col_path
    ids['out'] = (x - y)
#     ids['out_abs'] = ids['1024'].abs()
    
    tmp = []
    mean_df = pd.DataFrame()
    for i in set(ids["column"]):
        means = ids[ids["column"]==i].groupby('file_name')['out'].mean().to_frame()
        means['asp'] = [x.split('-')[0] for x in means.index]
        means['weights'] = [float(weights[x]) for x in means.index]
        #means['weights'] = means.groupby('asp')['weights'].apply(lambda x: x / x.sum())
        l = means.groupby('asp')['weights'].sum()
        means['sweights'] = [l[x] for x in means['asp']]
        means['weights'] = means['weights'] / means['sweights']
        means['out'] = means['out'] * means['weights']
        means = means.groupby('asp')['out'].sum().abs() # .apply(gmean) # apply(lambda x: x.max() - x.min())
        tmp.append(gmean(means))
#     print(i, gmean(means.abs()))
#     plt.figure()
#     plt.hist(means.abs(), np.linspace(0,1.0,101))
# #         plt.yticks([*range(0,int(plt.yticks()[0][-1]))])
#     plt.title('Histogram of predictions')
#     plt.ylabel('# benchmarks')
#     plt.xlabel('Error')
#     plt.show()
    del ids
    return *tmp, means.mean(), mean_squared_error(x, y)
tmp = []
calc_err(l[-1], test_y, cp)

In [None]:
for i in l:
    calc_error(i, test_y)

In [None]:
x = min(l, key=lambda x:mean_squared_error(x, test_y))
torch.save(x, f'/kaggle/working/lru_other')
x = torch.from_numpy(test_y)
torch.save(x, f'/kaggle/working/test_y_other.')

In [None]:
x = min(l, key=lambda x:mean_absolute_error(x, test_y))
print(mean_absolute_error(x, test_y))
torch.save(x, f'/kaggle/working/lru_other.pt')
#x = torch.from_numpy(test_y)
#torch.save(x, f'/kaggle/working/test_y_other.pt')

In [None]:
def calc_err(x, y, col_path):
    ids = col_path
    ids['1024'] = (x - y)
    ids['1024_abs'] = ids['1024'].abs()
    tmp = []
    mean_df = pd.DataFrame()
    i = '1024'
    means = ids.groupby('file_name')[i].mean().to_frame()
    means['asp'] = [x.split('-')[0] for x in means.index]
    means['weights'] = [float(weights[x]) for x in means.index]
    #means['weights'] = means.groupby('asp')['weights'].apply(lambda x: x / x.sum())
    l = means.groupby('asp')['weights'].sum()
    means['sweights'] = [l[x] for x in means['asp']]
    means['weights'] = means['weights'] / means['sweights']
    means[i] = means[i] * means['weights']
    means = means.groupby('asp')[i].mean().abs() # .apply(gmean) # apply(lambda x: x.max() - x.min())
#     print(gmean(means))
#     print(means.mean())
#     print(i, gmean(means.abs()))
#     plt.figure()
#     plt.hist(means.abs(), np.linspace(0,1.0,101))
# #         plt.yticks([*range(0,int(plt.yticks()[0][-1]))])
#     plt.title('Histogram of predictions')
#     plt.ylabel('# benchmarks')
#     plt.xlabel('Error')
#     plt.show()
    return gmean(means), means.mean()
cp = pd.read_csv('/kaggle/working/id_col_lru_other.csv')
tmp = []
for x in l:
    tmp.append(calc_err(x, test_y, cp))

x = min(l, key=lambda x:calc_err(x, test_y, cp)[0])
print(calc_err(x, test_y, cp))
calc_err(x, test_y, cp)
print(min(tmp, key=lambda x : x[0]))
print(min(tmp, key=lambda x : x[1]))