In [1]:
import os 
import csv
import random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from keras.models import Sequential
from keras.layers import Dense
from keras.metrics import MeanSquaredError
from keras.losses import MeanSquaredError

In [2]:
random.seed(42)
path = '../ce_data/output_512/dataset_ship.csv'

In [3]:
def read_df(path):
    # Read csv
    df = pd.read_csv(path)
    # Calculate miss ratios in seperate dataframe
    miss_ratios = pd.DataFrame()
    for i in ["1024","2048","4096","8192"]:
        miss_ratios[i] = df['misses_'+i] / df['accesses_'+i]
        df = df.drop(columns=['misses_'+i, 'accesses_'+i])
        if 'l1d_accesses_'+i in df.columns:
            df = df.drop(columns=['l1d_accesses_'+i])
    # Append filename column
    filenames = []
    for i in df["id"]:
        nmb = i.split('_')[-1]
        filenames.append(i[:-len(nmb)-1])
    df["file_name"] = filenames
    filenames = list(set(filenames))
    return df, miss_ratios

There's a chance that reuse distance histograms within files are similar to each other. This may be due to them being sampled from the same files and therefore generated from the same code, so the histograms will be similar. To avoid this interfering with our process, we'll be moving all the lines that are derived from randomly picked files until they add up to 20% of the dataset. 

In [4]:
def train_test_split(df, miss_ratios, split=0.3):
    filenames = list(set(df["file_name"]))
    collected_rows = 0
    files = []
    while len(df) * split > collected_rows:
        rf = filenames[random.randint(0,188)]
        if rf in files: # No duplicates
            continue
        files.append(rf)
        collected_rows += len(df[df["file_name"] == rf])

    train_rds = df[~df["file_name"].isin(files)]
    train_mrs = miss_ratios[~df["file_name"].isin(files)]
    test_rds = df[df["file_name"].isin(files)]
    test_mrs = miss_ratios[df["file_name"].isin(files)]
    print(f"The test set consists of {len(files)} files (out of 189) with a total of {collected_rows} rows (out of {len(df)})")
    train_rds = train_rds.drop(columns=["id", "file_name"])
    test_rds = test_rds.drop(columns=["id", "file_name"])
    return train_rds, train_mrs, test_rds, test_mrs
#train_rds, train_mrs, test_rds, test_mrs = train_test_split(rds, mrs)

In [5]:
def create_model(layers=[896,896,1]):
    model = Sequential()
    model.add(Dense(896, input_dim=896, activation='sigmoid'))
    
    # Add layers
    for i in range(1, len(layers)):
        model.add(Dense(layers[i], activation='sigmoid'))
    
    # Compile the model
    model.compile(loss=MeanSquaredError() , optimizer='adam')
    return model

In [6]:
rds, mrs = read_df('../ce_data/output_512/dataset_ship.csv')
train_rds, train_mrs, test_rds, test_mrs = train_test_split(rds, mrs, split=0.2)

The test set consists of 37 files (out of 189) with a total of 1261 rows (out of 6182)


In [7]:
j = ['1024', '2048', '4096', '8192']
m = create_model([896,896,len(j)])
m.summary()
for i in range(10):
    m.fit(train_rds.to_numpy(), train_mrs[j].to_numpy(), epochs=1)
#    for i,k in enumerate(j):
#        print(mean_squared_error(m(train_rds.to_numpy())[:,i], train_mrs[k].to_numpy()))
    for i,k in enumerate(j):
        print(f'Mean squared error for cache size {k}\t: {mean_squared_error(m(test_rds.to_numpy())[:,i], test_mrs[k].to_numpy())}')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 896)               803712    
                                                                 
 dense_1 (Dense)             (None, 896)               803712    
                                                                 
 dense_2 (Dense)             (None, 4)                 3588      
                                                                 
Total params: 1,611,012
Trainable params: 1,611,012
Non-trainable params: 0
_________________________________________________________________
Mean squared error for cache size 1024	: 0.024087349723058114
Mean squared error for cache size 2048	: 0.056733259480368024
Mean squared error for cache size 4096	: 0.0913607822780049
Mean squared error for cache size 8192	: 0.2006422161192692
Mean squared error for cache size 1024	: 0.021033973145429593
Mean

In [8]:
rds, mrs = read_df('../ce_data/output_512/dataset_lru.csv')
# print(rds, mrs)
train_rds, train_mrs, test_rds, test_mrs = train_test_split(rds, mrs, split=0.2)

The test set consists of 59 files (out of 189) with a total of 1287 rows (out of 6183)


In [9]:
j = ['1024', '2048', '4096', '8192']
m = create_model([896,896,len(j)])
m.summary()
#for i,k in enumerate(j):
#   print(mean_squared_error(m(train_rds.to_numpy())[:,i], train_mrs[k].to_numpy()))
for i in range(6):
    m.fit(train_rds.to_numpy(), train_mrs[j].to_numpy(), epochs=1)
#    for i,k in enumerate(j):
#        print(mean_squared_error(m(train_rds.to_numpy())[:,i], train_mrs[k].to_numpy()))
    for i,k in enumerate(j):
        print(f'Mean squared error for cache size {k}\t: {mean_squared_error(m(test_rds.to_numpy())[:,i], test_mrs[k].to_numpy())}')

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_3 (Dense)             (None, 896)               803712    
                                                                 
 dense_4 (Dense)             (None, 896)               803712    
                                                                 
 dense_5 (Dense)             (None, 4)                 3588      
                                                                 
Total params: 1,611,012
Trainable params: 1,611,012
Non-trainable params: 0
_________________________________________________________________
Mean squared error for cache size 1024	: 0.02535776785415273
Mean squared error for cache size 2048	: 0.047127148232584605
Mean squared error for cache size 4096	: 0.057039784410750034
Mean squared error for cache size 8192	: 0.08403313409760366
Mean squared error for cache size 1024	: 0.025319449331812852


In [10]:
print(train_mrs)

          1024      2048      4096      8192
0     0.891123  0.795601  0.662782  0.123358
1     0.866846  0.765354  0.660919  0.077643
2     0.848553  0.727294  0.641544  0.262694
3     0.622860  0.225061  0.040571  0.001862
4     0.588169  0.200199  0.032464  0.001848
...        ...       ...       ...       ...
6178  0.785621  0.600917  0.436724  0.297468
6179  0.739754  0.549444  0.413295  0.301718
6180  0.934980  0.883098  0.810977  0.632974
6181  0.949653  0.904684  0.823035  0.558143
6182  1.000000  1.000000  1.000000  1.000000

[4896 rows x 4 columns]


In [11]:
print(train_mrs.to_numpy())

[[0.89112255 0.79560149 0.66278225 0.12335835]
 [0.86684567 0.76535388 0.66091937 0.07764284]
 [0.84855273 0.72729372 0.64154352 0.26269409]
 ...
 [0.93497967 0.88309828 0.81097736 0.6329736 ]
 [0.94965329 0.90468371 0.8230353  0.5581428 ]
 [1.         1.         1.         1.        ]]


In [12]:
rds, mrs = read_df('../ce_data/output_512/dataset_mj.csv')
# print(rds, mrs)
train_rds, train_mrs, test_rds, test_mrs = train_test_split(rds, mrs, split=0.1)

The test set consists of 22 files (out of 189) with a total of 766 rows (out of 6182)


In [13]:
j = ['1024', '2048', '4096', '8192']
m = create_model([896,896,len(j)])
m.summary()
#for i,k in enumerate(j):
#   print(mean_squared_error(m(train_rds.to_numpy())[:,i], train_mrs[k].to_numpy()))
for i in range(10):
    m.fit(train_rds.to_numpy(), train_mrs[j].to_numpy(), epochs=1)
#    for i,k in enumerate(j):
#        print(mean_squared_error(m(train_rds.to_numpy())[:,i], train_mrs[k].to_numpy()))
    for i,k in enumerate(j):
        print(f'Mean squared error for cache size {k}\t: {mean_squared_error(m(test_rds.to_numpy())[:,i], test_mrs[k].to_numpy())}')

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_6 (Dense)             (None, 896)               803712    
                                                                 
 dense_7 (Dense)             (None, 896)               803712    
                                                                 
 dense_8 (Dense)             (None, 4)                 3588      
                                                                 
Total params: 1,611,012
Trainable params: 1,611,012
Non-trainable params: 0
_________________________________________________________________
Mean squared error for cache size 1024	: 0.09342126770688144
Mean squared error for cache size 2048	: 0.02668399581363608
Mean squared error for cache size 4096	: 0.03988276473244218
Mean squared error for cache size 8192	: 0.0421484686587087
Mean squared error for cache size 1024	: 0.025033938031612066
Mea

In [16]:
rds, mrs = read_df('../ce_data/output_512/dataset_srrip.csv')
# print(rds, mrs)
train_rds, train_mrs, test_rds, test_mrs = train_test_split(rds, mrs, split=0.1)

The test set consists of 29 files (out of 189) with a total of 683 rows (out of 6181)


In [None]:
j = ['1024', '2048', '4096', '8192']
m = create_model([896,896,len(j)])
m.summary()
#for i,k in enumerate(j):
#   print(mean_squared_error(m(train_rds.to_numpy())[:,i], train_mrs[k].to_numpy()))
for i in range(10):
    m.fit(train_rds.to_numpy(), train_mrs[j].to_numpy(), epochs=1)
#    for i,k in enumerate(j):
#        print(mean_squared_error(m(train_rds.to_numpy())[:,i], train_mrs[k].to_numpy()))
    for i,k in enumerate(j):
        print(f'Mean squared error for cache size {k}\t: {mean_squared_error(m(test_rds.to_numpy())[:,i], test_mrs[k].to_numpy())}')

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_12 (Dense)            (None, 896)               803712    
                                                                 
 dense_13 (Dense)            (None, 896)               803712    
                                                                 
 dense_14 (Dense)            (None, 4)                 3588      
                                                                 
Total params: 1,611,012
Trainable params: 1,611,012
Non-trainable params: 0
_________________________________________________________________