# Imports and data loading

In [1]:
import pandas as pd
import seaborn as sns

In [2]:
path = "/content/drive/MyDrive/Thesis/Datasets/Pasteurizer_dataset/"

In [3]:
df = pd.read_csv(path + "dataset_step_by_step.csv", header=None)
df

Unnamed: 0,0,1,2,3,4,5,6
0,34.13,15.76,38.19,20,1,43.0938,15.77
1,38.19,15.77,40.19,30,1,43.0750,15.90
2,40.19,15.90,41.36,40,1,43.0750,15.99
3,41.36,15.99,42.02,50,1,43.0125,16.13
4,42.02,16.13,42.20,60,1,43.0125,16.37
...,...,...,...,...,...,...,...
56583,35.90,40.92,36.02,1790,6,36.9313,40.62
56584,36.02,40.62,36.08,1800,6,37.1000,40.37
56585,36.08,40.37,36.13,1810,6,37.0312,40.14
56586,36.13,40.14,36.19,1820,6,36.9313,39.92


# Setting up the feature names

In [4]:
df.head()

Unnamed: 0,0,1,2,3,4,5,6
0,34.13,15.76,38.19,20,1,43.0938,15.77
1,38.19,15.77,40.19,30,1,43.075,15.9
2,40.19,15.9,41.36,40,1,43.075,15.99
3,41.36,15.99,42.02,50,1,43.0125,16.13
4,42.02,16.13,42.2,60,1,43.0125,16.37


In [5]:
# Change feature order
cols = [3, 0, 1, 2, 4, 5, 6]
df = df.reindex(columns=cols)
df.head()

Unnamed: 0,3,0,1,2,4,5,6
0,20,34.13,15.76,38.19,1,43.0938,15.77
1,30,38.19,15.77,40.19,1,43.075,15.9
2,40,40.19,15.9,41.36,1,43.075,15.99
3,50,41.36,15.99,42.02,1,43.0125,16.13
4,60,42.02,16.13,42.2,1,43.0125,16.37


In [6]:
features = ['paster_timeslot', 'prev_water_temp', 'prev_can_temp', 'curr_water_temp', 'bath_number', 'curr_bath_temp', 'curr_can_temp']
df.columns = features
df.head()

Unnamed: 0,paster_timeslot,prev_water_temp,prev_can_temp,curr_water_temp,bath_number,curr_bath_temp,curr_can_temp
0,20,34.13,15.76,38.19,1,43.0938,15.77
1,30,38.19,15.77,40.19,1,43.075,15.9
2,40,40.19,15.9,41.36,1,43.075,15.99
3,50,41.36,15.99,42.02,1,43.0125,16.13
4,60,42.02,16.13,42.2,1,43.0125,16.37


# Add "id" column

In [7]:
df

Unnamed: 0,paster_timeslot,prev_water_temp,prev_can_temp,curr_water_temp,bath_number,curr_bath_temp,curr_can_temp
0,20,34.13,15.76,38.19,1,43.0938,15.77
1,30,38.19,15.77,40.19,1,43.0750,15.90
2,40,40.19,15.90,41.36,1,43.0750,15.99
3,50,41.36,15.99,42.02,1,43.0125,16.13
4,60,42.02,16.13,42.20,1,43.0125,16.37
...,...,...,...,...,...,...,...
56583,1790,35.90,40.92,36.02,6,36.9313,40.62
56584,1800,36.02,40.62,36.08,6,37.1000,40.37
56585,1810,36.08,40.37,36.13,6,37.0312,40.14
56586,1820,36.13,40.14,36.19,6,36.9313,39.92


In [8]:
# We need to find a way to put a distinct "id" for every pasteurization

In [9]:
# For every pasteurization the feature of the paster timeslot is increasing by 10 or stays the same after each sampling
# So if 

# current_timeslot > (previous_timeslot+10) (1)

# This means that this is the start of a new pasteurization

In [10]:
# We need to add a dummy row at the beggining of the dataset so that equation (1) is true for the first pasteurization
df.loc[-1] = [2010,	99,	99,	99,	99,	99,	99] # adding a dummy row at the end
df.index = df.index + 1  # shifting index
df.sort_index(inplace=True) 
df

Unnamed: 0,paster_timeslot,prev_water_temp,prev_can_temp,curr_water_temp,bath_number,curr_bath_temp,curr_can_temp
0,2010,99.00,99.00,99.00,99,99.0000,99.00
1,20,34.13,15.76,38.19,1,43.0938,15.77
2,30,38.19,15.77,40.19,1,43.0750,15.90
3,40,40.19,15.90,41.36,1,43.0750,15.99
4,50,41.36,15.99,42.02,1,43.0125,16.13
...,...,...,...,...,...,...,...
56584,1790,35.90,40.92,36.02,6,36.9313,40.62
56585,1800,36.02,40.62,36.08,6,37.1000,40.37
56586,1810,36.08,40.37,36.13,6,37.0312,40.14
56587,1820,36.13,40.14,36.19,6,36.9313,39.92


In [11]:
# We create a list of the column for convenience
slots = list(df.paster_timeslot)

In [12]:
# Taking the time slots as a list we see what is the beggining timestamp of every pasteurization

for count in range(1, len(slots)):
  diff = slots[count] - slots[count-1]
  if diff < 0:
    print(f"current: {slots[count]}, previous: {slots[count-1]}")

current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 10, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 1830
current: 20, previous: 1890
current: 20, previous: 1790
current: 20, previous: 1890
current: 20, previous: 1960
current: 20, previous: 1880
current: 20, previous: 1860
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 1830
current: 20, previous: 1790
current: 20, previous: 1970
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 2010
current: 20, previous: 1850
current: 20, previou

In [13]:
len(slots)

56589

In [14]:
# We now create an id list
id_list = []
id = 0
for count in range(1, len(slots)):
  id_list.append(id)
  diff = slots[count] - slots[count-1]
  if diff < 0:
    id = id + 1
id_list.append(id)

In [15]:
len(id_list)

56589

In [16]:
# Insert id column and delete the first dummy row
df.insert(loc=0, column='paster_id', value=id_list)
df = df.iloc[1: , :]
df = df.reset_index().drop(["index"], axis=1)
df


Unnamed: 0,paster_id,paster_timeslot,prev_water_temp,prev_can_temp,curr_water_temp,bath_number,curr_bath_temp,curr_can_temp
0,1,20,34.13,15.76,38.19,1,43.0938,15.77
1,1,30,38.19,15.77,40.19,1,43.0750,15.90
2,1,40,40.19,15.90,41.36,1,43.0750,15.99
3,1,50,41.36,15.99,42.02,1,43.0125,16.13
4,1,60,42.02,16.13,42.20,1,43.0125,16.37
...,...,...,...,...,...,...,...,...
56583,265,1790,35.90,40.92,36.02,6,36.9313,40.62
56584,265,1800,36.02,40.62,36.08,6,37.1000,40.37
56585,265,1810,36.08,40.37,36.13,6,37.0312,40.14
56586,265,1820,36.13,40.14,36.19,6,36.9313,39.92


In [17]:
df

Unnamed: 0,paster_id,paster_timeslot,prev_water_temp,prev_can_temp,curr_water_temp,bath_number,curr_bath_temp,curr_can_temp
0,1,20,34.13,15.76,38.19,1,43.0938,15.77
1,1,30,38.19,15.77,40.19,1,43.0750,15.90
2,1,40,40.19,15.90,41.36,1,43.0750,15.99
3,1,50,41.36,15.99,42.02,1,43.0125,16.13
4,1,60,42.02,16.13,42.20,1,43.0125,16.37
...,...,...,...,...,...,...,...,...
56583,265,1790,35.90,40.92,36.02,6,36.9313,40.62
56584,265,1800,36.02,40.62,36.08,6,37.1000,40.37
56585,265,1810,36.08,40.37,36.13,6,37.0312,40.14
56586,265,1820,36.13,40.14,36.19,6,36.9313,39.92


In [18]:
# Save full dataset
df.to_csv(path + "paster_dataset_full.csv", index=False)

# Shuffle pasteurization blocks

In [19]:
df

Unnamed: 0,paster_id,paster_timeslot,prev_water_temp,prev_can_temp,curr_water_temp,bath_number,curr_bath_temp,curr_can_temp
0,1,20,34.13,15.76,38.19,1,43.0938,15.77
1,1,30,38.19,15.77,40.19,1,43.0750,15.90
2,1,40,40.19,15.90,41.36,1,43.0750,15.99
3,1,50,41.36,15.99,42.02,1,43.0125,16.13
4,1,60,42.02,16.13,42.20,1,43.0125,16.37
...,...,...,...,...,...,...,...,...
56583,265,1790,35.90,40.92,36.02,6,36.9313,40.62
56584,265,1800,36.02,40.62,36.08,6,37.1000,40.37
56585,265,1810,36.08,40.37,36.13,6,37.0312,40.14
56586,265,1820,36.13,40.14,36.19,6,36.9313,39.92


In [20]:
ids = df.paster_id.unique()
ids

array([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,  91,
        92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103, 104,
       105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
       118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143,
       144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156,
       157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169,
       170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 18

In [21]:
pasteurization_blocks = []
for id in ids:
  pasteurization_blocks.append(df.loc[ df["paster_id"] == id ])

In [22]:
type(type(pasteurization_blocks))

type

In [23]:
type(pasteurization_blocks[0])

pandas.core.frame.DataFrame

In [24]:
len(pasteurization_blocks)

265

In [25]:
pasteurization_blocks[0]

Unnamed: 0,paster_id,paster_timeslot,prev_water_temp,prev_can_temp,curr_water_temp,bath_number,curr_bath_temp,curr_can_temp
0,1,20,34.13,15.76,38.19,1,43.0938,15.77
1,1,30,38.19,15.77,40.19,1,43.0750,15.90
2,1,40,40.19,15.90,41.36,1,43.0750,15.99
3,1,50,41.36,15.99,42.02,1,43.0125,16.13
4,1,60,42.02,16.13,42.20,1,43.0125,16.37
...,...,...,...,...,...,...,...,...
195,1,1970,47.04,52.07,46.61,6,31.3375,51.72
196,1,1980,46.61,51.72,46.35,6,31.3375,51.38
197,1,1990,46.35,51.38,46.17,6,31.2437,51.08
198,1,2000,46.17,51.08,46.06,6,31.2812,50.76


In [26]:
import random

In [27]:
random.Random(0).shuffle(pasteurization_blocks)
pasteurization_blocks[0]

Unnamed: 0,paster_id,paster_timeslot,prev_water_temp,prev_can_temp,curr_water_temp,bath_number,curr_bath_temp,curr_can_temp
27065,129,20,32.84,29.79,34.81,1,40.0312,29.83
27066,129,30,34.81,29.83,36.60,1,40.0312,29.88
27067,129,40,36.60,29.88,36.28,1,40.0312,29.94
27068,129,50,36.28,29.94,35.90,1,40.0062,30.05
27069,129,60,35.90,30.05,36.49,1,40.0062,30.22
...,...,...,...,...,...,...,...,...
27260,129,1970,47.71,49.64,47.35,6,36.6437,49.42
27261,129,1980,47.35,49.42,47.25,6,36.6437,49.22
27262,129,1990,47.25,49.22,47.28,6,36.5438,49.04
27263,129,2000,47.28,49.04,47.29,6,36.4500,48.88


# Train test split (centralized and federated)

In [28]:
# The centralized dataset will be split into 80% train and 20% test
# (meaning that the training/test set will contain 80/20% of the pasteurization blocks)

# Similarly the federated dataset will be split as follows:
# alice set:40%
# bob set:40%
# test set:20%

In [29]:
len(pasteurization_blocks)

265

In [30]:
len(pasteurization_blocks) * 0.2

53.0

In [31]:
len(pasteurization_blocks) * 0.4

106.0

In [32]:
len(pasteurization_blocks) * 0.8

212.0

In [33]:
train_centralized = pasteurization_blocks[:212]
test_centralized = pasteurization_blocks[212:]

alice_federated = pasteurization_blocks[:106]
bob_federated = pasteurization_blocks[106:212]
test_federated = pasteurization_blocks[212:]

In [34]:
train_set = pd.concat(train_centralized)
test_set = pd.concat(test_centralized)

alice_set = pd.concat(alice_federated)
bob_set = pd.concat(bob_federated)
test_set = pd.concat(test_federated)

In [35]:
path

'/content/drive/MyDrive/Thesis/Datasets/Pasteurizer_dataset/'

In [36]:
train_set.to_csv(path + "centralized/train_set.csv", index=False)
test_set.to_csv(path + "centralized/test_set.csv", index=False)

alice_set.to_csv(path + "federated/alice_set.csv", index=False)
bob_set.to_csv(path + "federated/bob_set.csv", index=False)
test_set.to_csv(path + "federated/test_set.csv", index=False)