# Imports and data loading

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np

In [None]:
# Folder that holds the raw CSV and receives every CSV written by the cells below.
# NOTE(review): looks like a Google Drive mount path (Colab) — confirm the drive is mounted before running.
path = "/content/drive/MyDrive/Thesis/Datasets/Pasteurizer_dataset/"

In [None]:
# Load the raw samples. header=None: the first line of the file is data, not column names.
df = pd.read_csv(f"{path}dataset_step_by_step.csv", header=None)
df

# Setting up the feature names

In [None]:
df.head()

In [None]:
# Reorder the columns: raw column 3 moves to the front, the others keep their relative order.
cols = [3, 0, 1, 2, 4, 5, 6]
df = df.reindex(cols, axis="columns")
df.head()

In [None]:
# Human-readable names for the seven columns, in their current (reordered) positions.
features = [
    'paster_timeslot',
    'prev_water_temp',
    'prev_can_temp',
    'curr_water_temp',
    'bath_number',
    'curr_bath_temp',
    'curr_can_temp',
]
df.columns = features
df.head()

In [None]:
df

# Add "id" column

In [None]:
df

In [None]:
# We need to find a way to put a distinct "id" for every pasteurization

In [None]:
# Within one pasteurization the paster timeslot never decreases: after each sampling it
# either increases by 10 or stays the same.
# So if

# current_timeslot < previous_timeslot (1)

# the timeslot has restarted, which means this row is the start of a new pasteurization
# (this is the `diff < 0` test used in the cells below).

In [None]:
# Prepend a dummy row at the beginning of the dataset so that the boundary test (1)
# also fires on the first real pasteurization.
# NOTE(review): 2010 is presumably larger than any first timeslot — confirm against the data.
df.loc[-1] = [2010, 99, 99, 99, 99, 99, 99]  # label -1 sits below every existing row label
df.index += 1  # shift all labels up by one so the dummy row gets label 0
df = df.sort_index()  # sorting by label moves the dummy row to the top
df

In [None]:
# Plain Python list of the timeslot column (dummy row included), convenient for index-based scans.
slots = list(df["paster_timeslot"])

In [None]:
# Walk over consecutive timeslot pairs and print every place where the value drops,
# i.e. the first sample of each pasteurization together with the sample before it.

for previous, current in zip(slots, slots[1:]):
    if current - previous < 0:
        print(f"current: {current}, previous: {previous}")

In [None]:
len(slots)

59151

In [None]:
# Build one pasteurization id per row of `slots`.
# The id starts at 0 and is incremented every time the timeslot drops below the
# previous row's timeslot, so all rows of one pasteurization share the same id.
# With the dummy row in position 0, the dummy gets id 0 and the first real
# pasteurization gets id 1.
id_list = []
paster_id = 0  # running id; deliberately not named `id`, which would shadow the builtin
previous_slot = None
for slot in slots:
    # A drop relative to the previous row marks the first sample of a new pasteurization.
    if previous_slot is not None and slot - previous_slot < 0:
        paster_id += 1
    id_list.append(paster_id)
    previous_slot = slot

In [None]:
len(id_list)

59151

In [None]:
# Attach the ids as the first column, then discard the dummy row (position 0)
# and renumber the remaining rows from 0.
df.insert(loc=0, column='paster_id', value=id_list)
df = df.iloc[1:].reset_index(drop=True)
df


In [None]:
df

In [None]:
# Persist the complete, id-annotated dataset (row labels are not written).
df.to_csv(f"{path}paster_dataset_full.csv", index=False)

# Shuffle pasteurization blocks

In [None]:
df

In [None]:
# Distinct pasteurization ids, in order of first appearance in the dataset.
ids = df["paster_id"].unique()
ids

In [None]:
# One DataFrame per pasteurization, in the same order as `ids`.
# The loop variable is not named `id`, so the builtin is no longer shadowed
# in the notebook namespace.
pasteurization_blocks = [df.loc[df["paster_id"] == paster_id] for paster_id in ids]

In [None]:
type(pasteurization_blocks)

list

In [None]:
type(pasteurization_blocks[0])

pandas.core.frame.DataFrame

In [None]:
len(pasteurization_blocks)

265

In [None]:
pasteurization_blocks[0]

In [None]:
import random

In [None]:
# Shuffle the list of per-pasteurization DataFrames in place. A dedicated
# random.Random instance with a fixed seed (35) keeps the order reproducible.
# NOTE(review): a later cell rebuilds this list and reshuffles it with seed 30 —
# confirm which ordering the train/test split is meant to use.
random.Random(35).shuffle(pasteurization_blocks)
pasteurization_blocks[0]

In [None]:
# Read the id off the first row of each block to see the order the shuffle produced.
shuffled_ids = [block["paster_id"].iloc[0] for block in pasteurization_blocks]
np.array(shuffled_ids)

In [None]:
import random

# Self-contained version of the previous cells: rebuild the per-pasteurization
# blocks from `df` and shuffle them with a fixed seed (30) for reproducibility.
# Because the list is rebuilt here, any earlier shuffle of it is discarded.
ids = df.paster_id.unique()
# The loop variable is not named `id`, so the builtin is no longer shadowed.
pasteurization_blocks = [df.loc[df["paster_id"] == paster_id] for paster_id in ids]
# Shuffle the dataframe list in place
random.Random(30).shuffle(pasteurization_blocks)
# Id of each block, in shuffled order
shuffled_ids = [block.paster_id.iloc[0] for block in pasteurization_blocks]

# Train test split (centralized and federated)

In [None]:
# The centralized dataset will be split into 80% train and 20% test
# (meaning that the training/test set will contain 80/20% of the pasteurization blocks)

# Similarly the federated dataset will be split as follows:
# alice set:40%
# bob set:40%
# test set:20%

In [None]:
len(pasteurization_blocks)

265

In [None]:
len(pasteurization_blocks) * 0.2

53.0

In [None]:
len(pasteurization_blocks) * 0.4

106.0

In [None]:
len(pasteurization_blocks) * 0.8

212.0

In [None]:
# Derive the split points from the number of blocks instead of hard-coding them.
# Integer arithmetic avoids float rounding; for the 265 blocks of this dataset
# it yields 212 (80%) and 106 (40%).
n_blocks = len(pasteurization_blocks)
n_train = n_blocks * 4 // 5  # first 80% of the shuffled blocks are used for training
n_alice = n_blocks * 2 // 5  # first 40% go to alice, the rest of the training part to bob

# Centralized: 80% train / 20% test
train_centralized = pasteurization_blocks[:n_train]
test_centralized = pasteurization_blocks[n_train:]

# Federated: alice 40% / bob 40% / test 20% (same test blocks as the centralized split)
alice_federated = pasteurization_blocks[:n_alice]
bob_federated = pasteurization_blocks[n_alice:n_train]
test_federated = pasteurization_blocks[n_train:]

In [None]:
# Stack each group of pasteurization blocks into a single DataFrame.
train_set = pd.concat(train_centralized)
test_set = pd.concat(test_centralized)

alice_set = pd.concat(alice_federated)
bob_set = pd.concat(bob_federated)
# NOTE: this rebinds test_set. test_centralized and test_federated are the same
# slice of pasteurization_blocks, so the content is unchanged; if the two splits
# ever diverge, the centralized test set would be silently replaced here.
test_set = pd.concat(test_federated)

In [None]:
path

'/content/drive/MyDrive/Thesis/Datasets/Pasteurizer_dataset/'

In [None]:
import os

# DataFrame.to_csv does not create missing folders, so make sure both output
# folders exist before writing (no-op when they are already there).
centralized_dir = os.path.join(path, "centralized")
federated_dir = os.path.join(path, "federated")
for out_dir in (centralized_dir, federated_dir):
    os.makedirs(out_dir, exist_ok=True)

# Centralized split
train_set.to_csv(os.path.join(centralized_dir, "train_set.csv"), index=False)
test_set.to_csv(os.path.join(centralized_dir, "test_set.csv"), index=False)

# Federated split (the test set is the same DataFrame as in the centralized split)
alice_set.to_csv(os.path.join(federated_dir, "alice_set.csv"), index=False)
bob_set.to_csv(os.path.join(federated_dir, "bob_set.csv"), index=False)
test_set.to_csv(os.path.join(federated_dir, "test_set.csv"), index=False)