In [152]:
import sys
import pandas as pd
import numpy as np
import logging
import math
import os
from enum import Enum
from tqdm import tqdm

pd.options.mode.copy_on_write = True

#TODO: improve configuration
NUMBER_OF_SAMPLES = 24
# DATA_FILE = "data/mini-baidu-dataset.csv"
# DATA_FILE = "baidu-dataset.csv"
DATA_FILE = "data/baidu-dataset.csv"
CHANGE_RATE_INTERVAL = 6
FEATURE_COUNT = 2
HEALTH_CLASSES_COUNT = 2

class HealthStatusAlgorithm(Enum):
	LINEAR = 1

class FeatureSelectionAlgorithm(Enum):
	Z_SCORE = 1

def computeChangeRates(df, interval):
	# Remove the serial number
	ratesColumns = list(df.columns)[2:]
	titles = [column + " Change Rate" for column in ratesColumns]
	for title in titles:
		df[title] = [None for i in range(len(df))]

	for idx, column in enumerate(ratesColumns):
		tmpValues = list(df[column])
		tmpValues = [np.nan for i in range(interval)] + tmpValues[:-interval]
		dif = np.subtract(df[column], tmpValues)
		df[titles[idx]] = dif

	df = df.dropna()

	return df

def z_score(goodSamples, badSamples):
	nf = len(badSamples)
	ng = len(goodSamples)
	mf = np.average(badSamples)
	mg = np.average(goodSamples)
	vf = np.var(badSamples)
	vg = np.var(goodSamples)

	if vf == 0 and vg == 0:
		return 0

	return math.fabs(mf-mg)/math.sqrt(vf/nf + vg/ng)

def featureSelection(df, algorithm, toKeepCount):
	func = None
	match algorithm:
		case FeatureSelectionAlgorithm.Z_SCORE:
			func = z_score

	# Remove the serial and status
	columns = list(df.columns)[2:]
	good_hard_drives = df[df["Drive Status"] == 1]
	bad_hard_drives = df[df["Drive Status"] == -1]

	results = []

	for col in columns:
		goodSamples = list(good_hard_drives[col])
		badSamples = list(bad_hard_drives[col])

		results.append((func(goodSamples, badSamples), col))

	results.sort()
	toKeep = list(df.columns)[0:2] + [result[1] for result in results][:toKeepCount]

	return df[toKeep]

# Keeps only the last N samples of each disk on the dataframe
def getLastSamples(df, N):
	serialNumbers = df["serial-number"].unique()
	toKeep = []
	for serialNumber in serialNumbers:
		indices = df[df["serial-number"] == serialNumber].index[-N:]
		for index in indices:
			toKeep.append(index)

	return df.loc[toKeep]

# Linearly map the values [0,n-1] to [maxi, mini]
# TODO: add an option to decide if the bigger or smaller intervals get less elements
# For now, there are more elements with bigger values
def LinearAlgorithm(mini, maxi, i, n):
	return maxi - math.floor((maxi-(mini-1))*i/n) - 1

# A column with a score in [1,maxLevel] is given to each sample
# If good is set, it is always equal to maxLevel
# Else the algorithm is used to give a score in [1,maxLevel-1]
def addHealthStatus(df, good, algorithm, maxLevel):
	if good:
		df["Health Status"] = [maxLevel - 1 for i in range(len(df))]
		return df

	func = None

	match algorithm:
		case HealthStatusAlgorithm.LINEAR:
			func = LinearAlgorithm

	serialNumbers = df["serial-number"].unique()
	healthStatusValues = []
	for serialNumber in serialNumbers:
		cnt = len(df[df["serial-number"] == serialNumber])
		newValues = [func(1, maxLevel-1, i, cnt) for i in range(cnt)]
		healthStatusValues = healthStatusValues + newValues

	df["Health Status"] = healthStatusValues
	return df

In [153]:
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO,format='%(message)s')

logger.info("Started")

logger.info("Reading data file")

data = pd.DataFrame()

data = pd.read_csv(DATA_FILE)

logger.info("Computing change rates")
data = computeChangeRates(data, CHANGE_RATE_INTERVAL)

good_hard_drives = data[data["Drive Status"] == 1]
bad_hard_drives = getLastSamples(data[data["Drive Status"] == -1], NUMBER_OF_SAMPLES)

data = pd.concat([bad_hard_drives, good_hard_drives])

# TODO: check if the features change a lot when CHANGE_RATE_INTERVAL and NUMBER_OF_SAMPLES change
logger.info("Selecting %d features using the %s algorithm", FEATURE_COUNT, FeatureSelectionAlgorithm.Z_SCORE.name)
data = featureSelection(data, FeatureSelectionAlgorithm.Z_SCORE, FEATURE_COUNT)
logger.info("Features kept: %s", str(list(data.columns)[2:]))

good_hard_drives = data[data["Drive Status"] == 1]
bad_hard_drives = getLastSamples(data[data["Drive Status"] == -1], NUMBER_OF_SAMPLES)

bad_hard_drives = addHealthStatus(bad_hard_drives, False, HealthStatusAlgorithm.LINEAR, HEALTH_CLASSES_COUNT)
good_hard_drives = addHealthStatus(good_hard_drives, True, HealthStatusAlgorithm.LINEAR, HEALTH_CLASSES_COUNT)

Started
Reading data file


Computing change rates
Selecting 2 features using the Z_SCORE algorithm
Features kept: ['Power on Hours Change Rate', 'Hardware ECC Recovered Change Rate']


In [154]:
import os
import torch
from torch import nn

device = torch.accelerator.current_accelerator().type if torch.accelerator.is_available() else "cpu"
print(f"Using {device} device")

Using cpu device


In [190]:
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(FEATURE_COUNT, 30)
        self.fc2 = nn.Linear(30, 1)

    def forward(self, x):
        x = torch.sigmoid(self.fc1(x))
        x = torch.sigmoid(self.fc2(x))
        return x

model = NeuralNetwork().to(device)
print(model)

NeuralNetwork(
  (fc1): Linear(in_features=2, out_features=30, bias=True)
  (fc2): Linear(in_features=30, out_features=1, bias=True)
)


In [176]:
import math

class LogLoss(nn.Module):
    def __init__(self):
        super(LogLoss, self).__init__()

    def forward(self, output, target):
        # print(output)
        # pred_probab = nn.LogSoftmax()(output)
        # print(pred_probab)
        # return -torch.sum(torch.tensor([math.log(pred_probab[i][target[i]-1]) for i in range(len(output))]))
        return -torch.sum(torch.tensor([output[i][target[i]-1] for i in range(len(output))]))
        return -torch.sum(torch.tensor([pred_probab[i][target[i]-1] for i in range(len(output))]))

loss_fn = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

In [177]:
def train(dataloader, model, loss_fn, optimizer):
    size = len(dataloader.dataset)
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)

        # Compute prediction error
        pred = model(X).squeeze()
        loss = loss_fn(pred, y.float())

        # Backpropagation
        # loss.requires_grad = True
        loss.mean().backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            loss, current = loss.mean(), (batch + 1) * len(X)
            print(f"loss: {loss:>7f}  [{current:>5d}/{size:>5d}]")

def test(dataloader, model, loss_fn):
    size = len(dataloader.dataset)
    num_batches = len(dataloader)
    model.eval()
    test_loss, correct_good, correct_bad = 0.0, 0, 0
    size_good, size_bad = 0, 0
    with torch.no_grad():
        for X, y in dataloader:
            X, y = X.to(device), y.to(device)
            pred = model(X).squeeze()
            test_loss += loss_fn(pred, y.float()).mean()
            # correct += (pred.argmax(1) == y-1).type(torch.float).sum().item()
            for i in range(len(X)):
              if y[i] == HEALTH_CLASSES_COUNT:
                size_good += 1
                correct_good += 1 if pred[i] > 0 else 0
              else:
                size_bad += 1
                correct_bad += 1 if pred[i] < 0 else 0
    test_loss /= num_batches
    correct_good /= size_good
    correct_bad /= size_bad
    # print(size_good)
    # print(size_bad)
    print(f"Test Error: \n FAR: {(100*(1-correct_good)):>0.1f}%, FDR: {(100*correct_bad):>0.1f}% Avg loss: {test_loss:>8f} \n")
    print(f"Avg loss: {test_loss:>8f} \n")

In [158]:
# from torch.utils.data import DataLoader, TensorDataset

# ratio = bad_hard_drives.size / good_hard_drives.size

# train_good = good_hard_drives.sample(frac=0.8 * ratio)
# test_good = good_hard_drives.sample(frac=0.2 * ratio)

# train_bad = bad_hard_drives.sample(frac=0.8)
# test_bad = bad_hard_drives.sample(frac=0.2)

# training_set = pd.concat([train_good, train_bad])
# test_set = pd.concat([test_good, test_bad])

# # training_set = train_good
# # test_set = test_good

# training_set = training_set.drop(["serial-number", "Drive Status"] , axis = 1)
# train_tensor = torch.tensor(training_set.drop("Health Status", axis = 1).values, dtype=torch.float)

# train_dataset = TensorDataset(train_tensor, torch.IntTensor(training_set["Health Status"].to_list()))
# train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True)

# test_set = test_set.drop(["serial-number", "Drive Status"] , axis = 1)
# test_tensor = torch.tensor(test_set.drop("Health Status", axis = 1).values, dtype=torch.float)

# test_dataset = TensorDataset(test_tensor, torch.IntTensor(test_set["Health Status"].to_list()))
# test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True)


In [159]:
torch.manual_seed(1) # This returns that all HDs are failing
# torch.manual_seed(1) # This returns that all HDs are working

# epochs = 200
# model = NeuralNetwork().to(device)
# for t in range(epochs):
#     print(f"Epoch {t+1}\n-------------------------------")
#     train(train_dataloader, model, loss_fn, optimizer)
#     test(test_dataloader, model, loss_fn)
# print("Done!")

<torch._C.Generator at 0x772e6f9f6af0>

In [178]:
ratio = bad_hard_drives.size / good_hard_drives.size

good_hard_drives = good_hard_drives.sample(frac=ratio)

df = pd.concat([good_hard_drives, bad_hard_drives])

df = df.drop(["serial-number", "Drive Status"] , axis = 1)

y=df.pop("Health Status")
X=df

In [179]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test= train_test_split(X,y,test_size=0.2)

In [180]:
from torch.autograd import Variable

x_train= Variable(torch.from_numpy(np.array(x_train)).type(torch.FloatTensor))
x_test= Variable(torch.from_numpy(np.array(x_test)).type(torch.FloatTensor))
y_train= Variable(torch.from_numpy(np.array(y_train)).type(torch.LongTensor))
y_test= Variable(torch.from_numpy(np.array(y_test)).type(torch.LongTensor))

In [191]:
model = NeuralNetwork().to(device)
epochs = 400
for epoch in range(epochs):
    y_pred = model(x_train).squeeze()
    loss = loss_fn(y_pred, y_train.float())             
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    if (epoch+1) % 20 == 0:                                         
        # printing loss values on every 10 epochs to keep track
        print(f'epoch: {epoch+1}, loss = {loss.mean().item():.8f}')

epoch: 20, loss = 0.69418156
epoch: 40, loss = 0.69418156
epoch: 60, loss = 0.69418156
epoch: 80, loss = 0.69418156
epoch: 100, loss = 0.69418156
epoch: 120, loss = 0.69418156
epoch: 140, loss = 0.69418156
epoch: 160, loss = 0.69418156
epoch: 180, loss = 0.69418156
epoch: 200, loss = 0.69418156
epoch: 220, loss = 0.69418156
epoch: 240, loss = 0.69418156
epoch: 260, loss = 0.69418156
epoch: 280, loss = 0.69418156
epoch: 300, loss = 0.69418156
epoch: 320, loss = 0.69418156
epoch: 340, loss = 0.69418156
epoch: 360, loss = 0.69418156
epoch: 380, loss = 0.69418156
epoch: 400, loss = 0.69418156


In [192]:
with torch.no_grad():
    y_predicted = model(x_test)
    print(y_predicted)
    y_predicted_cls = y_predicted.round()
    cnt = [0, 0]
    far, fdr = 0, 0
    for i in range(len(y_predicted)):
        cnt[y_test[i]] += 1
        if(y_predicted_cls[i] == y_test[i]):
            if y_test[i] == 0:
                fdr += 1
            else:
                far += 1
    print(f'FAR: {100*(cnt[1]-far)/cnt[1]}%, FDR: {100*fdr/cnt[0]}%')

tensor([[0.5223],
        [0.5221],
        [0.5216],
        ...,
        [0.5221],
        [0.5208],
        [0.5219]])
FAR: 0.0%, FDR: 0.0%


In [183]:
torch.save(model.state_dict(), "model.pth")