This project merges three datasets: Energy, Productions, and Fermate (machine stops).

In [None]:
from scripts.getDataset import getEntireDataset
from scripts.plots import plot
import pandas as pd

# Display every row when a DataFrame is printed, and keep copy-on-write off.
pd.options.display.max_rows = None
pd.set_option("mode.copy_on_write", False)

# Load a single sample and plot it. The three positional arguments are
# presumably (machineId, year, month), mirroring how fn() calls the same
# loader further down in this notebook -- confirm in scripts.getDataset.
dataset = getEntireDataset(306, 2023, 1)
plot(dataset)

Now I iterate over every machine, fetch its data from the three datasets, and merge them into a single dataset, discarding the results that are not useful for the analysis.

IT1 ~ Now we can check all the datasets and see what we can do with them.

In [None]:
from scripts.plots import plot
from scripts.getDataset import getEntireDataset, forEveryMachine, cleanDataset
import pandas as pd

# When True, fn() prints the reason each machine/month is skipped.
DEBUG = False

# Display every row when a DataFrame is printed, and keep copy-on-write off.
pd.options.display.max_rows = None
pd.set_option("mode.copy_on_write", False)

# Filled by fn() with (cleaned dataset, machineId, year, month) tuples.
most_interesting = list()

def fn(machineId, year, month):
    """Keep one machine/month dataset when it carries enough usable data.

    Loads the dataset through ``getEntireDataset(machineId, year, month,
    False)`` and appends ``(cleanDataset(dataset), machineId, year, month)``
    to the module-level ``most_interesting`` list, unless one of the
    following holds, in which case nothing is appended:

    * the dataset is empty;
    * every non-NaN ``EnergyConsumption`` value equals 0;
    * every non-NaN ``Fermate`` value equals 0;
    * more than half of ``EnergyConsumption`` or of ``Fermate`` is NaN.

    The reason for a skip is printed only when the module-level ``DEBUG``
    flag is true. The function always returns ``None``.
    """

    def debug(*args):
        # Same as print(), but silent unless DEBUG is enabled.
        if DEBUG:
            print(*args)

    frame = getEntireDataset(machineId, year, month, False)
    if frame.empty:
        return

    energy = frame["EnergyConsumption"]
    if energy.dropna().eq(0).all():
        debug("Skipping as EnergyConsumption is all 0")
        return

    stops = frame["Fermate"]
    if stops.dropna().eq(0).all():
        debug("Skipping as Fermate is all 0")
        return

    # Share of missing values per column; the frame is non-empty here, so
    # the denominators cannot be zero.
    energy_nan_rate = energy.isna().sum() / energy.shape[0]
    stops_nan_rate = stops.isna().sum() / stops.shape[0]

    if energy_nan_rate <= 0.5 and stops_nan_rate <= 0.5:
        most_interesting.append((cleanDataset(frame), machineId, year, month))
        return

    debug("Skipping as too many NaN values")
    debug("\tEnergy Rate:", energy_nan_rate)
    debug("\tStops Rate:", stops_nan_rate)

import os

# Create the output directory before the long computation starts, so a
# missing folder cannot make to_csv() fail after all the work is done.
IT1_DIR = "dataset/results/it1"
os.makedirs(IT1_DIR, exist_ok=True)

# Presumably calls fn(machineId, year, month) for every machine/period it
# knows about -- confirm in scripts.getDataset. fn() fills most_interesting.
forEveryMachine(fn)

# First persist and list every kept dataset...
print("\n\n\nThe most interesting are the followings:")
for dataset, machineId, year, month in most_interesting:
    dataset.to_csv(f"{IT1_DIR}/id-{machineId}_{year}-{month}.csv", index=False)
    print(f"- Machine {machineId} - {year}/{month}")

# ...then plot them, so the textual summary is not interleaved with figures.
for dataset, machineId, year, month in most_interesting:
    plot(dataset, machineId, year, month)

IT2 ~ It took 20 minutes to compute everything, but now I want to look at the lifetime of the machines and see if I can find something interesting, merging the data into one file per machine.

In [None]:
from scripts.getSingleDataset.utils import getCleanDataset
import pandas as pd
import os

IT1_DIR = "dataset/results/it1"
IT2_DIR = "dataset/results/it2"

files = [p for p in os.listdir(IT1_DIR) if p.endswith(".csv")]


def _parse_it1_filename(filename):
    """Return ``(machineId, year, month)`` as ints.

    The expected name is ``id-<machineId>_<year>-<month>.csv``, the pattern
    used when the IT1 results are written.
    """
    id_part, period_part = filename.replace(".csv", "").split("_")[:2]
    year_text, month_text = period_part.split("-")[:2]
    return int(id_part.split("-")[1]), int(year_text), int(month_text)


# os.listdir() returns names in arbitrary order, and a plain string sort
# would put "2023-10" before "2023-2". Sorting on the parsed integers makes
# every machine's months come out in chronological order.
ordered_files = sorted((_parse_it1_filename(p), p) for p in files)

# machineId -> list of monthly datasets, oldest first.
compete_dataset = {}

for (machineId, year, month), path in ordered_files:
    # Read the file under its real name rather than a name rebuilt from the
    # parsed integers, so e.g. a zero-padded month still resolves.
    d = getCleanDataset(f"{IT1_DIR}/{path}")
    compete_dataset.setdefault(machineId, []).append(d)

os.makedirs(IT2_DIR, exist_ok=True)

for machineId in compete_dataset:
    # One file per machine holding all of its months, oldest first.
    d = pd.concat(compete_dataset[machineId])

    d.to_csv(f"{IT2_DIR}/id-{machineId}.csv", mode="w", index=False)

IT3 ~ Now I use the Pearson correlation coefficient to measure the correlation between the three datasets (Energy, Productions, and Fermate) for each machine.

In [None]:
import os
from scripts.getSingleDataset.utils import getCleanDataset
from scipy import stats
from scripts.plots import plot

IT2_DIR = "dataset/results/it2"
IT3_DIR = "dataset/results/it3"

# A correlation counts as significant when its p-value is at or below this.
SIGNIFICANCE_LEVEL = 0.05

files = [p for p in os.listdir(IT2_DIR) if p.endswith(".csv")]

STR = "\t\t{:0.2f}\n\t\tP-Value:{:0.2f}"

# (label used in the report, first column, second column)
_COLUMN_PAIRS = (
    ("Energy Consumption and Productions", "EnergyConsumption", "Productions"),
    ("Energy Consumption and Stops", "EnergyConsumption", "Fermate"),
    ("Productions and Stops", "Productions", "Fermate"),
)


def is_correlation_usable(data, machineId=None):
    """Report the pairwise Pearson correlations and tell if any is significant.

    Rows containing a NaN in any column are dropped first. For each pair
    among ``EnergyConsumption``, ``Productions`` and ``Fermate`` the
    coefficient and its p-value are printed, followed by ``Significant``
    when the p-value is at or below ``SIGNIFICANCE_LEVEL``.

    Args:
        data: DataFrame holding the three columns named above.
        machineId: Identifier printed in the report header. It is an
            explicit argument so the header never depends on a notebook
            global; when omitted, no header is printed.

    Returns:
        True when at least one of the three correlations is significant.
        False otherwise, including when fewer than two complete rows
        remain (``pearsonr`` cannot be computed on them).
    """
    computing_data = data.dropna()

    if machineId is not None:
        print(f"Machine {machineId}")

    # pearsonr raises ValueError on fewer than two observations.
    if len(computing_data) < 2:
        print("\tNot enough complete rows to compute a correlation")
        return False

    usable = False
    for label, first, second in _COLUMN_PAIRS:
        corr, pvalue = stats.pearsonr(computing_data[first], computing_data[second])
        print(f"\tCorrelation between {label}")
        print(STR.format(corr, pvalue))
        # A NaN p-value (constant column) compares False and is not counted.
        if pvalue <= SIGNIFICANCE_LEVEL:
            print("\t\tSignificant")
            usable = True

    return usable


# (machineId, dataset) for every per-machine file; names look like id-<machineId>.csv
dfs = [
    (int(file.replace(".csv", "").split("-")[1]), getCleanDataset(f"{IT2_DIR}/{file}"))
    for file in files
]

usable_datasets = [df for df in dfs if is_correlation_usable(df[1], df[0])]

print("Usable datasets: ", len(usable_datasets))

os.makedirs(IT3_DIR, exist_ok=True)

tot = 0
for machineId, d in usable_datasets:
    d.to_csv(f"{IT3_DIR}/id-{machineId}.csv", mode="w", index=False)
    tot += d.shape[0]
    print("- Machine", machineId)
print("Total:", tot)

for machineId, d in usable_datasets:
    plot(d, machineId)

Now I want to use a linear model on the machines' lifetime data to try to predict their stops (Fermate) from EnergyConsumption and Productions.

In [None]:
import os
from sklearn import datasets, linear_model
from sklearn.utils import shuffle
import pandas as pd


IT3_DIR = "dataset/results/it3"
FEATURES = ["EnergyConsumption", "Productions"]
TARGET = "Fermate"
N_RUNS = 1000          # number of random train/test splits to average over
TRAIN_FRACTION = 0.6   # share of the rows used for fitting in each split

files = [p for p in os.listdir(IT3_DIR) if p.endswith(".csv")]

# Read every per-machine file (id-<machineId>.csv) and tag its rows with the
# machine id. The frames are concatenated once rather than inside the loop.
frames = []
for file in files:
    data = pd.read_csv(f"{IT3_DIR}/{file}")
    data["MachineId"] = int(file.replace(".csv", "").split("-")[1])
    frames.append(data)
full_data = pd.concat(frames) if frames else pd.DataFrame()

# Dropping incomplete rows does not depend on the shuffle, so do it once.
clean_data = full_data.dropna()

# The split is sized on the number of ROWS, not on the number of files.
tot = len(clean_data)
train_len = int(tot * TRAIN_FRACTION)
if train_len == 0 or train_len == tot:
    raise ValueError(
        f"Not enough complete rows ({tot}) in {IT3_DIR} for a train/test split"
    )

total = 0

for i in range(N_RUNS):
    shuffled = shuffle(clean_data)
    train_data = shuffled[:train_len]
    # The test rows are the ones NOT used for training; slicing from the
    # start again would evaluate the model on rows it has already seen.
    test_data = shuffled[train_len:]

    model = linear_model.LinearRegression()
    model.fit(train_data[FEATURES], train_data[TARGET])

    # Evaluate the model on the held-out rows (mean squared error).
    prediction = model.predict(test_data[FEATURES])
    res = ((prediction - test_data[TARGET]) ** 2).mean()

    total += res

print("MeanSquareError:", total / N_RUNS)

In [44]:
import os
from sklearn import datasets, linear_model
from sklearn.utils import shuffle
import pandas as pd


TRAIN_DIR = "dataset/results/it3"
TEST_DIR = "dataset/results/it1"
FEATURES = ["EnergyConsumption", "Productions"]
TARGET = "Fermate"


def _machine_id(filename):
    """Return the machine id encoded in a result file name.

    Handles both ``id-<machineId>.csv`` and
    ``id-<machineId>_<year>-<month>.csv``. The part after the first ``_`` is
    discarded before converting: ``int("306_2023")`` does not fail, because
    ``int()`` accepts underscores as digit separators, and would silently
    yield 3062023.
    """
    return int(filename.replace(".csv", "").split("_")[0].split("-")[1])


def _load_results(directory, filenames):
    """Read the given CSV files and return them as one frame with a MachineId column."""
    frames = []
    for file in filenames:
        data = pd.read_csv(f"{directory}/{file}")
        data["MachineId"] = _machine_id(file)
        frames.append(data)
    # Concatenate once; an empty directory yields an empty frame.
    return pd.concat(frames) if frames else pd.DataFrame()


train_files = [p for p in os.listdir(TRAIN_DIR) if p.endswith(".csv")]
test_files = [p for p in os.listdir(TEST_DIR) if p.endswith(".csv")]

train_data = _load_results(TRAIN_DIR, train_files).dropna()
test_data = _load_results(TEST_DIR, test_files).dropna()

# NOTE(review): the it3 files are produced from the it1 files (via it2) by
# the earlier cells of this notebook, so the test rows very likely include
# the training rows. Treat the score as optimistic -- confirm.

# The data and the model are fixed, so a single fit gives the same score as
# averaging many identical refits (up to float rounding in the last digits).
model = linear_model.LinearRegression()
model.fit(train_data[FEATURES], train_data[TARGET])

# Evaluate the model on the test data (mean squared error).
prediction = model.predict(test_data[FEATURES])
res = ((prediction - test_data[TARGET]) ** 2).mean()

print("MeanSquareError:", res)

MeanSquareError: 2.1524654335599256
