In [13]:
import pandas as pd
import numpy as np
import matplotlib as plt
from sklearn.preprocessing import MinMaxScaler
import torch

def get_data(data_dir="data"):
    """Load the fault log and the latent mean/variance tables and merge them.

    Reads ``faults.pkl``, ``output_latent_means.csv`` and
    ``output_latent_variances.csv`` from ``data_dir``, joins them on
    ``('date', 'system')`` and derives a boolean ``error`` column from the
    presence of an ``errorcodes`` entry.

    Parameters
    ----------
    data_dir : str or path-like, default "data"
        Directory that contains the three input files.

    Returns
    -------
    pandas.DataFrame
        The merged frame without the ``index_x``/``index_y``/``errorcodes``
        helper columns. ``error`` is True for rows that had a non-null
        ``errorcodes`` value after the join.

    Notes
    -----
    * The two latent tables are combined with an *inner* join; only the join
      against the fault table is a full outer join. Fault rows without a
      latent match therefore survive with NaN latent values.
    * Assumes the first three CSV columns are (row index, date, system) and
      that both CSVs have the same number of columns -- TODO confirm against
      the code that writes them.
    """
    # NOTE(security): unpickling can execute arbitrary code; only load a
    # faults.pkl that comes from a trusted source.
    faults_df = pd.read_pickle(f"{data_dir}/faults.pkl")

    # Read in the latent variables.
    key_columns = ["index", "date", "system"]
    latent_mean = pd.read_csv(f"{data_dir}/output_latent_means.csv")
    latent_var = pd.read_csv(f"{data_dir}/output_latent_variances.csv")

    # Rename the columns: the first three become the key columns, the rest
    # get a mean_/var_ prefix with a 1-based running number.
    num_latent = latent_var.shape[1] - len(key_columns)
    latent_mean.columns = key_columns + [f"mean_{i + 1}" for i in range(num_latent)]
    latent_var.columns = key_columns + [f"var_{i + 1}" for i in range(num_latent)]

    faults_df = faults_df.rename(columns={"day": "date"})
    # int() already ignores leading zeros ("007" -> 7). Stripping them first
    # would turn an all-zero id such as "0" into "" and make the cast fail.
    faults_df["system"] = faults_df["system"].astype(int)

    # Inner join of the two latent tables, then full outer join with the faults.
    latent_merge = pd.merge(latent_mean, latent_var, on=["date", "system"])
    latent_merge_full = pd.merge(
        latent_merge, faults_df, on=["date", "system"], how="outer"
    )

    # Boolean flag: was an error code recorded for this (date, system)?
    latent_merge_full["error"] = latent_merge_full["errorcodes"].notna()

    return latent_merge_full.drop(columns=["index_x", "index_y", "errorcodes"])

In [14]:
# Load and merge the latent variables with the fault log (see get_data);
# the result is kept in `test` and inspected in a later cell.
test = get_data()

In [None]:
def dataset_info(dataset, full_dataset, name):
    """Print size and class-balance statistics for one partition of the data.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Partition to describe; must have an ``error`` column whose truthy
        values mark error rows.
    full_dataset : sized
        The complete data set; only its length is used, to compute the
        share that ``dataset`` represents.
    name : str
        Label printed in the first line.

    Returns
    -------
    None
        The statistics are printed, not returned.

    Notes
    -----
    Original author's remark: "not accurate for gaussian data" (what that
    data looks like is not visible from this cell -- TODO confirm).
    """
    total_length = len(dataset)

    # Count by value rather than by position in value_counts(): positions are
    # ordered by frequency and a class may be missing entirely.
    num_errors = int(dataset['error'].astype(bool).sum())
    num_non_errors = total_length - num_errors

    # Guard the ratios so an empty partition / full set prints 0% instead of raising.
    balance = num_non_errors / total_length if total_length else 0.0
    full_length = len(full_dataset)
    anteil = total_length / full_length if full_length else 0.0

    print(f"Dataset: {name}")
    print(f"Partition: {anteil:.2%}")
    print(f"Total Length: {total_length}")
    print(f"Number of Errors: {num_errors}")
    print(f"Number of Non-errors: {num_non_errors}")
    print(f"Percent of non-errors: {balance:.2%}\n")

In [16]:
# Preview only the latent feature columns: the positional slice 2:-1 skips the
# first two columns and the last one (the displayed result runs mean_1 .. var_128).
test.iloc[:, 2:-1]

Unnamed: 0,mean_1,mean_2,mean_3,mean_4,mean_5,mean_6,mean_7,mean_8,mean_9,mean_10,...,var_119,var_120,var_121,var_122,var_123,var_124,var_125,var_126,var_127,var_128
0,0.550808,-1.590361,-2.522740,-0.562001,-0.016431,-1.342680,-1.199262,-2.342837,0.964393,-0.686327,...,0.048792,0.181649,0.104474,0.076905,0.082268,0.145806,0.061905,0.026486,0.084393,0.075032
1,0.655300,2.174991,-1.804639,1.207815,-4.427425,-1.171412,-0.296953,-1.457960,1.533657,0.001986,...,0.080019,0.253188,0.091064,0.081610,0.064597,0.151541,0.265030,0.043913,0.178388,0.045196
2,1.498378,-0.735744,-1.966931,0.939845,-3.146869,-1.699988,-0.969494,-2.150726,1.409788,-0.469866,...,0.055186,0.212213,0.087382,0.080591,0.079010,0.166678,0.103903,0.035413,0.096965,0.066793
3,-0.174130,1.155504,-2.495760,-0.396332,-3.151295,-0.790262,0.845975,-2.007940,1.357967,0.573552,...,0.050461,0.229905,0.082022,0.066515,0.080518,0.140733,0.156672,0.030553,0.112310,0.067935
4,2.462357,1.592187,-3.181734,1.024102,-3.737419,-0.726139,-1.014740,-1.515093,2.216331,0.480475,...,0.076062,0.261602,0.093062,0.088838,0.086154,0.202874,0.171623,0.042859,0.150567,0.050069
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37819,2.739179,0.979631,-3.425041,0.194633,2.062431,1.908500,-4.326291,0.386773,1.373929,-1.787411,...,0.174327,0.424814,0.270994,0.276911,0.197649,0.311280,0.475437,0.047348,0.164252,0.040325
37820,2.033061,1.172527,-3.435138,-1.027316,3.083331,2.011757,-1.935661,-0.537301,0.809522,-1.270876,...,0.289755,0.618372,0.452047,0.471611,0.348698,0.422955,0.975882,0.114996,0.510274,0.067715
37821,1.548666,0.467803,-3.498407,-0.436455,3.126747,1.784789,-2.800200,-0.062768,0.808315,-2.614346,...,0.130752,0.443470,0.245124,0.254351,0.170953,0.209275,0.516431,0.036685,0.146380,0.029629
37822,1.536273,0.774338,-3.114207,-0.018682,2.500621,1.129578,-2.257103,-0.104911,0.556936,-2.741172,...,0.122276,0.408938,0.210588,0.233246,0.142598,0.167750,0.485937,0.033507,0.124207,0.026270
