In [25]:
import pandas as pd
import time
from os.path import join
import json 
import numpy as np 
from tqdm import tqdm
import seaborn as sns
from os import listdir, path
import matplotlib.pyplot as plt


In [44]:

def _prefixed_label_to_int(x, prefix):
    """Convert a label such as ``'<prefix>3'`` into the integer ``3``.

    Args:
        x: A label string, a missing value, or an already numeric value.
        prefix: Text removed from string labels before parsing.

    Returns:
        ``0`` when ``pd.isna(x)`` is true; otherwise ``int`` of the value.
        For strings, every occurrence of ``prefix`` is removed first.

    Raises:
        ValueError: If the text left after removing ``prefix`` is not an
            integer literal.
    """
    if pd.isna(x):
        return 0
    if isinstance(x, str):
        x = x.replace(prefix, '')
    # Non-string values go straight to int(), so applying the mapping to a
    # column that was already converted does not fail on a missing
    # ``str.replace``.
    return int(x)


def map_failure(x):
    """Map a failure label (e.g. ``'comp4'``) to its integer id; missing -> 0."""
    return _prefixed_label_to_int(x, 'comp')


def map_error(x):
    """Map an error label (e.g. ``'error3'``) to its integer id; missing -> 0."""
    return _prefixed_label_to_int(x, 'error')


In [27]:
# Directory holding the raw input tables (absolute path inside the dev container).
data_dir = '/workspaces/predictive_maintenance/data/raw'

# Full paths of the parquet files found directly inside data_dir.
# The directory is listed once, inside the comprehension.
files = [join(data_dir, f) for f in listdir(data_dir) if f.endswith('.parquet')]



In [28]:
# Create a dict whose key is the file name without its extension and whose
# value is the full path. basename/splitext keep dotted stems intact
# ('a.v1.parquet' -> 'a.v1'), so such files do not collide on one key.
files_dict = {path.splitext(path.basename(f))[0]: f for f in files}
files_dict

{'PdM_failures': '/workspaces/predictive_maintenance/data/raw/PdM_failures.parquet',
 'PdM_machines': '/workspaces/predictive_maintenance/data/raw/PdM_machines.parquet',
 'PdM_telemetry2': '/workspaces/predictive_maintenance/data/raw/PdM_telemetry2.parquet',
 'PdM_errors': '/workspaces/predictive_maintenance/data/raw/PdM_errors.parquet',
 'PdM_maint': '/workspaces/predictive_maintenance/data/raw/PdM_maint.parquet',
 'PdM_telemetry': '/workspaces/predictive_maintenance/data/raw/PdM_telemetry.parquet'}

In [29]:
# Load the raw tables used in this notebook; each frame is bound to
# df_<file stem>. Reads happen in the order the stems are listed.
# NOTE: files_dict also contains 'PdM_telemetry2', which is not loaded here.
(
    df_PdM_errors,
    df_PdM_maint,
    df_PdM_telemetry,
    df_PdM_failures,
    df_PdM_machines,
) = (
    pd.read_parquet(files_dict[stem])
    for stem in ('PdM_errors', 'PdM_maint', 'PdM_telemetry', 'PdM_failures', 'PdM_machines')
)
    

In [30]:
# Preview the first rows of every raw table, in the order they were loaded.
display(
    df_PdM_errors.head(),
    df_PdM_maint.head(),
    df_PdM_telemetry.head(),
    df_PdM_failures.head(),
    df_PdM_machines.head(),
)


Unnamed: 0,datetime,machineID,errorID
0,2015-01-03 07:00:00,1,error1
1,2015-01-03 20:00:00,1,error3
2,2015-01-04 06:00:00,1,error5
3,2015-01-10 15:00:00,1,error4
4,2015-01-22 10:00:00,1,error4


Unnamed: 0,datetime,machineID,comp
0,2014-06-01 06:00:00,1,comp2
1,2014-07-16 06:00:00,1,comp4
2,2014-07-31 06:00:00,1,comp3
3,2014-12-13 06:00:00,1,comp1
4,2015-01-05 06:00:00,1,comp4


Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511


Unnamed: 0,datetime,machineID,failure
0,2015-01-05 06:00:00,1,comp4
1,2015-03-06 06:00:00,1,comp1
2,2015-04-20 06:00:00,1,comp2
3,2015-06-19 06:00:00,1,comp4
4,2015-09-02 06:00:00,1,comp4


Unnamed: 0,machineID,model,age
0,1,model3,18
1,2,model4,7
2,3,model3,8
3,4,model3,7
4,5,model3,2


In [31]:
# Preview the telemetry table before it is joined with the failure records;
# only the first rows are shown rather than the whole frame.
df_PdM_telemetry.head()


In [32]:
# Attach the machine attributes to every failure record (left join on machineID).
df_failures_machines = df_PdM_failures.merge(df_PdM_machines, on='machineID', how='left')

# Encode the failed component as an integer; map_failure returns 0 for missing values.
df_failures_machines['failure'] = df_failures_machines['failure'].apply(map_failure)
# 1 when a component failed, 0 otherwise (vectorised instead of a per-row lambda).
df_failures_machines['failure_binary'] = (df_failures_machines['failure'] > 0).astype('int64')


# Left-join the failures onto the telemetry readings by (datetime, machineID);
# readings without a matching failure get a missing 'failure', encoded as 0 below.
# NOTE(review): a left join repeats a telemetry row once per matching failure
# record, so the result can have more rows than the telemetry table - confirm
# that this is intended.
df_failures_telemetry = df_PdM_telemetry.merge(df_PdM_failures, on=['datetime', 'machineID'], how='left')
df_failures_telemetry['failure'] = df_failures_telemetry['failure'].apply(map_failure)
df_failures_telemetry['failure_binary'] = (df_failures_telemetry['failure'] > 0).astype('int64')

In [47]:
# Merge the errors with df_PdM_failures. The outer join on (datetime, machineID)
# keeps error rows without a failure and failure rows without an error.
df_failures_errors = df_PdM_errors.merge(df_PdM_failures, on=['datetime', 'machineID'], how='outer')

# Encode labels as integers; the mapping helpers return 0 for missing values,
# which the outer join produces on whichever side had no match.
df_failures_errors['failure'] = df_failures_errors['failure'].apply(map_failure)
# 1 when a component failed, 0 otherwise (vectorised instead of a per-row lambda).
df_failures_errors['failure_binary'] = (df_failures_errors['failure'] > 0).astype('int64')
# Bracket assignment always targets the column; attribute assignment only does
# so when the column already exists.
df_failures_errors['errorID'] = df_failures_errors['errorID'].apply(map_error)



In [48]:
# Persist the three merged tables as interim parquet files, in a fixed order.
for _frame, _target in (
    (df_failures_telemetry, '/workspaces/predictive_maintenance/data/interim/failures_telemetry.parquet'),
    (df_failures_machines, '/workspaces/predictive_maintenance/data/interim/failures_machines.parquet'),
    (df_failures_errors, '/workspaces/predictive_maintenance/data/interim/failures_errors.parquet'),
):
    _frame.to_parquet(_target)


## Análisis Bivariado

In [49]:
# Reload the interim tables from disk so the analysis below starts from the
# persisted files rather than from the in-memory frames.
(
    df_failures_telemetry,
    df_failures_machines,
    df_failures_errors,
) = (
    pd.read_parquet(interim_path)
    for interim_path in (
        '/workspaces/predictive_maintenance/data/interim/failures_telemetry.parquet',
        '/workspaces/predictive_maintenance/data/interim/failures_machines.parquet',
        '/workspaces/predictive_maintenance/data/interim/failures_errors.parquet',
    )
)


In [53]:
# For each interim table: print its name and shape, then preview its first rows.
for _label, _frame in (
    ('df_failures_telemetry', df_failures_telemetry),
    ('df_failures_machines', df_failures_machines),
    ('df_failures_errors', df_failures_errors),
):
    print(_label, _frame.shape)
    display(_frame.head())


df_failures_telemetry (876142, 8)


Unnamed: 0,datetime,machineID,volt,rotate,pressure,vibration,failure,failure_binary
0,2015-01-01 06:00:00,1,176.217853,418.504078,113.077935,45.087686,0,0
1,2015-01-01 07:00:00,1,162.879223,402.74749,95.460525,43.413973,0,0
2,2015-01-01 08:00:00,1,170.989902,527.349825,75.237905,34.178847,0,0
3,2015-01-01 09:00:00,1,162.462833,346.149335,109.248561,41.122144,0,0
4,2015-01-01 10:00:00,1,157.610021,435.376873,111.886648,25.990511,0,0


df_failures_machines (761, 6)


Unnamed: 0,datetime,machineID,failure,model,age,failure_binary
0,2015-01-05 06:00:00,1,4,model3,18,1
1,2015-03-06 06:00:00,1,1,model3,18,1
2,2015-04-20 06:00:00,1,2,model3,18,1
3,2015-06-19 06:00:00,1,4,model3,18,1
4,2015-09-02 06:00:00,1,4,model3,18,1


df_failures_errors (4677, 5)


Unnamed: 0,datetime,machineID,errorID,failure,failure_binary
0,2015-01-03 07:00:00,1,1,0,0
1,2015-01-03 20:00:00,1,3,0,0
2,2015-01-04 06:00:00,1,5,0,0
3,2015-01-10 15:00:00,1,4,0,0
4,2015-01-22 10:00:00,1,4,0,0


Bad pipe message: %s [b'a\xaeC\x06-\x06\xcc0\n!\xc3\x96\xf0\xdf\x19\x82.\xd3 %\xf2h^\xd0\xc0[r\xde\xddV\xa2\xdc\xc2\x85\xde\xb7\xb4T\xe4\xde\xbf\x17\xed\xc0/\xc6i\x0b\xb8\x85>\x00\x08\x13\x02\x13\x03\x13\x01\x00\xff\x01\x00\x00\x8f\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00']
Bad pipe message: %s [b"#\xe9vs\x88\xe0f\x7f+\x00\x16(qu\x16\xc8+\xac\x00\x00\xf4\xc00\xc0,\xc0(\xc0$\xc0\x14\xc0\n\x00\xa5\x00\xa3\x00\xa1\x00\x9f\x00k\x00j\x00i\x00h\x009\x008\x007\x006\x00\x88\x00\x87\x00\x86\x00\x85\xc0\x19\x00\xa7\x00m\x00:\x00\x89\xc02\xc0.\xc0*\xc0&\xc0\x0f\xc0\x05\x00\x9d\x00=\x005\x00\x84\xc0/\xc0+\xc0'\xc0#\xc0\x13\xc0\t\x00\xa4\x00\xa2\x00\xa0\x00\x9e\x00g\x00@\x00?\x00>\x003\x002\x001\x000\x00\x9a\x00\x99\x00\x98\x00\x97\x00E\x00D\x00C\x00B\xc0\x18\x00\xa6\x00l\x004\x00\x9b\x00F\xc01\xc0-\xc0)\xc0%\xc0\x0e\xc0\x04\x00\x9c\x00<\x00/\x00\x96\x00A\x00\x07\xc0\x11\xc0\x07\xc0\x16\x00"]
