In [None]:
# Mount Google Drive at /content/drive; every data path used below lives under
# /content/drive/MyDrive, so this cell has to run first.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
from datetime import datetime
import numpy as np
import os
import matplotlib.pyplot as plt
import matplotlib
%matplotlib inline

In [None]:
# Folder that is scanned for variation CSV files.
root_dir="/content/drive/MyDrive/RCA/variations"

# Gather the full path of every entry directly inside root_dir whose name ends in ".csv".
tot_var_files=[]
for i in os.listdir(root_dir):
  if not i.endswith(".csv"):
    continue
  tot_var_files.append(root_dir+"/"+i)

print(len(tot_var_files))

180


In [None]:
# Sensor name -> name of the CSV column that holds that sensor's fault label.
fault_names={
    'Battery':'BFaults',
    'Internal Temperature':'ITFaults',
    'Light Intensity':'LIA1Faults',
    'Humidity':'HFaults',
    'Irometer':'IROFaults',
    'Temperature':'TFaults',
    'Soil Temperature':'STFaults',
    'Signal Strength':'SSFaults',
}
# Sensor name -> value of the fault-label column that selects a row for variation analysis.
variation_names={
    'Battery':'BVAR',
    'Internal Temperature':'ITVAR',
    'Light Intensity':'LIA1VAR',
    'Humidity':'HVAR',
    'Irometer':'IROVAR',
    'Temperature':'TVAR',
    'Soil Temperature':'STVAR',
    'Signal Strength':'SSVAR',
}

In [None]:
# Names of the per-sensor raw label columns ("<sensor> VAR") that are removed
# from the analysis result once they have been one-hot encoded.
variation_column_names=[
    'Battery VAR',
    'Internal Temperature VAR',
    'Light Intensity VAR',
    'Humidity VAR',
    'Irometer VAR',
    'Temperature VAR',
    'Soil Temperature VAR',
    'Signal Strength VAR',
]

In [None]:
def check_wrong_data_points(x,df_,vals,sensor):
  """Return ``x`` when it lies inside ``[vals[0], vals[1]]``, otherwise ``None``.

  Both bounds are inclusive. A value for which neither comparison is true
  (for example NaN) is returned unchanged. ``df_`` and ``sensor`` are accepted
  but not used.
  """
  too_low=x<vals[0]
  too_high=x>vals[1]
  return None if (too_low|too_high) else x

def absolute_val(x):
  """Return the magnitude of ``x``: negated when ``x`` is below zero, else ``x`` itself."""
  return x*(-1) if x<0 else x


In [None]:
#Automating variation analysis
def automated_variation_analysis(file,fault_names,variation_names,
                                 devices_dir="/content/drive/MyDrive/RCA/devices/",
                                 window_size=5):
  """Label windowed sensor variation for one device file and one-hot encode it.

  For every column of ``file`` that appears as a value in ``fault_names`` the
  rows whose label equals ``variation_names[sensor]`` are selected, readings
  outside the sensor's valid range (or missing) are discarded, and the rows
  that remain are cut into consecutive windows of ``window_size`` rows. The
  first reading of each window is compared with the first reading of the next
  window, and every row of the window receives one label:

  * ``"<sensor> SAME_VAR"``   - absolute difference is exactly 0
  * ``"<sensor> HIGH_VAR"``   - difference is above half of the valid range
  * ``"<sensor> LOW_VAR"``    - difference is below half of the valid range
  * ``"<sensor> NORMAL_VAR"`` - difference equals half of the valid range

  Labels are attached to the rows of ``file`` by timestamp and then one-hot
  encoded; rows without a label are flagged in a ``"<sensor> NAN"`` column.

  Parameters
  ----------
  file : str
      Path of the variation CSV. It must contain a ``time`` column.
  fault_names : dict
      Sensor name -> name of the column holding that sensor's fault label.
  variation_names : dict
      Sensor name -> label value that selects a row for the analysis.
  devices_dir : str, optional
      Folder holding the device CSV with the same base name as ``file``;
      a sensor is only analysed when it is a column of that CSV.
  window_size : int, optional
      Number of consecutive rows per window; must be a positive integer.

  Returns
  -------
  pandas.DataFrame or None
      ``time`` followed by the indicator columns, or ``None`` when one of the
      two CSV files is empty (``pandas.errors.EmptyDataError``).
  """
  # Valid reading range per sensor; anything outside is treated as a wrong data point.
  sensor_limits={
      'Battery':{'min':260,'max':314},
      'Internal Temperature':{'min':-55,'max':125},
      'Light Intensity':{'min':0,'max':65535},
      'Humidity':{'min':0,'max':110},
      'Irometer':{'min':0,'max':200},
      'Temperature':{'min':-40,'max':125},
      'Soil Temperature':{'min':-55,'max':125},
      'Signal Strength':{'min':2,'max':30},
  }

  try:
    df_=pd.read_csv(file)
    o=pd.read_csv(os.path.join(devices_dir,os.path.basename(str(file))))
  except pd.errors.EmptyDataError:
    # An empty CSV cannot be analysed; callers treat None as "skip this file".
    return None

  final=df_[['time']].copy()
  final['time']=pd.to_datetime(final['time'])

  # Reverse lookup: fault-label column -> sensor name.
  sensor_by_fault_col={fault_col:name for name,fault_col in fault_names.items()}

  for col in df_.columns:
    if col not in sensor_by_fault_col:
      continue
    sensor=sensor_by_fault_col[col]
    print("column name:",col)
    print("sensor name:",sensor)

    # Without readings, a device column or a known valid range there is
    # nothing to compare, so skip the sensor instead of failing mid-way.
    if sensor not in df_.columns or sensor not in o.columns or sensor not in sensor_limits:
      print("skipping "+sensor+": readings or valid range not available")
      continue

    vals=[sensor_limits[sensor]['min'],sensor_limits[sensor]['max']]
    print("vals:",vals)

    # Copy the selection so the assignments below never touch a view of df_.
    readings=df_.loc[df_[col]==variation_names[sensor],['time',sensor]].copy()
    readings['time']=pd.to_datetime(readings['time'])

    # Keep only readings inside the inclusive valid range (drops NaN as well).
    in_range=readings[sensor].between(vals[0],vals[1])
    new=readings[in_range].reset_index(drop=True)
    if new.empty:
      continue

    tot_length=len(new)
    print("tot_length:",tot_length)

    # First reading of every window versus the first reading of the next one.
    window_first=new[sensor].iloc[::window_size]
    # NOTE(review): the last window has no successor; the missing "next"
    # reading is treated as 0.0, so its difference equals its own first
    # reading. Confirm this is the intended label for the final window.
    following=window_first.shift(-1).fillna(0.0)
    diff=(window_first-following).abs().to_numpy()

    sensor_range=vals[1]-vals[0]
    const_val=0.5*sensor_range

    # np.select takes the first matching condition, so the exact-zero test has
    # to come before the "< const_val" test or SAME_VAR could never be chosen.
    conditions=[diff==0,diff>const_val,diff<const_val,diff==const_val]
    choices=[sensor+' SAME_VAR',sensor+' HIGH_VAR',sensor+' LOW_VAR',sensor+' NORMAL_VAR']
    window_labels=np.select(conditions,choices,default=None)

    # Give every row the label of its own window, including a shorter last window.
    window_of_row=np.arange(tot_length)//window_size
    new['VAR']=window_labels[window_of_row]

    print(sensor+" new columns:",new.columns)

    # Attach labels to the full time line by timestamp; when a timestamp
    # occurs more than once the first labelled row wins.
    label_by_time=(new.dropna(subset=['time'])
                      .drop_duplicates(subset='time')
                      .set_index('time')['VAR'])
    final[sensor+' VAR']=final['time'].map(label_by_time)

  # One-hot encode each raw "<sensor> VAR" column (NaN becomes "<sensor> NAN")
  # and keep only 'time' plus the indicator columns.
  encoded=[final[['time']]]
  for label_col in list(final.columns[1:]):
    dummies=pd.get_dummies(final[label_col],dummy_na=True)
    sensor_name=label_col.rsplit(' ', 1)[0]
    dummies=dummies.rename(columns=str).rename(columns={'nan':sensor_name+" NAN"})
    encoded.append(dummies)

  final=pd.concat(encoded,axis='columns')
  print("final:\n",final)
  return final
 

In [None]:
# Analyse every collected variation file and save each non-empty result under
# the same file name in the new_device_variations folder.
for i,var_file in enumerate(tot_var_files):
  print("i:\n",i)
  f_name=os.path.basename('{}'.format(var_file))
  data_frame=automated_variation_analysis(var_file,fault_names,variation_names)
  if data_frame is not None:
    data_frame.to_csv('/content/drive/MyDrive/RCA/new_device_variations/'+f_name,index=False)

In [None]:
# f_name=os.path.basename('{}'.format(tot_var_files[28]))
# data_frame=automated_variation_analysis(tot_var_files[28],fault_names,variation_names)
# data_frame.to_csv('/content/drive/MyDrive/RCA/device_variations/'+f_name,index=False)

In [None]:
# Sensor names, in the same order as the keys of the fault/variation lookups.
sensors=[
    'Battery',
    'Internal Temperature',
    'Light Intensity',
    'Humidity',
    'Irometer',
    'Temperature',
    'Soil Temperature',
    'Signal Strength',
]

In [None]:
# Folder that is scanned for fault CSV files.
root_dir="/content/drive/MyDrive/RCA/faults"

tot_fault_files=[]   # every fault CSV found in root_dir
var_fault_files=[]   # fault CSVs that also have a variation file of the same name
indx=0               # running count of directory entries (CSV or not), printed with each merge
for i in os.listdir(root_dir):
  indx+=1
  if not i.endswith(".csv"):
    continue
  f_basename=os.path.basename('{}'.format(i))
  filepath='/content/drive/MyDrive/RCA/new_device_variations/'+f_basename
  tot_fault_files.append(root_dir+"/"+i)
  if not os.path.exists(filepath):
    continue
  var_fault_files.append(root_dir+"/"+i)

  try:
    ff=pd.read_csv(root_dir+"/"+i)
    vf=pd.read_csv(filepath)
  except pd.errors.EmptyDataError:
    # Nothing to merge for an empty file; report it instead of skipping silently.
    print("Skipping "+f_basename+": empty CSV")
    continue

  # Parse both time columns so the lookup compares timestamps with timestamps.
  ff['time']=pd.to_datetime(ff['time'])
  vf['time']=pd.to_datetime(vf['time'])

  # One row per timestamp (first occurrence wins), indexed by time for lookup.
  vf_by_time=vf.dropna(subset=['time']).drop_duplicates(subset='time').set_index('time')
  matched=ff['time'].isin(vf_by_time.index)

  # Copy every variation column onto the fault rows that share a timestamp;
  # rows without a match keep whatever they had (NaN for a new column).
  # vf.columns[1:] relies on 'time' being the first column of the variation file.
  if matched.any():
    for k in vf.columns[1:]:
      ff.loc[matched,k]=ff.loc[matched,'time'].map(vf_by_time[k])

  print("Successffully merged both the files for "+f_basename,indx)
  # NOTE(review): written without index=False, so the row index becomes an
  # extra unnamed first column - confirm downstream readers expect that.
  ff.to_csv('/content/drive/MyDrive/RCA/new_merged_devices/'+f_basename)


print("total variation fault files: ",len(var_fault_files))
print("total fault files: ",len(tot_fault_files))