**Sync With Drive**

In [1]:
# Mount Google Drive at /content/drive; the CSV paths used later in this
# notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import necessary libraries

In [2]:
import numpy as np
import pandas as pd

import dask.dataframe as dd
import dask.array as da
import dask.bag as db

# **1. Import the main csv file sent by embedded team**

In [3]:
# Read the sensor export from Drive as a dask DataFrame. deviceId is forced to
# object dtype (presumably so the long numeric IDs are kept as text — confirm).
ddf = dd.read_csv("/content/drive/MyDrive/Senzmate/RCA/jsonoutput3.csv", dtype={'deviceId': 'object'})

In [4]:
# Preview the first rows of the raw frame to check the parsed columns.
ddf.head()

Unnamed: 0.1,Unnamed: 0,_id,_class,deviceId,number,code,time,value,creationDate,modifiedDate
0,0,61cf998d2d3b872d3baa528e,com.magma.core.data.entity.Sensor,869170034808734,0,T,2022-01-01T00:00:13.521Z,21.14,2022-01-01T00:00:13.522Z,2022-01-01T00:00:13.522Z
1,1,61cf998d2d3b872d3baa528f,com.magma.core.data.entity.Sensor,869170034808734,1,H,2022-01-01T00:00:13.521Z,103.10,2022-01-01T00:00:13.523Z,2022-01-01T00:00:13.523Z
2,2,61cf998d2d3b872d3baa5290,com.magma.core.data.entity.Sensor,869170034808734,2,MEA4,2022-01-01T00:00:13.521Z,545.00/163.00,2022-01-01T00:00:13.524Z,2022-01-01T00:00:13.524Z
3,3,61cf998d2d3b872d3baa5291,com.magma.core.data.entity.Sensor,869170034808734,3,B,2022-01-01T00:00:13.521Z,266,2022-01-01T00:00:13.524Z,2022-01-01T00:00:13.524Z
4,4,61cf99902d3b872d3baa5297,com.magma.core.data.entity.Sensor,869170034809062,0,T,2022-01-01T00:00:16.445Z,13.44,2022-01-01T00:00:16.445Z,2022-01-01T00:00:16.445Z


# **Recreating a CSV by adding error flags directly to main csv**

Here the devices are not grouped. If a data point is an error, its error flag is 1; otherwise it is 0. (For example, if the data point is an outlier, then outlier flag = 1.)

**remove unwanted column**

In [5]:
# Keep only deviceId, code, time and value; the dropped columns are not used
# anywhere below.
# NOTE(review): ddf is rebound to the trimmed frame, so re-running this cell
# without re-running the read_csv cell presumably fails because the columns are
# already gone — confirm.
ddf = ddf.drop(columns=['Unnamed: 0', '_id', '_class', 'number', 'creationDate', 'modifiedDate'])
ddf.head()

Unnamed: 0,deviceId,code,time,value
0,869170034808734,T,2022-01-01T00:00:13.521Z,21.14
1,869170034808734,H,2022-01-01T00:00:13.521Z,103.10
2,869170034808734,MEA4,2022-01-01T00:00:13.521Z,545.00/163.00
3,869170034808734,B,2022-01-01T00:00:13.521Z,266
4,869170034809062,T,2022-01-01T00:00:16.445Z,13.44


**Remove unwanted sensor data**

In [6]:
# Sensor codes that take part in the analysis; rows with any other code
# (e.g. MEA4 in the preview above) are filtered out.
needed_values = ["B", "IT", "LIA1", "H", "IRO", "T", "ST", "SS"]
# Keep the rows whose code is in needed_values and discard the old index.
ddf2 = ddf.loc[ddf['code'].isin(needed_values)].reset_index(drop=True)

In [7]:
# Preview the filtered frame (only the needed sensor codes remain).
ddf2.head()

Unnamed: 0,deviceId,code,time,value
0,869170034808734,T,2022-01-01T00:00:13.521Z,21.14
1,869170034808734,H,2022-01-01T00:00:13.521Z,103.1
2,869170034808734,B,2022-01-01T00:00:13.521Z,266.0
3,869170034809062,T,2022-01-01T00:00:16.445Z,13.44
4,869170034809062,H,2022-01-01T00:00:16.445Z,98.55


**outlier detection**

In [8]:
# Inclusive valid range for each sensor code. A reading outside its range, or a
# reading that cannot be parsed as a number, is treated as an outlier.
VALUES_RANGE = {
    'B': {"min": 260, "max": 314},
    'IT': {"min": -55, "max": 125},
    'LIA1': {"min": 0, "max": 65535},
    'H': {"min": 0, "max": 110},
    'IRO': {"min": 0, "max": 200},
    'T': {"min": -40, "max": 125},
    'ST': {"min": -55, "max": 125},
    'SS': {"min": 2, "max": 30},
}


def detect_outlier(row):
  """Return 0 when the row's value lies inside the valid range of its code, else 1.

  Parameters
  ----------
  row : object with ``code`` and ``value`` attributes (e.g. a DataFrame row)
      ``code`` selects the entry of VALUES_RANGE; ``value`` is converted with
      ``float()`` before the comparison.

  Returns
  -------
  int
      0 if ``min <= float(value) <= max`` for the row's code (bounds inclusive).
      1 if the value is out of range, is NaN, cannot be converted to a float
      (e.g. ``"545.00/163.00"`` or ``None``), or the code has no entry in
      VALUES_RANGE.
  """
  try:
    limits = VALUES_RANGE[row.code]
    reading = float(row.value)
  except (KeyError, TypeError, ValueError, OverflowError):
    # Unknown code or unparseable value: flag it. Only these expected failures
    # are caught so that KeyboardInterrupt/SystemExit and genuine programming
    # errors are no longer silently swallowed.
    return 1
  # NaN fails both comparisons and is therefore flagged as an outlier.
  return 0 if limits['min'] <= reading <= limits['max'] else 1

In [9]:
# Flag every row with detect_outlier (0 = value inside the valid range, 1 = not).
# The function is passed directly instead of through a pass-through lambda, and
# meta is declared as a named int64 series: detect_outlier returns plain Python
# ints, which are normally stored as int64, so the previous int32 meta did not
# describe the data that is actually computed.
ddf2['outlier_data_flag'] = ddf2.apply(detect_outlier, axis=1, meta=('outlier_data_flag', 'int64'))

In [10]:
# Check the first rows now that outlier_data_flag has been added.
ddf2.head(5)

Unnamed: 0,deviceId,code,time,value,outlier_data_flag
0,869170034808734,T,2022-01-01T00:00:13.521Z,21.14,0
1,869170034808734,H,2022-01-01T00:00:13.521Z,103.1,0
2,869170034808734,B,2022-01-01T00:00:13.521Z,266.0,0
3,869170034809062,T,2022-01-01T00:00:16.445Z,13.44,0
4,869170034809062,H,2022-01-01T00:00:16.445Z,98.55,0


In [11]:
# Check the last rows as well.
ddf2.tail(5)

Unnamed: 0,deviceId,code,time,value,outlier_data_flag
283293,8691700348087260,T,2022-07-01T00:00:00.000Z,18.15,0
283294,8691700348087260,H,2022-07-01T00:00:00.000Z,101.55,0
283295,8691700348087260,B,2022-07-01T00:00:00.000Z,273.0,0
283296,8691700348087260,IT,2022-07-01T00:00:00.000Z,43.0,0
283297,8691700348087260,ST,2022-07-01T00:00:00.000Z,0.0,0


**high volume check**

In [12]:
# Count how many readings share the same (deviceId, code, time) key and attach
# that count to every row as `high_volume`.
#
# Previously the groupby result was computed but never assigned, so
# `high_volume` stayed at the constant 1 on every row and the flag derived from
# it could never become 1. The counts are now merged back onto the rows.
HIGH_VOLUME_KEYS = ['deviceId', 'code', 'time']

# Temporary constant column: counting it per key gives the group size.
ddf2['high_volume'] = 1
high_volume_counts = ddf2.groupby(HIGH_VOLUME_KEYS).high_volume.count().reset_index()

# Replace the constant column with the real per-key count. The left merge keeps
# every row of ddf2; rows whose key contains a missing value are not counted by
# groupby and therefore get NaN here.
# NOTE(review): on a frame with several partitions the merge may change row
# order — confirm that nothing downstream relies on the original order.
ddf2 = ddf2.drop(columns=['high_volume']).merge(high_volume_counts, on=HIGH_VOLUME_KEYS, how='left')

# Display the (lazy) per-key count frame, as the original cell did.
high_volume_counts

Unnamed: 0_level_0,deviceId,code,time,high_volume
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
,object,object,object,int64
,...,...,...,...


In [13]:
# Check the first rows now that the high_volume column exists.
ddf2.head(5)

Unnamed: 0,deviceId,code,time,value,outlier_data_flag,high_volume
0,869170034808734,T,2022-01-01T00:00:13.521Z,21.14,0,1
1,869170034808734,H,2022-01-01T00:00:13.521Z,103.1,0,1
2,869170034808734,B,2022-01-01T00:00:13.521Z,266.0,0,1
3,869170034809062,T,2022-01-01T00:00:16.445Z,13.44,0,1
4,869170034809062,H,2022-01-01T00:00:16.445Z,98.55,0,1


In [14]:
# ddf2.to_csv('/content/drive/MyDrive/Senzmate/RCA/result_1/export-*.csv', index=False)  

In [14]:
# high_volume_flag is 1 when high_volume is greater than 1, otherwise 0.
# A vectorised comparison replaces the row-by-row Python apply: it yields the
# same 0/1 values (a missing high_volume compares False and becomes 0) without
# calling a lambda per row, and the column dtype is now really int32.
ddf2['high_volume_flag'] = (ddf2['high_volume'] > 1).astype('int32')
ddf2.head(5)

Unnamed: 0,deviceId,code,time,value,outlier_data_flag,high_volume,high_volume_flag
0,869170034808734,T,2022-01-01T00:00:13.521Z,21.14,0,1,0
1,869170034808734,H,2022-01-01T00:00:13.521Z,103.1,0,1,0
2,869170034808734,B,2022-01-01T00:00:13.521Z,266.0,0,1,0
3,869170034809062,T,2022-01-01T00:00:16.445Z,13.44,0,1,0
4,869170034809062,H,2022-01-01T00:00:16.445Z,98.55,0,1,0


In [16]:
# ddf2.to_csv('/content/drive/MyDrive/Senzmate/RCA/result_2/export-*.csv') 

In [17]:
# ddf2['high_volume'] = ddf2.groupby(['deviceId', 'code', 'time'])['value'].transform('size')
# ddf2['high_volume_flag'] = ddf2.apply(lambda row: 1 if row.high_volume > 1 else 0, axis=1, meta=pd.Series(dtype="int64"))

In [15]:
# Preview the frame again before the missing-data step.
ddf2.head(5)

Unnamed: 0,deviceId,code,time,value,outlier_data_flag,high_volume,high_volume_flag
0,869170034808734,T,2022-01-01T00:00:13.521Z,21.14,0,1,0
1,869170034808734,H,2022-01-01T00:00:13.521Z,103.1,0,1,0
2,869170034808734,B,2022-01-01T00:00:13.521Z,266.0,0,1,0
3,869170034809062,T,2022-01-01T00:00:16.445Z,13.44,0,1,0
4,869170034809062,H,2022-01-01T00:00:16.445Z,98.55,0,1,0


In [16]:
# Every row that exists in the data starts with miss_data_flag = 0.
ddf2['miss_data_flag'] = 0
# head(0) returns no rows; it is used here only to show the column layout.
ddf2.head(0)

Unnamed: 0,deviceId,code,time,value,outlier_data_flag,high_volume,high_volume_flag,miss_data_flag


In [20]:
# grp_deviceId_time = ddf2.groupby(['deviceId', 'time'])

In [21]:
# ddf2.groupby(['deviceId', 'time']).apply(func)

In [17]:
# Kept only because commented-out experiments in this notebook refer to it;
# my_function does not use it.
l1 = []


def my_function(group_df, grouped, required_codes=None):
  """Add one placeholder row for every required sensor code missing from a group.

  Intended for ``groupby(['deviceId', 'time']).apply(...)``: ``group_df`` holds
  the readings of one device at one timestamp.

  Parameters
  ----------
  group_df : pandas.DataFrame
      Rows of a single group. Must contain the columns ``deviceId``, ``code``
      and ``time``.
  grouped : any
      Unused. Kept so the existing call ``apply(my_function, grouped=grouped)``
      keeps working.
  required_codes : iterable of str, optional
      Sensor codes every group is expected to contain. Defaults to the
      notebook-level ``needed_values`` list.

  Returns
  -------
  pandas.DataFrame
      ``group_df`` itself when it is empty or nothing is missing. Otherwise a
      new frame: the original rows followed by one row per missing code, with
      the group's ``deviceId`` and ``time``, ``value`` = 0, all other flags = 0
      and ``miss_data_flag`` = 1. ``group_df`` is never modified.

  Notes
  -----
  The previous version called ``group_df.append(...)`` and discarded the
  result, so no row was ever added (and ``DataFrame.append`` no longer exists
  in pandas 2.x). It also wrote the placeholder 1 into ``deviceId`` and
  ``time``, which made the added rows impossible to attribute to a device.
  """
  if required_codes is None:
    required_codes = needed_values
  if len(group_df) == 0:
    # No rows means no device/time to attribute missing readings to.
    return group_df

  present = set(group_df['code'])
  absent = [sensor for sensor in required_codes if sensor not in present]
  if not absent:
    return group_df

  # Build all placeholder rows at once; the scalar entries are broadcast to the
  # length of `absent`.
  filler = pd.DataFrame({
      'deviceId': group_df['deviceId'].iloc[0],
      'code': absent,
      'time': group_df['time'].iloc[0],
      'value': 0,
      'outlier_data_flag': 0,
      'high_volume': 0,
      'high_volume_flag': 0,
      'miss_data_flag': 1,
  })
  return pd.concat([group_df, filler], ignore_index=True)

In [18]:
# Group the rows by device and timestamp, so each group holds the readings one
# device reported at one time.
grouped = ddf2.groupby(['deviceId', 'time'])

In [24]:
# print(grp_deviceId_time.grouper)

In [None]:
# meta_df = pd.DataFrame(columns=['outlier_data_flag', 'high_volume', 'high_volume_flag', 'miss_data_flag'], index=[0])
# Run my_function on every group and materialise the result with compute().
# The zero-row frame ddf2.head(0) is passed as meta to describe the output
# columns; the groupby object itself is forwarded as the `grouped` keyword.
new_grouped = grouped.apply(my_function, grouped=grouped, meta = ddf2.head(0)).compute()

In [None]:
# print(type(l1))
# dd.concat(l1)

In [None]:
# Write the computed result to a single CSV in the result_3 folder, without the
# index column.
new_grouped.to_csv('/content/drive/MyDrive/Senzmate/RCA/result_3/result_all.csv', index=False)
# new_grouped.to_csv('/content/drive/MyDrive/Senzmate/RCA/result_3/export-*.csv', index=False)

In [None]:
# Write the same frame to the RCA folder as well.
# NOTE(review): the last cell of this notebook writes to this exact path, so on
# a full run this file is overwritten by the later export.
new_grouped.to_csv('/content/drive/MyDrive/Senzmate/RCA/result_all.csv', index=False)

In [19]:
# Regroup by deviceId only. This rebinds `grouped`, replacing the
# (deviceId, time) grouping defined in an earlier cell.
grouped = ddf2.groupby('deviceId')

In [23]:
def my_function_2(group_df, required_codes=None):
  """Add placeholder rows for required sensor codes missing at each timestamp.

  Intended for ``groupby('deviceId').apply(...)``: ``group_df`` holds all
  readings of one device. The rows are split by ``(deviceId, time)``, and every
  required code absent from a split gets one placeholder row.

  Parameters
  ----------
  group_df : pandas.DataFrame
      Rows to check. Must contain the columns ``deviceId``, ``code`` and
      ``time``.
  required_codes : iterable of str, optional
      Sensor codes expected at every timestamp. Defaults to the notebook-level
      ``needed_values`` list.

  Returns
  -------
  pandas.DataFrame
      ``group_df`` itself when it is empty or nothing is missing. Otherwise a
      new frame: the original rows followed by the placeholder rows, each with
      the real ``deviceId`` and ``time``, ``value`` = 0, all other flags = 0
      and ``miss_data_flag`` = 1. ``group_df`` is never modified.

  Notes
  -----
  The previous version called ``time_df.append(...)`` and discarded the result,
  so the input was always returned unchanged (and ``DataFrame.append`` no
  longer exists in pandas 2.x). It also wrote the placeholder 1 into
  ``deviceId`` and used ``id`` as a loop variable, shadowing the builtin.
  """
  if required_codes is None:
    required_codes = needed_values
  if len(group_df) == 0:
    return group_df

  # Collect every placeholder as a plain record and build one frame at the end,
  # rather than growing a DataFrame row by row.
  records = []
  for (device_id, time_key), time_df in group_df.groupby(['deviceId', 'time']):
    present = set(time_df['code'])
    for sensor in required_codes:
      if sensor not in present:
        records.append({
            'deviceId': device_id,
            'code': sensor,
            'time': time_key,
            'value': 0,
            'outlier_data_flag': 0,
            'high_volume': 0,
            'high_volume_flag': 0,
            'miss_data_flag': 1,
        })

  if not records:
    return group_df
  return pd.concat([group_df, pd.DataFrame(records)], ignore_index=True)

In [None]:
# Run my_function_2 on every group of `grouped` and materialise the result with
# compute(); ddf2.head(0) is passed as meta to describe the output columns.
# This rebinds new_grouped, replacing the result of the earlier my_function run.
new_grouped = grouped.apply(my_function_2, meta = ddf2.head(0)).compute()

In [None]:
# Write the final result to the RCA folder without the index column.
# NOTE(review): an earlier cell writes to this same path; this export
# overwrites that file.
new_grouped.to_csv('/content/drive/MyDrive/Senzmate/RCA/result_all.csv', index=False)