## Issues

### Issues with Timestamp

- 11:01:00 - 11:01:59: only the minute was recorded --> done
- 14:20:01 - 14:40:30: the cell format is incorrect, only numerical values of a certain format are stored 
- 16:00:00 - 17:00:00: recorded in 12-hour format (e.g. 4:00:00 PM) --> DONE
- 13:59:46: recorded twice --> done
- 12:08:00 - 12:08:59: every other time stamp is missing --> Done

### Issues with PVs

- 12:11:01 - 12:16:06: the unit of PT102/OUT.CV changed from barg to MPa (1 barg ≈ 0.1 MPa)
- 15:14:00 - 15:16:00: measurement error in FIC103/PID1/PV; FIC103/PID1/PV is different from FIC103/PID1/SP and FT103/OUT.CV.
- Missing values scattered in the data: connectivity errors, NaN, empty cells; some values can be inferred from related tags (e.g. a missing value in FT104/OUT.CV can be inferred from FIC104/PID1/PV) --> Done using interpolation. Maybe come back to this one
- Duplicated entries: "PT101/OUT.CV" at column E and "Pressure measure" at column Q

In [1]:
# Load dirty_data.csv through the project's Dataset class. Despite the df_ prefix,
# df_dirty is a Dataset object; the underlying DataFrame is reached via df_dirty.df.
from pipeline.dataset import Dataset
df_dirty = Dataset('data/real/00-complete/dirty_data.csv')

In [2]:
# Infer the time format of the recording; the recorded output of this cell is '%H:%M:%S'.
df_dirty.infer_format()

'%H:%M:%S'

In [3]:
# Show the format string now stored on the Dataset (printed, so it appears without quotes).
print(df_dirty.time_format)

%H:%M:%S


In [4]:
# NOTE(review): presumably normalises the timestamps using df_dirty.time_format —
# confirm against pipeline.dataset, the implementation is not visible here.
df_dirty.apply_format()

In [5]:
# Timestamp clean-up step. Per the recorded output it lists the duplicated stamps
# ('11:01:00' and '0x0') and reports changing each of them.
df_dirty.clean_stamps()

['11:01:00' '0x0']
Changing duplicate timestamp 11:01:00
Changing duplicate timestamp 0x0


In [6]:
# Current working directory as a plain string; the export cells build their
# output paths from it by string concatenation.
import os
cwd = os.getcwd()
cwd

'F:\\time_series_pipeline'

In [96]:
# Export through the Dataset object itself.
# NOTE(review): this cell carries execution count 96 although it sits between cells 6 and 7,
# i.e. it was run out of order; on a fresh top-to-bottom run it executes before the
# datetime conversion below — confirm which state output2.csv is meant to capture.
df_dirty.to_csv(cwd+'/output2.csv', index=False)
#df_dirty.to_csv(cwd+'/output.csv')

In [7]:
import pandas as pd

In [8]:
# Convert the TIMESTAMP column to pandas datetimes so that diffs and time-based
# interpolation can be computed on it.
# NOTE(review): the frames displayed later carry the date 2021-05-09 although the inferred
# format is time-only — presumably a default date is attached somewhere; confirm where.
df_dirty.df['TIMESTAMP'] = pd.to_datetime(df_dirty.df['TIMESTAMP'])

In [9]:
# Count the gaps between consecutive stamps. Recorded result: 25142 gaps of 1 s and
# 29 gaps of 2 s (the latter are likely single missing samples).
df_dirty.df['TIMESTAMP'].diff().value_counts()

0 days 00:00:01    25142
0 days 00:00:02       29
Name: TIMESTAMP, dtype: int64

In [10]:
# Take the median gap between consecutive stamps as the nominal sampling period;
# the median is not affected by the few 2 s gaps.
freq = df_dirty.df['TIMESTAMP'].diff().median()

In [84]:
import numpy as np

In [85]:
# Time-indexed view of the data. set_index returns a new frame, so df_dirty.df
# itself keeps TIMESTAMP as an ordinary column.
df = df_dirty.df.set_index('TIMESTAMP')
df

Unnamed: 0_level_0,Sm3/h,Sm3/h.1,C,barg,kg/s,kg/s.1,C.1,kg/m3,barg.1,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,barg.2
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2021-05-09 10:00:00,99.547951,0.008761,18.372320,1.155875,0.003782,0.009572,19.138130,998.3886719,1.129974,0.0,0.002739,0.003782,0.0,0.0,0.009572,1.155875
2021-05-09 10:00:01,99.428741,0.008641,18.373461,1.155437,0.003782,0.009573,19.138130,998.3886108,1.129535,0.0,0.002595,0.003782,0.0,0.0,0.009573,1.155437
2021-05-09 10:00:02,99.466888,0.008521,18.374611,1.154999,0.003783,0.009573,19.138130,998.3884888,1.129095,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154999
2021-05-09 10:00:03,99.409668,0.008401,18.375759,1.154631,0.003783,0.009573,19.138130,998.3884888,1.128848,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154631
2021-05-09 10:00:04,99.295219,0.008281,18.376909,1.154263,0.003783,0.009573,19.273861,998.3884888,1.128601,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-09 16:59:56,2.252615,0.000140,20.911320,0.573384,0.000346,0.000208,24.160521,997.1323853,0.607013,0.0,0.002451,0.000346,0.0,0.0,0.000208,0.573384
2021-05-09 16:59:57,2.181088,0.000139,20.911610,0.573393,0.000346,0.000208,24.160521,997.1326294,0.607017,0.0,0.002451,0.000346,0.0,0.0,0.000208,0.573393
2021-05-09 16:59:58,2.193009,0.000138,20.911900,0.573402,0.000347,0.000208,24.160521,997.1328125,0.607020,0.0,0.002451,0.000347,0.0,0.0,0.000208,0.573402
2021-05-09 16:59:59,2.204931,0.000138,20.912189,0.573410,0.000347,0.000209,24.160521,997.1329956,0.607024,0.0,0.002451,0.000347,0.0,0.0,0.000208,0.573410


In [86]:
# Keep the column labels so the placeholder frame built below gets the same columns.
# NOTE(review): the labels are unit strings and 'Unnamed: n' rather than tag names —
# presumably the units row was read as the header; confirm against the CSV.
cols = df.columns
cols

Index(['Sm3/h', 'Sm3/h.1', 'C', 'barg', 'kg/s', 'kg/s.1', 'C.1', 'kg/m3',
       'barg.1', 'Unnamed: 10', 'Unnamed: 11', 'Unnamed: 12', 'Unnamed: 13',
       'Unnamed: 14', 'Unnamed: 15', 'barg.2'],
      dtype='object')

In [87]:
# First and last stamp of the recording, taken by position (not by min/max).
start, end = df_dirty.df['TIMESTAMP'].iloc[[0, -1]]

In [88]:
# Regular time grid from the first to the last stamp, spaced by the nominal sampling period.
new_index = pd.date_range(start, end, freq=freq)

In [89]:
# Placeholder frame: one all-NaN row per grid stamp, same columns as df.
new_df = pd.DataFrame(columns=cols, index=new_index)

In [90]:
# Inspect the placeholder frame; every cell is expected to be empty (NaN).
new_df

Unnamed: 0,Sm3/h,Sm3/h.1,C,barg,kg/s,kg/s.1,C.1,kg/m3,barg.1,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,barg.2
2021-05-09 10:00:00,,,,,,,,,,,,,,,,
2021-05-09 10:00:01,,,,,,,,,,,,,,,,
2021-05-09 10:00:02,,,,,,,,,,,,,,,,
2021-05-09 10:00:03,,,,,,,,,,,,,,,,
2021-05-09 10:00:04,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-09 16:59:56,,,,,,,,,,,,,,,,
2021-05-09 16:59:57,,,,,,,,,,,,,,,,
2021-05-09 16:59:58,,,,,,,,,,,,,,,,
2021-05-09 16:59:59,,,,,,,,,,,,,,,,


In [91]:
# Put the recording on the regular time grid: keep the first row for any repeated stamp,
# then add an all-NaN row for every grid stamp that has no measurement. The result is
# sorted by time and its index has no name, exactly as with the concat-based version.
#
# reindex() replaces pd.concat([df, new_df]) because new_df has object-dtype columns.
# pandas >= 2.1 warns that all-NA frames will stop being ignored when concat picks the
# result dtypes, which would turn the numeric columns into object and break the
# time interpolation applied to comb_df afterwards.
comb_df = df[~df.index.duplicated(keep='first')]
comb_df = comb_df.reindex(comb_df.index.union(new_df.index))
comb_df

Unnamed: 0,Sm3/h,Sm3/h.1,C,barg,kg/s,kg/s.1,C.1,kg/m3,barg.1,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,barg.2
2021-05-09 10:00:00,99.547951,0.008761,18.372320,1.155875,0.003782,0.009572,19.138130,998.3886719,1.129974,0.0,0.002739,0.003782,0.0,0.0,0.009572,1.155875
2021-05-09 10:00:01,99.428741,0.008641,18.373461,1.155437,0.003782,0.009573,19.138130,998.3886108,1.129535,0.0,0.002595,0.003782,0.0,0.0,0.009573,1.155437
2021-05-09 10:00:02,99.466888,0.008521,18.374611,1.154999,0.003783,0.009573,19.138130,998.3884888,1.129095,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154999
2021-05-09 10:00:03,99.409668,0.008401,18.375759,1.154631,0.003783,0.009573,19.138130,998.3884888,1.128848,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154631
2021-05-09 10:00:04,99.295219,0.008281,18.376909,1.154263,0.003783,0.009573,19.273861,998.3884888,1.128601,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2021-05-09 16:59:56,2.252615,0.000140,20.911320,0.573384,0.000346,0.000208,24.160521,997.1323853,0.607013,0.0,0.002451,0.000346,0.0,0.0,0.000208,0.573384
2021-05-09 16:59:57,2.181088,0.000139,20.911610,0.573393,0.000346,0.000208,24.160521,997.1326294,0.607017,0.0,0.002451,0.000346,0.0,0.0,0.000208,0.573393
2021-05-09 16:59:58,2.193009,0.000138,20.911900,0.573402,0.000347,0.000208,24.160521,997.1328125,0.607020,0.0,0.002451,0.000347,0.0,0.0,0.000208,0.573402
2021-05-09 16:59:59,2.204931,0.000138,20.912189,0.573410,0.000347,0.000209,24.160521,997.1329956,0.607024,0.0,0.002451,0.000347,0.0,0.0,0.000208,0.573410


In [92]:
#Find added rows:
compare=pd.concat([comb_df,comb_df.dropna(how='all')])
compare=[~compare.index.duplicated(keep=False)]

In [109]:
# Fill the gaps in three passes: time-weighted interpolation between known samples,
# then forward fill, then backward fill for anything still missing.
comb_df = comb_df.interpolate(method='time')
comb_df = comb_df.ffill()
comb_df = comb_df.bfill()

In [110]:
# Sanity check: list every row that still has a missing value (the recorded output is empty).
comb_df[comb_df.isna().any(axis='columns')]

Unnamed: 0,Sm3/h,Sm3/h.1,C,barg,kg/s,kg/s.1,C.1,kg/m3,barg.1,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,barg.2


In [112]:
# Write the gap-filled frame to output.csv in the working directory; index=True keeps
# the timestamp index as the first CSV column.
comb_df.to_csv(cwd+'/output.csv', index=True)
#df_dirty.to_csv(cwd+'/output.csv')

In [None]:
# NOTE(review): unexecuted repeat of the first cell. Running it rebinds df_dirty to a
# freshly constructed Dataset, so the cleaned df_dirty from the cells above is lost.
from pipeline.dataset import Dataset
df_dirty = Dataset('data/real/00-complete/dirty_data.csv')