In [65]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [66]:
# Load both exports the same way: header=2 takes the third line of each file
# as the column labels (the unit row, e.g. 'Sm3/h', 'barg').
df_clean, df_dirty = (
    pd.read_csv(csv_path, header=2)
    for csv_path in (
        'data/real/00-complete/clean_data.csv',
        'data/real/00-complete/dirty_data.csv',
    )
)

In [67]:
# Show the first five rows of df_clean (head() default) to check how the columns were parsed.
df_clean.head()

Unnamed: 0,TIMESTAMP,Sm3/h,Sm3/h.1,C,barg,kg/s,kg/s.1,C.1,kg/m3,barg.1,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
0,10:00:00,99.547951,0.008761,18.37232,1.155875,0.003782,0.009572,19.13813,998.388672,1.129974,0,0.002739,0.003782,0.0,0.0,0.009572
1,10:00:01,99.428741,0.008641,18.373461,1.155437,0.003782,0.009573,19.13813,998.388611,1.129535,0,0.002595,0.003782,0.0,0.0,0.009573
2,10:00:02,99.466888,0.008521,18.374611,1.154999,0.003783,0.009573,19.13813,998.388489,1.129095,0,0.002451,0.003783,0.0,0.0,0.009573
3,10:00:03,99.409668,0.008401,18.375759,1.154631,0.003783,0.009573,19.13813,998.388489,1.128848,0,0.002451,0.003783,0.0,0.0,0.009573
4,10:00:04,99.295219,0.008281,18.376909,1.154263,0.003783,0.009573,19.273861,998.388489,1.128601,0,0.002451,0.003783,0.0,0.0,0.009573


In [68]:
# Show the first five rows of df_dirty (head() default) to check how the columns were parsed.
df_dirty.head()

Unnamed: 0,TIMESTAMP,Sm3/h,Sm3/h.1,C,barg,kg/s,kg/s.1,C.1,kg/m3,barg.1,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,barg.2
0,10:00:00,99.547951,0.008761,18.37232,1.155875,0.003782,0.009572,19.13813,998.3886719,1.129974,0.0,0.002739,0.003782,0.0,0.0,0.009572,1.155875
1,10:00:01,99.428741,0.008641,18.373461,1.155437,0.003782,0.009573,19.13813,998.3886108,1.129535,0.0,0.002595,0.003782,0.0,0.0,0.009573,1.155437
2,10:00:02,99.466888,0.008521,18.374611,1.154999,0.003783,0.009573,19.13813,998.3884888,1.129095,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154999
3,10:00:03,99.409668,0.008401,18.375759,1.154631,0.003783,0.009573,19.13813,998.3884888,1.128848,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154631
4,10:00:04,99.295219,0.008281,18.376909,1.154263,0.003783,0.009573,19.273861,998.3884888,1.128601,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154263


## Issues

### Issues with Timestamp

- 11:01:00 - 11:01:59: only the minute was recorded (the seconds are missing, so every row in this minute reads 11:01:00)
- 14:20:01 - 14:40:30: the cell format is incorrect, only numerical values of a certain format are stored
- 16:00:00 - 17:00:00: recorded in 12-hour format (e.g. 4:00:00 PM)
- 13:59:46: recorded twice --> done
- 12:08:00 - 12:08:59: every other timestamp is missing

### Issues with PVs

- 12:11:01 - 12:16:06: the unit of PT102/OUT.CV changed from barg to MPa (1 barg ≈ 0.1 MPa)
- 15:14:00 - 15:16:00: measurement error in FIC103/PID1/PV; FIC103/PID1/PV is different from FIC103/PID1/SP and FT103/OUT.CV.
- Missing values scattered in the data: connectivity error, NaN, empty cells; some values can be inferred from related signals (e.g. a missing value in FT104/OUT.CV can be inferred from FIC104/PID1/PV)
- Duplicated entries: "PT101/OUT.CV" at column E and "Pressure measure" at column Q

In [10]:
# TIMESTAMP is an ordinary column after read_csv (the frame has a default
# integer index), so make it the index explicitly before the label lookup;
# a bare .loc with time strings raises KeyError on a fresh kernel.
df_clean.set_index('TIMESTAMP').loc[['11:01:00', '11:01:01']]

Unnamed: 0_level_0,Sm3/h,Sm3/h.1,C,barg,kg/s,kg/s.1,C.1,kg/m3,barg.1,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
11:01:00,118.461998,1.820567,18.946951,1.278085,0.003088,0.095124,19.329821,998.189087,1.2321,0,0.000289,0.003088,21.594589,0.1,0.095124
11:01:01,118.492996,12.29687,18.946911,1.268398,0.003046,0.095123,19.322849,998.189087,1.235527,0,0.000649,0.003046,21.594431,0.1,0.095123


In [11]:
# TIMESTAMP is an ordinary column after read_csv (the frame has a default
# integer index), so make it the index explicitly before the label lookup;
# a bare .loc with a time string raises KeyError on a fresh kernel.
# The lookup returns every row carrying the label '11:01:00'.
df_dirty.set_index('TIMESTAMP').loc[['11:01:00']]

Unnamed: 0_level_0,Sm3/h,Sm3/h.1,C,barg,kg/s,kg/s.1,C.1,kg/m3,barg.1,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,barg.2
TIMESTAMP,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
11:01:00,118.461998,1.820567,18.946951,1.278085,0.003088,0.095124,19.329821,998.1890869,1.2321,0.0,0.000289,0.003088,21.594589,0.1,0.095124,1.278085
11:01:00,118.492996,12.29687,18.946911,1.268398,0.003046,0.095123,19.322849,998.1890869,1.235527,0.0,0.000649,0.003046,21.594431,0.1,0.095123,1.268398
11:01:00,118.578796,19.86515,18.946871,1.262572,0.003004,0.095123,19.31588,998.1890259,1.229973,0.0,0.001009,0.003004,21.594271,0.1,0.095123,1.262572
11:01:00,118.712303,25.823469,18.946831,1.264785,0.002962,0.095123,19.30891,998.1890259,1.220318,0.0,0.000649,0.002962,21.59411,0.1,0.095123,1.264785
11:01:00,119.074699,27.618219,18.946791,1.258705,0.00292,0.095123,19.301941,998.1889038,1.204361,0.0,0.000289,0.00292,21.59395,0.1,0.095123,1.258705
11:01:00,119.694603,23.015181,18.946751,1.252625,0.002879,0.095122,19.29497,998.1887817,1.215086,0.0,0.0,0.002879,21.59379,0.1,0.095122,1.252625
11:01:00,120.481399,19.179319,18.94672,1.253789,0.002837,0.095122,19.10676,998.1887817,1.225811,0.0,0.0,0.002837,21.59363,0.1,0.095122,1.253789
11:01:00,120.934403,15.98277,18.94668,1.254954,0.002795,0.095122,19.10676,998.1887817,1.23054,0.0,0.0,0.002795,21.59347,0.1,0.095122,1.254954
11:01:00,121.168098,13.31897,18.94664,1.258215,0.002753,0.095121,19.10676,998.1887207,1.228223,0.0,0.0,0.002753,21.593309,0.1,0.095121,1.258215
11:01:00,121.277702,11.09914,18.9466,1.261477,0.002711,0.095121,19.10676,998.1887207,1.222889,0.0,0.00173,0.002711,21.593149,0.1,0.095121,1.261477


In [77]:
# Discard rows that repeat an earlier row in every column (first occurrence is kept).
df_test = df_dirty.loc[~df_dirty.duplicated()]

In [79]:
# Retain only rows whose TIMESTAMP value occurs more than once;
# keep=False flags every member of a repeated group, including the first.
repeated_stamp_mask = df_test.duplicated(subset=['TIMESTAMP'], keep=False)
df_test = df_test[repeated_stamp_mask]

In [119]:
from dateutil import parser
from datetime import datetime, timedelta

In [128]:
# Raw TIMESTAMP string of the first row in df_test, then its parsed datetime.
# The string holds only a time of day; the parsed value still carries a date
# (see the displayed result), which cancels out when two stamps are subtracted.
stamp = df_test['TIMESTAMP'].iloc[0]
stamp_time = parser.parse(stamp)
stamp_time

datetime.datetime(2021, 3, 28, 11, 1)

In [129]:
# Index label of the first df_test row that is the first occurrence of its TIMESTAMP.
first_occurrence_mask = ~df_test.duplicated(subset=['TIMESTAMP'], keep='first')
double_index = df_test.loc[first_occurrence_mask].index[0]

In [130]:
# One row per distinct TIMESTAMP value: the first occurrence wins, later repeats are dropped.
df_test_2 = df_dirty.drop_duplicates(subset=['TIMESTAMP'], keep='first')

In [131]:
# Display the frame that keeps one row per distinct TIMESTAMP value.
df_test_2

Unnamed: 0,TIMESTAMP,Sm3/h,Sm3/h.1,C,barg,kg/s,kg/s.1,C.1,kg/m3,barg.1,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,barg.2
0,10:00:00,99.547951,0.008761,18.372320,1.155875,0.003782,0.009572,19.138130,998.3886719,1.129974,0.0,0.002739,0.003782,0.0,0.0,0.009572,1.155875
1,10:00:01,99.428741,0.008641,18.373461,1.155437,0.003782,0.009573,19.138130,998.3886108,1.129535,0.0,0.002595,0.003782,0.0,0.0,0.009573,1.155437
2,10:00:02,99.466888,0.008521,18.374611,1.154999,0.003783,0.009573,19.138130,998.3884888,1.129095,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154999
3,10:00:03,99.409668,0.008401,18.375759,1.154631,0.003783,0.009573,19.138130,998.3884888,1.128848,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154631
4,10:00:04,99.295219,0.008281,18.376909,1.154263,0.003783,0.009573,19.273861,998.3884888,1.128601,0.0,0.002451,0.003783,0.0,0.0,0.009573,1.154263
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
25168,4:59:56 PM,2.252615,0.000140,20.911320,0.573384,0.000346,0.000208,24.160521,997.1323853,0.607013,0.0,0.002451,0.000346,0.0,0.0,0.000208,0.573384
25169,4:59:57 PM,2.181088,0.000139,20.911610,0.573393,0.000346,0.000208,24.160521,997.1326294,0.607017,0.0,0.002451,0.000346,0.0,0.0,0.000208,0.573393
25170,4:59:58 PM,2.193009,0.000138,20.911900,0.573402,0.000347,0.000208,24.160521,997.1328125,0.607020,0.0,0.002451,0.000347,0.0,0.0,0.000208,0.573402
25171,4:59:59 PM,2.204931,0.000138,20.912189,0.573410,0.000347,0.000209,24.160521,997.1329956,0.607024,0.0,0.002451,0.000347,0.0,0.0,0.000208,0.573410


In [132]:
# double_index is an index *label*, whereas .iloc expects a *position*.
# Translate the label through df_test_2's own index before stepping to the
# following row, so the lookup stays correct even when labels and positions
# differ (rows were filtered out of df_test_2).
# get_loc returns a plain int as long as the index labels are unique.
next_position = df_test_2.index.get_loc(double_index) + 1
# Parse straight into a datetime so stamp_next never holds a raw string.
stamp_next = parser.parse(df_test_2.iloc[next_position]['TIMESTAMP'])
stamp_next

datetime.datetime(2021, 3, 28, 11, 2)

In [147]:
# Spread the gap between the repeated stamp and the next distinct stamp
# evenly over the rows in df_test to get the time step per row.
diff = (stamp_next - stamp_time) / len(df_test)
diff

datetime.timedelta(seconds=1)

In [163]:
# Rebuild one timestamp per row in df_test: parse the recorded value and
# shift it by the row's ordinal times the per-row step `diff`.
# The rows are read from df_test itself rather than from df_dirty at
# position double_index + i: double_index is an index label, not a position,
# and df_test was built from the de-duplicated frame, so the row count, the
# start stamp and the step all refer to df_test's rows.
for i, raw_stamp in enumerate(df_test['TIMESTAMP']):
    stamp = parser.parse(raw_stamp) + i * diff
    print(stamp)

2021-03-28 11:01:00
2021-03-28 11:01:01
2021-03-28 11:01:02
2021-03-28 11:01:03
2021-03-28 11:01:04
2021-03-28 11:01:05
2021-03-28 11:01:06
2021-03-28 11:01:07
2021-03-28 11:01:08
2021-03-28 11:01:09
2021-03-28 11:01:10
2021-03-28 11:01:11
2021-03-28 11:01:12
2021-03-28 11:01:13
2021-03-28 11:01:14
2021-03-28 11:01:15
2021-03-28 11:01:16
2021-03-28 11:01:17
2021-03-28 11:01:18
2021-03-28 11:01:19
2021-03-28 11:01:20
2021-03-28 11:01:21
2021-03-28 11:01:22
2021-03-28 11:01:23
2021-03-28 11:01:24
2021-03-28 11:01:25
2021-03-28 11:01:26
2021-03-28 11:01:27
2021-03-28 11:01:28
2021-03-28 11:01:29
2021-03-28 11:01:30
2021-03-28 11:01:31
2021-03-28 11:01:32
2021-03-28 11:01:33
2021-03-28 11:01:34
2021-03-28 11:01:35
2021-03-28 11:01:36
2021-03-28 11:01:37
2021-03-28 11:01:38
2021-03-28 11:01:39
2021-03-28 11:01:40
2021-03-28 11:01:41
2021-03-28 11:01:42
2021-03-28 11:01:43
2021-03-28 11:01:44
2021-03-28 11:01:45
2021-03-28 11:01:46
2021-03-28 11:01:47
2021-03-28 11:01:48
2021-03-28 11:01:49


datetime.timedelta(seconds=1)