In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
import gc
from tqdm import tqdm_notebook
import sys
sys.path.append('/home/ndsviriden/MinMax94/src/utils')
from Preprocessing import Preprocessor
import warnings
warnings.filterwarnings('ignore')

In [3]:
def InterpolatePatterns(pattern_list):
    """Resample every pattern onto a regular 30-minute time grid.

    Parameters
    ----------
    pattern_list : iterable of pandas.DataFrame
        Each frame must have two-level columns containing ('date_time', ''),
        ('station_id', '') and the top-level groups 'data' and 'id'.
        NOTE(review): the second level of 'data' / 'id' is presumably the
        feature name (t_air, t_road, ...) -- confirm against
        Preprocessor.CreatePatternList.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input pattern, holding only the rows that lie on the
        30-minute grid. 'data' values are interpolated linearly in time and
        'id' values are taken from the nearest original observation. Grid rows
        that still contain a NaN after interpolation are dropped.

    Raises
    ------
    ValueError
        If a pattern does not contain exactly one distinct station_id.
    """

    def round_30min(time):
        """Round a timestamp down to the previous half-hour boundary."""
        return pd.Timestamp(time).floor('30min')

    def interpolate(df):
        """Interpolate a single pattern; the input frame is left untouched."""
        # Work on a copy, so that no action inside the function can affect the input.
        df_result = df.copy()

        # Every grid row is labelled with the station id, so the pattern has to
        # belong to exactly one station (this also rejects empty patterns).
        station_ids = df_result['station_id'].unique()
        if len(station_ids) != 1:
            raise ValueError('pattern must contain exactly one station_id, got %d'
                             % len(station_ids))

        start = round_30min(df_result['date_time'].min())
        end = round_30min(df_result['date_time'].max())

        df_add = pd.DataFrame(pd.date_range(start=start, end=end, freq='30min'),
                              columns=pd.MultiIndex.from_tuples([('date_time', '')]))
        df_add[('station_id', '')] = station_ids[0]

        # Flag column telling original observations apart from grid rows.
        df_result[('interpol', '')] = False
        df_add[('interpol', '')] = True

        # The flag differs between the two frames, so an outer merge on it could
        # never match any rows; a plain row-wise union gives the same table.
        df_result = pd.concat([df_result, df_add], ignore_index=True)
        df_result = df_result.set_index('date_time').sort_index()

        # Mark every row as valid, but only when the frame really has 'valid'
        # columns (attribute assignment would silently do nothing otherwise).
        if 'valid' in df_result.columns:
            df_result['valid'] = True

        # Fill the grid rows: linear in time for measurements, nearest
        # observation for record ids (method='nearest' requires scipy).
        df_result['data'] = df_result['data'].interpolate(method='time', limit_direction='both')
        df_result['id'] = df_result['id'].interpolate(method='nearest', limit_direction='both')

        # Drop the original observations, keeping only the grid rows.
        df_result = df_result[df_result['interpol']]
        del df_result['interpol']
        df_result = df_result.dropna()
        df_result = df_result.reset_index()
        return df_result

    patterns_interpolated = [interpolate(pattern) for pattern in tqdm_notebook(pattern_list)]
    return patterns_interpolated

Unnamed: 0,station_id,date_time,data_cloudiness,data_dampness,data_pressure,data_t_air,data_t_road,data_t_underroad,id_cloudiness,id_dampness,id_pressure,id_t_air,id_t_road,id_t_underroad
0,113,2012-09-17 18:01:48,7.0,831.0,7357.0,136.0,140.0,140.0,1.170373e+08,1.170373e+08,1.170373e+08,1.170373e+08,1.170373e+08,1.170373e+08
1,113,2012-09-18 14:33:21,3.0,664.0,7398.0,156.0,194.0,187.0,1.170407e+08,1.170407e+08,1.170407e+08,1.170407e+08,1.170407e+08,1.170407e+08
2,113,2012-09-18 16:02:38,2.0,646.0,7398.0,157.0,200.0,187.0,1.170517e+08,1.170517e+08,1.170517e+08,1.170517e+08,1.170517e+08,1.170517e+08
3,113,2012-09-18 17:03:11,2.0,674.0,7398.0,152.0,194.0,181.0,1.170599e+08,1.170599e+08,1.170599e+08,1.170599e+08,1.170599e+08,1.170599e+08
4,113,2012-09-18 17:32:17,3.0,692.0,7406.0,147.0,190.0,177.0,1.170635e+08,1.170635e+08,1.170635e+08,1.170635e+08,1.170635e+08,1.170635e+08
5,113,2012-09-18 18:32:58,6.0,764.0,7406.0,135.0,141.0,145.0,1.170710e+08,1.170710e+08,1.170710e+08,1.170710e+08,1.170710e+08,1.170710e+08
6,113,2012-09-18 21:32:29,7.0,1000.0,7414.0,77.0,75.0,76.0,1.170935e+08,1.170935e+08,1.170935e+08,1.170935e+08,1.170935e+08,1.170935e+08
7,113,2012-09-18 23:32:49,8.0,1000.0,7414.0,62.0,56.0,58.0,1.171088e+08,1.171088e+08,1.171088e+08,1.171088e+08,1.171088e+08,1.171088e+08
8,113,2012-09-19 00:03:09,8.0,1000.0,7423.0,71.0,61.0,61.0,1.171127e+08,1.171127e+08,1.171127e+08,1.171127e+08,1.171127e+08,1.171127e+08
9,113,2012-09-19 00:34:59,8.0,1000.0,7414.0,60.0,56.0,58.0,1.171169e+08,1.171169e+08,1.171169e+08,1.171169e+08,1.171169e+08,1.171169e+08


In [4]:
# Path to a single raw CSV file (not a directory tree).
# NOTE(review): the file name suggests station 113 -- confirm.
mypath = '/mnt/HARD/MinMax94/data/data_all/CSV/Raw_extended/113_raw.csv'

# Read that one file into a single DataFrame (not a list of frames),
# parsing the 'date_time' column as datetimes.
raw_lmeteo = pd.read_csv(mypath, parse_dates = ['date_time'])

# Run the project Preprocessor pipeline step by step.
# NOTE(review): the Preprocessor methods are defined outside this notebook;
# the step descriptions below are inferred from their names -- verify.
preprocessor = Preprocessor()
useful_features = ['t_air', 't_road', 't_underroad', 'pressure', 'dampness', 'cloudiness']
# presumably keeps only the listed features
raw_lmeteo = preprocessor.SelectFeatures(raw_lmeteo, useful_features)
# presumably reshapes the data to one row per timestamp
lmeteo_pivot = preprocessor.PivotTable(raw_lmeteo)
# presumably normalizes the pressure units
lmeteo_pivot = preprocessor.FixPressureScale(lmeteo_pivot)
# result is indexable; the first element is previewed below with .head()
pattern_list = preprocessor.CreatePatternList(lmeteo_pivot)
pattern_list[0].head()

Unnamed: 0,station_id,date_time,data_cloudiness,data_dampness,data_pressure,data_t_air,data_t_road,data_t_underroad,id_cloudiness,id_dampness,id_pressure,id_t_air,id_t_road,id_t_underroad
7,113,2012-09-18 23:32:49,8.0,1000.0,7414.0,62.0,56.0,58.0,117108758.0,117108768.0,117108757.0,117108762.0,117108759.0,117108767.0
8,113,2012-09-19 00:03:09,8.0,1000.0,7423.0,71.0,61.0,61.0,117112657.0,117112667.0,117112656.0,117112661.0,117112658.0,117112666.0
9,113,2012-09-19 00:34:59,8.0,1000.0,7414.0,60.0,56.0,58.0,117116928.0,117116938.0,117116927.0,117116932.0,117116929.0,117116937.0
10,113,2012-09-19 01:32:50,8.0,1000.0,7414.0,60.0,54.0,56.0,117124765.0,117124775.0,117124764.0,117124769.0,117124766.0,117124774.0
11,113,2012-09-19 02:32:26,8.0,1000.0,7423.0,59.0,53.0,53.0,117132680.0,117132690.0,117132679.0,117132684.0,117132681.0,117132689.0
