In [78]:
import datetime
import gc
import sys
import warnings
from copy import deepcopy

import numpy as np
import pandas as pd
from tqdm import tqdm_notebook

sys.path.append('/home/ndsviriden/PycharmProjects/MinMax94/src/utils')
from Preprocessing import Preprocessor

warnings.filterwarnings('ignore')

In [79]:
def InterpolatePatterns(pattern_list):
    """Resample every pattern onto a regular 30-minute time grid.

    For each DataFrame in ``pattern_list`` a grid of timestamps spaced 30
    minutes apart is built between the floored first and last ``date_time``,
    the grid rows are merged with the original rows, the ``data`` columns are
    interpolated in time and the ``id`` columns by nearest neighbour, and
    finally only the grid rows are kept.

    Parameters
    ----------
    pattern_list : iterable of pandas.DataFrame
        Each frame is accessed through ``date_time``, ``station_id``, ``data``
        and ``id``. The code addresses columns with 2-tuples such as
        ``('interpol', '')``, so it presumably expects a two-level column
        index -- TODO confirm against ``Preprocessor.CreatePatternList``.

    Returns
    -------
    list of pandas.DataFrame
        One interpolated frame per input pattern, in input order. The input
        frames are not modified.

    Raises
    ------
    ValueError
        If a pattern contains rows from more than one station.
    """

    def round_30min(time):
        """Floor ``time`` to the start of its half-hour slot (seconds dropped)."""
        return datetime.datetime(time.year, time.month, time.day, time.hour,
                                 time.minute - time.minute % 30, 0)

    def interpolate(df):
        """Return a copy of ``df`` resampled onto the 30-minute grid."""
        # copy the initial dataframe, so that no actions inside the function can affect it
        df_result = deepcopy(df)

        start = round_30min(df_result.date_time.min())
        end = round_30min(df_result.date_time.max())

        # regular grid of target timestamps; these rows carry no measurements yet
        df_add = pd.DataFrame(pd.date_range(start=start, end=end, freq='30min'),
                              columns=pd.MultiIndex.from_tuples([('date_time', '')]))

        # a pattern must belong to exactly one station, otherwise the
        # interpolation below would blend readings from different stations
        station_ids = df_result.station_id.unique()
        if len(station_ids) > 1:
            raise ValueError(
                'pattern contains several stations: {}'.format(list(station_ids)))
        df_add[('station_id', '')] = station_ids[0]

        # adding a new column, so that we can define, which data is original and which is interpolated
        df_result[('interpol', '')] = False
        df_add[('interpol', '')] = True

        # Merging 2 tables; 'interpol' differs between them, so no rows are
        # matched and the outer merge simply stacks both sets of rows
        df_result = pd.merge(df_result, df_add, how='outer',
                             on=['date_time', 'interpol', 'station_id'])

        df_result = df_result.set_index('date_time')
        df_result = df_result.sort_index()
        df_result.valid = True

        # Interpolating data in '30min - round' timestamp
        df_result.data = df_result.data.interpolate(method='time', limit_direction='both')
        df_result.id = df_result.id.interpolate(method='nearest', limit_direction='both')

        # Deleting original data, leaving only interpolated rows
        df_result = df_result[df_result.interpol]
        del df_result[('interpol',)]
        df_result = df_result.dropna()
        df_result = df_result.reset_index()
        return df_result

    patterns_interpolated = [interpolate(pattern) for pattern in tqdm_notebook(pattern_list)]
    # the original built this list and then dropped it, so the function returned None
    return patterns_interpolated
    
def round_30min(time):
    """Floor a timestamp to the start of its half-hour slot.

    Despite the name this never rounds up: minutes 0-29 become 0 and
    minutes 30-59 become 30. Seconds are set to 0; microseconds and any
    timezone information are not carried over to the result.

    Parameters
    ----------
    time : object with ``year``, ``month``, ``day``, ``hour`` and ``minute``
        attributes (e.g. ``datetime.datetime`` or ``pandas.Timestamp``).

    Returns
    -------
    datetime.datetime
        Naive datetime on a :00 or :30 boundary.
    """
    floored_minute = time.minute - time.minute % 30
    return datetime.datetime(time.year, time.month, time.day, time.hour, floored_minute, 0)

In [80]:
# Path to the raw, unfiltered CSV export (a single file)
mypath = '/home/ndsviriden/data_csv/113_raw.csv'

# Load the file into one DataFrame, parsing `date_time` as datetimes
raw_lmeteo = pd.read_csv(mypath, parse_dates=['date_time'])

# Run the project preprocessing steps in order: feature selection, pivot,
# pressure-scale fix, then splitting into a list of patterns
preprocessor = Preprocessor()
useful_features = ['t_air', 't_road', 't_underroad', 'pressure', 'dampness', 'cloudiness']
raw_lmeteo = preprocessor.SelectFeatures(raw_lmeteo, useful_features)
lmeteo_pivot = preprocessor.FixPressureScale(preprocessor.PivotTable(raw_lmeteo))
pattern_list = preprocessor.CreatePatternList(lmeteo_pivot)

# Column groups of the pivoted table, selected by name prefix
data_columns = [name for name in lmeteo_pivot.columns if name.startswith('data_')]
id_columns = [name for name in lmeteo_pivot.columns if name.startswith('id_')]

In [81]:
# Pick a single pattern to prototype the interpolation on.
# NOTE(review): 28 is a magic index; why this pattern was chosen is not visible here.
test = pattern_list[28]

In [90]:
%%time
# Prototype: resample the single pattern `test` onto a regular 30-minute grid.
data_columns = [column for column in lmeteo_pivot.columns if column.startswith('data_')]
id_columns = [column for column in lmeteo_pivot.columns if column.startswith('id_')]

# copy the initial dataframe, so that no actions inside the cell can affect it
df_result = deepcopy(test)
df_result = df_result.set_index('date_time')
df_result['interpol'] = False

# create table with rounded date_time
# NOTE(review): .round() snaps to the NEAREST half hour, while round_30min()
# always floors -- confirm which of the two is intended for the grid bounds.
start = df_result.index.min().round('30min')
end = df_result.index.max().round('30min')
df_add = pd.DataFrame(index=pd.date_range(start, end, freq='30min', name='date_time'))
df_add['interpol'] = True
df_add['station_id'] = df_result['station_id'].unique()[0]

# Stack original and grid rows. pandas rejects `on=` combined with
# left_index/right_index, so date_time is turned back into a column for the
# merge and restored as a sorted index afterwards.
df_result = (
    df_result.reset_index()
    .merge(df_add.reset_index(), how='outer', on=['date_time', 'station_id', 'interpol'])
    .set_index('date_time')
    .sort_index()
)

# measurements: linear in time, filling at most 4 consecutive missing rows
for column in data_columns:
    df_result[column] = df_result[column].interpolate(method='time', limit_direction='both', limit=4)

# identifiers: take the value of the nearest original row
for column in id_columns:
    df_result[column] = df_result[column].interpolate(method='nearest', limit_direction='both', limit=4)

# keep only the grid rows and drop the helper flag
df_result = df_result[df_result['interpol']].drop(columns='interpol')
df_result

CPU times: user 40 ms, sys: 0 ns, total: 40 ms
Wall time: 43.2 ms


In [None]:
# Scratch copy of the body of interpolate() from InterpolatePatterns.
# It relies on `df_result`, `start` and `end` already existing in the kernel.
# NOTE(review): this code expects `date_time` as a column and two-level column
# labels; the `df_result` left behind by the previous cell has `date_time` as
# its index and flat labels, so this cell probably needs a fresh
# `df_result = deepcopy(<pattern>)` first -- confirm, or delete the cell.
df_add = pd.DataFrame(pd.date_range(start=start, end=end, freq='30min'),
                      columns=pd.MultiIndex.from_tuples([('date_time', '')]))

# !!if len(df_result.station_id.unique()) > 1 --- Raise Error ##
df_add[('station_id', '')] = df_result.station_id.unique()[0]

# adding a new column, so that we can define, which data is original and which is interpolated
df_result[('interpol', '')] = False
df_add[('interpol', '')] = True

# Merging 2 tables
df_result = pd.merge(df_result, df_add, how='outer',
                     on=['date_time', 'interpol', 'station_id'])

df_result = df_result.set_index('date_time')
df_result = df_result.sort_index()
df_result.valid = True

# Interpolating data in '30min - round' timestamp
df_result.data = df_result.data.interpolate(method='time', limit_direction='both')
df_result.id = df_result.id.interpolate(method='nearest', limit_direction='both')

# Deleting original data, leaving only interpolated rows
df_result = df_result[df_result.interpol]
del df_result[('interpol',)]
df_result = df_result.dropna()
df_result = df_result.reset_index()
# a bare `return` is a SyntaxError at cell level; display the frame instead
df_result