In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
import gc
from tqdm import tqdm_notebook
import sys
sys.path.append('/home/ndsviriden/MinMax94/src/utils')
from Preprocessing import Preprocessor
from converters import convert_rp5_to_mmx, mmx_datetime_to_metro_format
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

# Shared Preprocessor instance; its methods are called by the preprocessing cells below.
preprocessor = Preprocessor()

## Load and preprocess MM94 stations data

In [2]:
%%time
# MM94 stations whose raw CSV exports are loaded.
# Currently disabled ids: 116, 117, 118, 119, 1809, 1810, 1811, 1812, 1813, 1814, 1815
station_list = [113, 114, 612, 115]
mypath = '/mnt/HARD/MinMax94/data/data_all/CSV/Raw_extended/'

# Each station lives in '<station_id>_raw.csv' inside `mypath`.
raw_paths = [f'{mypath}{station_id}_raw.csv' for station_id in station_list]

# One DataFrame per station (same order as station_list), with 'date_time' parsed as datetimes.
raw_list = [pd.read_csv(csv_path, parse_dates=['date_time']) for csv_path in raw_paths]

# Stack all stations into a single table with a fresh 0..n-1 index.
raw = pd.concat(raw_list, ignore_index=True)

CPU times: user 2.87 s, sys: 378 ms, total: 3.25 s
Wall time: 3.25 s


In [3]:
%%time
# Feature names handed to Preprocessor.SelectFeatures for the raw MM94 table.
useful_features = [
    't_air', 't_road', 't_underroad', 'pressure', 'dampness', 'cloudiness',
    'precip_code', 'salinity', 'visibility', 'p_weather', 'wind_dir',
]

# Stage 1: feature selection followed by AddUTC
# (presumably attaches UTC timestamps -- TODO confirm in Preprocessing).
raw_lmeteo = preprocessor.AddUTC(preprocessor.SelectFeatures(raw, useful_features))

# Stage 2: PivotTable -> FixPressureScale -> ConvertDataToMmx, applied in that order.
lmeteo_pivot = preprocessor.ConvertDataToMmx(
    preprocessor.FixPressureScale(
        preprocessor.PivotTable(raw_lmeteo)
    )
)

# Stage 3: build the pattern list from the pivoted table and interpolate it.
lmeteo_patterns = preprocessor.CreatePatternList(lmeteo_pivot)
lmeteo_interpolated = preprocessor.InterpolatePatterns(lmeteo_patterns)

CPU times: user 3.25 s, sys: 556 ms, total: 3.8 s
Wall time: 3.8 s


## Load and preprocess RP5 stations data

In [6]:
%%time
# RP5 station ids to load; note this rebinds station_list / mypath used by the MM94 cell.
station_list = [22831, 22925, 22867, 28696, 31318, 31917]
mypath = '/mnt/HARD/MinMax94/data/data_all/CSV/RP5/'

# Collect one DataFrame per RP5 station, each tagged with its station id.
rp5_list = []
for station_id in station_list:
    # Files are ';'-separated; the first 6 lines are skipped before the header row.
    df = pd.read_csv(f'{mypath}{station_id}.csv', sep=';', skiprows=6, index_col=False)

    # The local-time column only shares the 'Местное время' prefix across files
    # (presumably followed by a station-specific suffix), so normalise it to the bare prefix.
    local_time_headers = [name for name in df.columns if name.startswith('Местное время')]
    df = (
        df.rename(columns={local_time_headers[0]: 'Местное время'})
          .assign(station_id=station_id)
    )
    rp5_list.append(df)

CPU times: user 380 ms, sys: 4.15 ms, total: 384 ms
Wall time: 384 ms


In [7]:
# Stack the per-station RP5 frames (fresh 0..n-1 index) and pass them through convert_rp5_to_mmx.
rp5_data = convert_rp5_to_mmx(pd.concat(rp5_list, ignore_index=True))

# Drop the two converted columns that are removed before the UTC / interpolation steps.
rp5_data = rp5_data.drop(columns=['data_p_weather', 'data_precip_interval'])

# Station definition file passed to AddUTC
# (presumably used to resolve each station's UTC offset -- TODO confirm in Preprocessing).
rp5_stations_def = '/mnt/HARD/MinMax94/data/data_all/CSV/stations_rp5_def.csv'
rp5_data = preprocessor.AddUTC(rp5_data, rp5_stations_def)

# NOTE(review): the MM94 branch feeds InterpolatePatterns a CreatePatternList result,
# while here the table is passed directly -- confirm both inputs are supported.
rp5_interpolated = preprocessor.InterpolatePatterns(rp5_data)

## Convert MM94 data to METRO

In [8]:
# Preview only: render the interpolated RP5 'date_time_utc' values with
# mmx_datetime_to_metro_format; the resulting Series is displayed, not stored.
rp5_interpolated['date_time_utc'].apply(mmx_datetime_to_metro_format)

0        2011-12-31 21:30 UTC
1        2011-12-31 22:00 UTC
2        2011-12-31 22:30 UTC
3        2011-12-31 23:00 UTC
4        2011-12-31 23:30 UTC
5        2012-01-01 00:30 UTC
6        2012-01-01 01:00 UTC
7        2012-01-01 01:30 UTC
8        2012-01-01 02:00 UTC
9        2012-01-01 02:30 UTC
10       2012-01-01 03:30 UTC
11       2012-01-01 04:00 UTC
12       2012-01-01 04:30 UTC
13       2012-01-01 05:00 UTC
14       2012-01-01 05:30 UTC
15       2012-01-01 06:30 UTC
16       2012-01-01 07:00 UTC
17       2012-01-01 07:30 UTC
18       2012-01-01 08:00 UTC
19       2012-01-01 08:30 UTC
20       2012-01-01 09:30 UTC
21       2012-01-01 10:00 UTC
22       2012-01-01 10:30 UTC
23       2012-01-01 11:00 UTC
24       2012-01-01 11:30 UTC
25       2012-01-01 12:30 UTC
26       2012-01-01 13:00 UTC
27       2012-01-01 13:30 UTC
28       2012-01-01 14:00 UTC
29       2012-01-01 14:30 UTC
                 ...         
53366    2016-12-30 04:30 UTC
53367    2016-12-30 05:00 UTC
53368    2