In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
import gc
from tqdm import tqdm_notebook
import sys
sys.path.append('/home/ndsviriden/MinMax94/src/utils')
from Preprocessing import Preprocessor
from converters import convert_format
import matplotlib.pyplot as plt
import warnings
# Suppress every warning for the rest of the kernel session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# libraries used below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Single shared Preprocessor instance; the loading/conversion cells below call
# its methods (SelectFeatures, PivotTable, ConvertData, AddUTC, ...).
preprocessor = Preprocessor()

## Load and preprocess MM94 stations data

In [2]:
%%time
# MM94 stations to load. Each station is read from "<station_id>_raw.csv"
# (raw, unfiltered data) located under `mypath`.
station_list = [113, 114, 612, 115]
# Further station ids, currently commented out of the list:
# 116, 117, 118, 119, 1809, 1810, 1811, 1812, 1813, 1814, 1815
mypath = '/mnt/HARD/MinMax94/data/data_all/CSV/Raw_extended/'

# One DataFrame per station (list length == number of stations), with the
# `date_time` column parsed as datetimes.
raw_list = [
    pd.read_csv(mypath + f'{station_id}_raw.csv', parse_dates=['date_time'])
    for station_id in station_list
]

# Stack the per-station frames and renumber the rows 0..n-1.
raw = pd.concat(raw_list, ignore_index=True)

CPU times: user 2.96 s, sys: 328 ms, total: 3.28 s
Wall time: 3.28 s


In [3]:
%%time
# Feature names handed to Preprocessor.SelectFeatures.
useful_features = [
    't_air',
    't_road',
    't_underroad',
    'pressure',
    'dampness',
    'cloudiness',
    'precip_code',
    'salinity',
    'visibility',
    'p_weather',
    'wind_dir',
]

# Call order: SelectFeatures -> PivotTable -> ConvertData ("Raw" to "Mmx") -> AddUTC.
raw_data = preprocessor.PivotTable(
    preprocessor.SelectFeatures(raw, useful_features)
)
mmx_data = preprocessor.AddUTC(
    preprocessor.ConvertData(raw_data, from_format="Raw", to_format="Mmx")
)

# Build the pattern list from the converted data, then interpolate it.
mmx_patterns = preprocessor.CreatePatternList(mmx_data)
mmx_interpolated = preprocessor.InterpolatePatterns(mmx_patterns)

CPU times: user 2.69 s, sys: 439 ms, total: 3.13 s
Wall time: 3.13 s


## Load and preprocess RP5 stations data

In [7]:
%%time
# RP5 stations to load; each one is read from "<station_id>.csv" under `mypath`.
station_list = [22831, 22925, 22867, 28696, 31318, 31917]
mypath = '/mnt/HARD/MinMax94/data/data_all/CSV/RP5/'

rp5_list = []
for station_id in station_list:
    # Semicolon-separated file; the first 6 lines are skipped before parsing.
    df = pd.read_csv(
        mypath + f'{station_id}.csv',
        sep=';',
        skiprows=6,
        index_col=False,
    )
    # The timestamp column is found by its 'Местное время' ("local time")
    # prefix and renamed to exactly that prefix, so every station ends up with
    # the same column name before the frames are concatenated.
    date_time_col = [col for col in df.columns if col.startswith('Местное время')][0]
    df = (
        df
        .rename(columns={date_time_col: 'Местное время'})
        .assign(station_id=station_id)  # tag each row with its source station
    )
    rp5_list.append(df)

# Stack all stations and renumber the rows 0..n-1.
rp5 = pd.concat(rp5_list, ignore_index=True)

CPU times: user 418 ms, sys: 15.1 ms, total: 433 ms
Wall time: 433 ms


In [11]:
# Convert the concatenated RP5 frame from "RP5" to "Mmx" format.
rp5_data = preprocessor.ConvertData(rp5, from_format="RP5", to_format="Mmx")

# Remove two converted columns from the frame in place.
for dropped_col in ('data_p_weather', 'data_precip_interval'):
    del rp5_data[dropped_col]

# AddUTC is given the RP5 stations CSV as its second argument
# (the file name suggests station definitions — confirm against Preprocessor).
rp5_data = preprocessor.AddUTC(
    rp5_data,
    '/mnt/HARD/MinMax94/data/data_all/CSV/stations_rp5_def.csv',
)
#rp5_interpolated = preprocessor.InterpolatePatterns(rp5_data)

In [12]:
# Preview only the first rows rather than rendering the whole converted frame;
# ten rows matches what the recorded output below shows.
rp5_data.head(10)

Unnamed: 0,data_dew_point,data_pressure,data_precip_count,data_wind_speedmax,data_t_air,station_id,data_precip_code,data_wind_velocity,data_dampness,data_wind_dir,data_cloudiness,data_visibility,date_time,date_time_utc
0,-4.6,746.0,0.033333,,-4.2,22831,0,2.0,97.0,135.0,90.0,10000.0,2017-01-01 21:00:00,2017-01-01 18:00:00
1,-3.8,745.6,0.033333,,-3.1,22831,0,1.0,95.0,180.0,75.0,10000.0,2017-01-01 18:00:00,2017-01-01 15:00:00
2,-1.4,744.4,,,-0.6,22831,10,1.0,94.0,180.0,90.0,20000.0,2017-01-01 15:00:00,2017-01-01 12:00:00
3,-0.1,743.2,,,0.2,22831,10,2.0,98.0,90.0,90.0,10000.0,2017-01-01 12:00:00,2017-01-01 09:00:00
4,-0.6,742.3,0.004167,,0.6,22831,10,2.0,92.0,90.0,100.0,4000.0,2017-01-01 09:00:00,2017-01-01 06:00:00
5,-0.3,741.8,0.000000,,0.7,22831,10,3.0,93.0,90.0,100.0,4000.0,2017-01-01 06:00:00,2017-01-01 03:00:00
6,-0.7,741.4,,,1.4,22831,0,4.0,86.0,90.0,90.0,10000.0,2017-01-01 03:00:00,2017-01-01 00:00:00
7,-0.2,741.7,,,2.0,22831,0,3.0,86.0,90.0,90.0,10000.0,2017-01-01 00:00:00,2016-12-31 21:00:00
8,-0.1,742.0,0.000000,,1.8,22831,10,3.0,87.0,67.5,90.0,10000.0,2016-12-31 21:00:00,2016-12-31 18:00:00
9,0.4,743.0,0.004167,,2.1,22831,10,4.0,89.0,67.5,90.0,10000.0,2016-12-31 18:00:00,2016-12-31 15:00:00


## Convert MM94 data to METRO