In [1]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error
import warnings
import tqdm
from IPython.display import clear_output
from multiprocessing import Pool
from time import time
warnings.filterwarnings('ignore')

In [2]:
# Mount Google Drive at /content/drive so the notebook can reach files stored
# on Drive (the data files are read via relative paths after the `cd` below).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
cd "/content/drive/Shareddrives/Machine Learning - Transformers"

/content/drive/Shareddrives/Machine Learning - Transformers


In [44]:
# Training split. Later cells read its 'time', 'station_id', 'longitude',
# 'latitude' and 'PM25_Concentration' columns.
train_data = pd.read_csv('Data/data_final_transformers_0_train.csv')

In [4]:
# Test split. Later cells read its 'time', 'station_id', 'longitude',
# 'latitude' and 'PM25_Concentration' columns.
test_data = pd.read_csv('Data/data_final_transformers_0_test.csv')

### Build a time index used to split the data into 24-hour windows

In [49]:
# Give every training row a 1-based `time_index` that goes up by one whenever
# `time` differs from the previous row, so consecutive rows with the same
# timestamp share an index.
#
# This is done with shift + cumsum instead of a row-by-row Python loop: it is
# positional (does not rely on the frame having a default 0..n-1 index) and
# also works on an empty frame. The comparison against the shifted column is
# always True for the first row, so numbering starts at 1.
time_changed = train_data['time'].ne(train_data['time'].shift())
time_index = time_changed.cumsum().to_numpy()

# Plain assignment appends the column on the first run and overwrites it on a
# re-run; DataFrame.insert would raise because the column already exists.
train_data['time_index'] = time_index

# Station ids present in the training data, in order of first appearance.
train_stations = train_data.station_id.unique()

In [8]:
# Give every test row a 1-based `time_index` that goes up by one whenever
# `time` differs from the previous row, so consecutive rows with the same
# timestamp share an index.
#
# This is done with shift + cumsum instead of a row-by-row Python loop: it is
# positional (does not rely on the frame having a default 0..n-1 index) and
# also works on an empty frame. The comparison against the shifted column is
# always True for the first row, so numbering starts at 1.
time_changed = test_data['time'].ne(test_data['time'].shift())
time_index = time_changed.cumsum().to_numpy()

# Plain assignment appends the column on the first run and overwrites it on a
# re-run; DataFrame.insert would raise because the column already exists.
test_data['time_index'] = time_index

# Station ids present in the test data, in order of first appearance.
test_stations = test_data.station_id.unique()
  

### Split the data into local and remote stations and into 24-hour windows

In [9]:
# Number of consecutive `time_index` values covered by one sliding window;
# read as a global by get_train_features / get_test_features.
time_window = 24

In [17]:
def get_train_features(i):
    """Build the model inputs for one sliding window of the training data.

    The window is every row of the global ``train_data`` whose ``time_index``
    lies in ``[i, i + time_window)``. Each station in ``train_stations`` takes
    a turn as the "local" station; all other stations in the window are its
    "remote" stations.

    Assumes (TODO confirm against the data files) that the rows are ordered by
    time step and that every time step lists the same stations in the same
    order: the remote block is cut into ``time_window`` equal consecutive
    chunks, and ``np.split`` raises if the rows do not divide evenly.

    Parameters
    ----------
    i : int
        First ``time_index`` value of the window.

    Returns
    -------
    list of numpy.ndarray
        Five arrays, each stacked over the local stations along axis 0:

        0. remote met/AQ features, ``(n_local, n_remote, n_features, time_window)``;
           ``PM25_Concentration`` is min-max scaled over the remote rows of
           this window.
        1. remote minus local ``(longitude, latitude)``, ``(n_local, n_remote, 2)``.
        2. local features without ``PM25_Concentration``,
           ``(n_local, n_features, n_local_rows)``.
        3. the local station's last ``PM25_Concentration`` in the window, ``(n_local, 1)``.
        4. the local station id, ``(n_local, 1)``.
    """
    id_and_position_cols = ['station_id', 'longitude', 'latitude']
    coord_cols = ['longitude', 'latitude']

    in_window = (train_data['time_index'] >= i) & (train_data['time_index'] < i + time_window)
    window_df = train_data[in_window]

    remote_metaq_data = []
    remote_dist_data = []
    local_met_data = []
    local_aq_data = []
    local_stations = []

    for station in train_stations:
        # ---- Remote side: every station except the current one ----
        # Explicit copy: the scaled PM2.5 column is written into this frame, and
        # writing into a filtered slice of window_df is chained assignment
        # (SettingWithCopyWarning; only silent here because warnings are ignored).
        remote_side = window_df[window_df.station_id != station].copy()
        remote_pm25 = remote_side['PM25_Concentration'].to_numpy().reshape(-1, 1)
        remote_side['PM25_Concentration'] = MinMaxScaler().fit_transform(remote_pm25).ravel()

        remote_values = remote_side.drop(columns=id_and_position_cols).values
        # (time, station, feature) -> (station, feature, time), plus a leading
        # axis of length 1 for the local station.
        per_step = np.array(np.split(remote_values, time_window, axis=0))
        remote_metaq_data.append(per_step.transpose(1, 2, 0)[np.newaxis, :])

        # ---- Local side: the current station only ----
        local_side = window_df[window_df.station_id == station]
        local_stations.append(local_side['station_id'].values[-1].reshape(-1, 1))

        local_met = local_side.drop(columns=id_and_position_cols + ['PM25_Concentration']).values
        # (row, feature) -> (feature, row), plus the leading station axis.
        local_met_data.append(local_met.swapaxes(0, 1)[np.newaxis, :])

        # Target: the last PM2.5 reading of the local station inside the window.
        local_aq_data.append(local_side['PM25_Concentration'].values[-1].reshape(-1, 1))

        # Coordinate offset of each remote station from the local station
        # (one row per station, taken from its first row in the window).
        remote_coords = remote_side.drop_duplicates('station_id')[coord_cols].values
        local_coords = local_side.drop_duplicates('station_id')[coord_cols].values
        remote_dist_data.append((remote_coords - local_coords)[np.newaxis, :])

    return [np.concatenate(remote_metaq_data),
            np.concatenate(remote_dist_data),
            np.concatenate(local_met_data),
            np.concatenate(local_aq_data),
            np.concatenate(local_stations)]

In [51]:
def get_test_features(i):
    """Build the model inputs for one sliding window of the test data.

    Local stations come from the global ``test_data``; the remote side is the
    whole ``train_data`` window with the same ``time_index`` range
    ``[i, i + time_window)``, so the identical remote block is reused for every
    test station.

    Assumes (TODO confirm against the data files) that ``train_data`` and
    ``test_data`` share the same ``time_index`` numbering, that training rows
    are ordered by time step, and that every time step lists the same stations
    in the same order: the remote block is cut into ``time_window`` equal
    consecutive chunks, and ``np.split`` raises if the rows do not divide
    evenly.

    Parameters
    ----------
    i : int
        First ``time_index`` value of the window.

    Returns
    -------
    list of numpy.ndarray
        Five arrays, each stacked over the test stations along axis 0:

        0. remote met/AQ features, ``(n_local, n_remote, n_features, time_window)``;
           ``PM25_Concentration`` is min-max scaled over the remote rows of
           this window.
        1. remote minus local ``(longitude, latitude)``, ``(n_local, n_remote, 2)``.
        2. local features without ``PM25_Concentration``,
           ``(n_local, n_features, n_local_rows)``.
        3. the local station's last ``PM25_Concentration`` in the window, ``(n_local, 1)``.
        4. the local station id, ``(n_local, 1)``.
    """
    id_and_position_cols = ['station_id', 'longitude', 'latitude']
    coord_cols = ['longitude', 'latitude']

    remote_metaq_data = []
    remote_dist_data = []
    local_met_data = []
    local_aq_data = []
    local_stations = []

    test_window = test_data[(test_data['time_index'] >= i) & (test_data['time_index'] < i + time_window)]

    # ---- Remote side: all training stations, identical for every test station ----
    # Explicit copy: the scaled PM2.5 column is written into this frame, and
    # writing into a filtered slice of train_data is chained assignment
    # (SettingWithCopyWarning; only silent here because warnings are ignored).
    remote_side = train_data[(train_data['time_index'] >= i) & (train_data['time_index'] < i + time_window)].copy()
    remote_pm25 = remote_side['PM25_Concentration'].to_numpy().reshape(-1, 1)
    remote_side['PM25_Concentration'] = MinMaxScaler().fit_transform(remote_pm25).ravel()

    remote_values = remote_side.drop(columns=id_and_position_cols).values
    # (time, station, feature) -> (station, feature, time), plus a leading
    # axis of length 1 for the local station.
    per_step = np.array(np.split(remote_values, time_window, axis=0))
    remote_met_aq2 = per_step.transpose(1, 2, 0)[np.newaxis, :]

    # One coordinate row per remote station (its first row in the window);
    # it does not depend on the test station, so compute it once.
    remote_coords = remote_side.drop_duplicates('station_id')[coord_cols].values

    for station in test_stations:
        remote_metaq_data.append(remote_met_aq2)

        # ---- Local side: the current test station only ----
        local_side = test_window[test_window.station_id == station]
        local_stations.append(local_side['station_id'].values[-1].reshape(-1, 1))

        local_met = local_side.drop(columns=id_and_position_cols + ['PM25_Concentration']).values
        # (row, feature) -> (feature, row), plus the leading station axis.
        local_met_data.append(local_met.swapaxes(0, 1)[np.newaxis, :])

        # Target: the last PM2.5 reading of the local station inside the window.
        local_aq_data.append(local_side['PM25_Concentration'].values[-1].reshape(-1, 1))

        # Coordinate offset of each remote (training) station from this test station.
        local_coords = local_side.drop_duplicates('station_id')[coord_cols].values
        remote_dist_data.append((remote_coords - local_coords)[np.newaxis, :])

    return [np.concatenate(remote_metaq_data),
            np.concatenate(remote_dist_data),
            np.concatenate(local_met_data),
            np.concatenate(local_aq_data),
            np.concatenate(local_stations)]

In [64]:
from sklearn.preprocessing import StandardScaler

fold=0 # not using for loop to avoid ram overflow
print('fold', fold)

# Scale the continuous columns with ONE scaler fitted on the training data and
# apply it to both splits. Fitting a second scaler on the test data would put
# train and test longitude/latitude on different scales, and get_test_features
# subtracts test-station coordinates from train-station coordinates.
cols_to_scale = ['temperature','latitude','longitude','wind_speed','humidity', 'pressure']
scaler = MinMaxScaler().fit(train_data[cols_to_scale])
train_data[cols_to_scale] = scaler.transform(train_data[cols_to_scale])
test_data[cols_to_scale] = scaler.transform(test_data[cols_to_scale])

# Training features are generated the same way; kept commented out so that only
# one split is held in memory at a time.
# print('train start')
# time_range = train_data.time_index.unique()
# window_starts = range(1, len(time_range)-time_window)
# with Pool(26) as p:
#     train_combo = list(tqdm.tqdm(p.imap(get_train_features, window_starts), total=len(window_starts)))
# print('train finished')
print('test start')
time_range = test_data.time_index.unique()
# NOTE(review): time_index runs from 1 to len(time_range), so the last full
# window starts at len(time_range) - time_window + 1; this range stops two
# windows earlier. Left unchanged so the number of generated samples stays the
# same as before -- confirm whether the last two windows should be included.
window_starts = range(1, len(time_range)-time_window)
with Pool(26) as p:
    # total is taken from the range itself so the progress bar ends at 100%.
    test_combo = list(tqdm.tqdm(p.imap(get_test_features, window_starts), total=len(window_starts)))
print('test finished')

# for combo_data, name in zip([train_combo], ['train']):
#     station_metaq_data = np.concatenate([combo[0] for combo in combo_data])
#     station_dist_data = np.concatenate([combo[1] for combo in combo_data])
#     local_met_data = np.concatenate([combo[2] for combo in combo_data])
#     local_aq_data = np.concatenate([combo[3] for combo in combo_data])
#     local_stations = np.concatenate([combo[4] for combo in combo_data])
#     np.savez('./Data/Final_Data_Transformers/transformers_station_metaq_data_'+name, station_metaq_data) 
#     np.savez('./Data/Final_Data_Transformers/transformers_station_dist_data_'+name, station_dist_data)
#     np.savez('./Data/Final_Data_Transformers/transformers_local_met_data_'+name, local_met_data)
#     np.savez('./Data/Final_Data_Transformers/transformers_local_aq_data_'+name, local_aq_data)
#     np.savez('./Data/Final_Data_Transformers/transformers_local_stationids_'+name, local_stations)

# Stack the per-window results along axis 0 and write one .npz per array.
# np.savez stores a positional array under the key 'arr_0', which is the key
# the post-processing cells read back.
for combo_data, name in zip([test_combo], ['test']):
    station_metaq_data = np.concatenate([combo[0] for combo in combo_data])
    station_dist_data = np.concatenate([combo[1] for combo in combo_data])
    local_met_data = np.concatenate([combo[2] for combo in combo_data])
    local_aq_data = np.concatenate([combo[3] for combo in combo_data])
    local_stations = np.concatenate([combo[4] for combo in combo_data])
    np.savez('./Data/Final_Data_Transformers/transformers_station_metaq_data_'+name, station_metaq_data) 
    np.savez('./Data/Final_Data_Transformers/transformers_station_dist_data_'+name, station_dist_data)
    np.savez('./Data/Final_Data_Transformers/transformers_local_met_data_'+name, local_met_data)
    np.savez('./Data/Final_Data_Transformers/transformers_local_aq_data_'+name, local_aq_data)
    np.savez('./Data/Final_Data_Transformers/transformers_local_stationids_'+name, local_stations)


fold 0
test start


100%|█████████▉| 719/721 [00:44<00:00, 16.15it/s]


test finished


In [54]:
local_met_data.shape, local_aq_data.shape, station_metaq_data.shape, station_dist_data.shape

((8628, 32, 24), (8628, 1), (8628, 23, 33, 24), (8628, 23, 2))

In [None]:
# name = 'train'
# local_met_data = np.load(f'Data/Final_Data_Transformers/transformers_local_met_data_{name}.npz', allow_pickle=True)['arr_0']
# local_met = np.array([np.delete(local_met_data[i].T, [0,31], axis=1) for i in range(local_met_data.shape[0])], dtype=float)
# np.savez('./Data/Final_Data_Transformers/final_transformers_local_met_data_'+ name, local_met)

In [None]:
name = 'test'
# NOTE(review): allow_pickle=True is presumably needed because the array was
# saved with object dtype -- confirm. Unpickling can execute arbitrary code, so
# only load files produced by this notebook.
local_met_data = np.load(f'Data/Final_Data_Transformers/transformers_local_met_data_{name}.npz', allow_pickle=True)['arr_0']

# Feature columns removed before the float conversion.
# NOTE(review): presumably the raw 'time' and 'time_index' columns -- confirm
# against the column order of the source CSV.
dropped_feature_cols = [0, 31]

# (sample, feature, time) -> (sample, time, feature), drop the two columns, and
# convert to float, all in one vectorised pass instead of a per-sample loop.
local_met = np.delete(local_met_data.transpose(0, 2, 1), dropped_feature_cols, axis=2).astype(float)
np.savez('./Data/Final_Data_Transformers/final_transformers_local_met_data_'+ name, local_met)

In [None]:
pd.DataFrame(local_met[0])

In [None]:
# name = 'train'
# station_metaq_data = np.load(f'Data/Final_Data_Transformers/transformers_station_metaq_data_{name}.npz', allow_pickle=True)['arr_0']
# station_metaq = []
# for i in range(station_metaq_data.shape[0]):
#   tmp = np.array([np.delete(station_metaq_data[i][j].T, [0,32], axis=1) for j in range(station_metaq_data.shape[1])], dtype=float)
#   station_metaq.append(tmp)
# station_metaq = np.array(station_metaq)
# np.savez('./Data/Final_Data_Transformers/final_transformers_station_metaq_data_'+ name, station_metaq)

In [None]:
name = 'test'
# NOTE(review): allow_pickle=True is presumably needed because the array was
# saved with object dtype -- confirm. Unpickling can execute arbitrary code, so
# only load files produced by this notebook.
station_metaq_data = np.load(f'Data/Final_Data_Transformers/transformers_station_metaq_data_{name}.npz', allow_pickle=True)['arr_0']

# Feature columns removed before the float conversion.
# NOTE(review): presumably the raw 'time' and 'time_index' columns -- confirm
# against the column order of the source CSV.
dropped_feature_cols = [0, 32]

# (sample, station, feature, time) -> (sample, station, time, feature), drop
# the two columns, and convert to float, all in one vectorised pass instead of
# nested per-sample / per-station Python loops.
station_metaq = np.delete(station_metaq_data.transpose(0, 1, 3, 2), dropped_feature_cols, axis=3).astype(float)
np.savez('./Data/Final_Data_Transformers/final_transformers_station_metaq_data_'+ name, station_metaq)

In [None]:
pd.DataFrame(station_metaq[0][0])