# PREPROCESSING --version 001
- radical drop of colunms
- date split into Y / W


----


In [72]:
version_ID = '001'
version_note = '_coldrop_datesplit_'

In [73]:
import numpy as np
import pandas as pd
import sklearn 
from sklearn.model_selection import train_test_split
from random import shuffle
import os
from os import path
import time
import warnings
import re

-----

### Load datasets

In [74]:
#import data
X = pd.read_csv('../../data/raw/train_data.csv.zip')
summer_weeks = pd.read_csv('../../data/additional/summer_intensity.csv')

#split dataset into train and test
X_train_input, X_test_input = train_test_split(X, test_size=0.2, random_state=42, shuffle=True)
print(X_train_input.shape, X_train_input.shape)

(300587, 246) (300587, 246)


In [75]:
# import helper data
summer_weeks = pd.read_csv('../../data/additional/summer_intensity.csv')

### Prepare columns & NN

In [76]:
#get parameters for normalization --> avoid data leekage
norm_paras = X_train_input.agg(['mean', 'std', 'median'])

# reduce expanded features
collapse = []
coll_patt = ['.+vwnd-250', '.+uwnd-250', '.+vwnd-925', '.+uwnd-925', '.+hgt-850', '.+hgt-500', '.+hgt-10', '.+hgt-100', '^icec', '^sst']

for p in coll_patt:
    pt = re.compile(p)
    collapse += [[c for c in X_train_input.columns if pt.match(c)]]

In [77]:
#get drop columns
drops = ['startdate']
patterns = ['^nmme', '^wind', '^icec', '^sst']

for p in patterns:
    p = re.compile(p)
    drops += [k for k in X_train_input.columns if bool(p.match(k))]
#print(len(drops), '\n', drops)

### Preprocessing pipeline

In [78]:
def preprocess(X, norm_paras=norm_paras):

    warnings.filterwarnings("ignore")
    length = X.shape[0]
    target = 'contest-tmp2m-14d__tmp2m'

    # extract y
    if target in X.columns.values:
        y = X[target].to_numpy()
        X = X.drop(labels=target, axis=1)
    else:    y = None
    
    
    ########################################
    
    # collapse expanded features
    for i in range(len(coll_patt)):
        name = coll_patt[i].strip('.+-')
        X[f'{name}_mn'] = X[collapse[i]].mean(axis=1)
        X[f'{name}_std'] = X[collapse[i]].std(axis=1)
       
    # normalize numerical variables and fill missing values
    numerical = norm_paras.columns[norm_paras.columns != target]
    for col in numerical:
        X[col].fillna(norm_paras[col]['median'], inplace=True)
        X[col] = (X[col] - norm_paras[col]['mean']) / norm_paras[col]['std']
        X[col] = np.float32(X[col])
    
    # convert date (to time progress & summer intensity)
    dates = pd.to_datetime(X.startdate)
    X['time_prog'] = [((d.year - 2014) + d.month)/36 for d in dates] # 2014-2016 are translated to [0,1]
    X['summer'] = [summer_weeks.loc[summer_weeks.week == d.week].values[0][1] for d in dates] # 0: winter, 1: summer
    
    # drop columns
    X = X.drop(labels=drops, axis=1).copy()
    
    # one-hot-encode categorical variables
    X = pd.get_dummies(X, columns=['climateregions__climateregion'])

    ########################################
    
    
    # change to numpy array
    X_final = X.to_numpy()

    # info
    print("PREPROCESSING result:")
    print(f"  Shape X: {X_final.shape}  || Shape Y: {y.shape if y is not None else None}")

    return X_final, y, X

----

### PREPROCESSING

In [79]:
# preprocess train.csv
X_train, y_train, X_df = preprocess(X_train_input)

PREPROCESSING result:
  Shape X: (300587, 69)  || Shape Y: (300587,)


In [81]:
X_test, y_test, _ = preprocess(X_test_input)

PREPROCESSING result:
  Shape X: (75147, 69)  || Shape Y: (75147,)


In [82]:
# preprocess test.csv
X_prediction = pd.read_csv('../../data/raw/test_data.csv.zip')
X_prediction, _, _ = preprocess(X_prediction)

PREPROCESSING result:
  Shape X: (31354, 69)  || Shape Y: None


---------------

# Save dataset 
--version

In [86]:
# prepare folder and info file for dataset versions

timestamp = time.strftime("on %Y_%m_%d at %H_%M")

overview = pd.read_csv('../../data/preprocessed/dataset_overview.csv')

if not os.path.exists(f'../../data/preprocessed/{version_ID}'):
        os.mkdir(f'../../data/preprocessed/{version_ID}')

#append to overview logs
overview = overview.append({'version_ID': version_ID, 'version_note': version_note, 'timestamp': timestamp}, 
                                ignore_index=True)
overview = overview[{'version_ID', 'version_note', 'timestamp'}]
overview.drop_duplicates(inplace=True)
overview.to_csv('../../data/preprocessed/dataset_overview.csv')

# save preprocessed data
np.save(f'../../data/preprocessed/{version_ID}/X_train.npy', X_train)
np.save(f'../../data/preprocessed/{version_ID}/X_test.npy', X_test)
np.save(f'../../data/preprocessed/{version_ID}/y_train.npy', y_train)
np.save(f'../../data/preprocessed/{version_ID}/y_test.npy', y_test)
np.save(f'../../data/preprocessed/{version_ID}/X_predict.npy', X_prediction)
X_df.describe().to_csv(f'../../data/preprocessed/{version_ID}/columns.csv')

In [88]:
# see overview of existant datasets

overview = pd.read_csv('../../data/preprocessed/dataset_overview.csv')
overview

Unnamed: 0.1,Unnamed: 0,timestamp,version_ID,version_note
0,0,on 2023_01_06 at 13_15,first,first
1,1,on 2023_01_06 at 13_35,first,first
2,2,on 2023_01_06 at 15_27,first,first
3,3,on 2023_01_06 at 15_47,first,first
4,4,on 2023_01_31 at 16_54,001,_coldrop_datesplit_


In [12]:
############# use to create new overview-version and DELETE OLD VERSIONS #############

#overview = pd.DataFrame(columns=['version_ID', 'version_note', 'timestamp'])
#overview.to_csv('../data/preprocessed/dataset_overview.csv')