# DATA PREPROCESSING - FIRST

In [None]:
version_ID = 'first'

In [1]:
#import libraries
import numpy as np
import pandas as pd
import sklearn 
from sklearn.model_selection import train_test_split
from random import shuffle
import os
from os import path
import time
import warnings


### Preprocess data for Training and Model Selection (Train & Validation & Test)

TRAIN.SCV

In [2]:
#import dataset
X = pd.read_csv('../../data/raw/train_data.csv.zip')

#split dataset into train and test
X_train_input, X_test_input = train_test_split(X, test_size=0.2, random_state=42, shuffle=True)

#get parameters for normalization --> avoid data leekage
norm_paras = X_train_input.agg(['mean', 'std', 'median'])
#print(norm_paras)

                index       lat       lon  contest-pevpr-sfc-gauss-14d__pevpr  \
mean    187815.244352  0.592658  0.518061                          276.734118   
std     108466.905409  0.251740  0.272093                          198.054535   
median  187706.000000  0.590909  0.533333                          250.390000   

        nmme0-tmp2m-34w__cancm30  nmme0-tmp2m-34w__cancm40  \
mean                   10.810185                 12.736341   
std                    10.956833                 11.079325   
median                  9.950000                 11.890000   

        nmme0-tmp2m-34w__ccsm30  nmme0-tmp2m-34w__ccsm40  \
mean                  10.106994                11.645961   
std                   10.340406                10.579823   
median                 9.490000                11.620000   

        nmme0-tmp2m-34w__cfsv20  nmme0-tmp2m-34w__gfdlflora0  ...  \
mean                  10.741099                    11.781627  ...   
std                    9.407989                

  norm_paras = X_train_input.agg(['mean', 'std', 'median'])


Preprocessing pipeline

In [3]:
def preprocess(X, norm_paras=norm_paras):

    length = X.shape[0]
    print(f'sample size: {length}')
    
    #warnings.filterwarnings('ignore')
    target = 'contest-tmp2m-14d__tmp2m'

    #extract y
    if target in X.columns.values:
        y = X[target].to_numpy()
        X = X.drop(labels=target, axis=1)
        print("(y extracted)")
    else:    y = None

    # convert date (str -> float)
    dates = pd.to_datetime(X.startdate).tolist()
    for i in range(length):
        new = pd.to_datetime(dates[i]).timetuple()
        dates[i] = time.mktime(new)
        
        
    X['startdate'] = dates
    print("(Startdate converted)")
    
    #normalize numerical variables and fill missing values
    numerical = norm_paras.columns[norm_paras.columns != target]
    for col in numerical:
        X[col].fillna(norm_paras[col]['median'], inplace=True)
        X[col] = (X[col] - norm_paras[col]['mean']) / norm_paras[col]['std']
        X[col] = np.float32(X[col])
    
    #one-hot-encode categorical variables
    X = pd.get_dummies(X, columns=['climateregions__climateregion'])
    
    #drop columns
    drops = []
    X = X.drop(labels=drops, axis=1)
    #change to numpy array
    X_final = X.to_numpy()

    #info
    print("This is an extract of df after preprocessing:")
    print(f"Shape X: {X_final.shape}, Shape y: {y.shape if y is not None else None}")
    print(X.info())

    return X_final, y

In [4]:
# preprocess train.csv
X_train, y_train = preprocess(X_train_input)

sample size: 300587
(y extracted)
(Startdate converted)
This is an extract of df after preprocessing:
Shape X: (300587, 259), Shape y: (300587,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 300587 entries, 242935 to 121958
Columns: 259 entries, index to climateregions__climateregion_Dwb
dtypes: float32(243), float64(1), uint8(15)
memory usage: 287.5 MB
None


In [5]:
X_test, y_test = preprocess(X_test_input)

sample size: 75147
(y extracted)
(Startdate converted)
This is an extract of df after preprocessing:
Shape X: (75147, 259), Shape y: (75147,)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 75147 entries, 363717 to 212790
Columns: 259 entries, index to climateregions__climateregion_Dwb
dtypes: float32(243), float64(1), uint8(15)
memory usage: 71.9 MB
None


In [6]:
# preprocess test.csv

X_prediction = pd.read_csv('../data/raw/test_data.csv.zip')
X_prediction, _ = preprocess(X_prediction)

sample size: 31354
(Startdate converted)
This is an extract of df after preprocessing:
Shape X: (31354, 259), Shape y: None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31354 entries, 0 to 31353
Columns: 259 entries, index to climateregions__climateregion_Dwb
dtypes: float32(243), float64(1), uint8(15)
memory usage: 29.8 MB
None


In [7]:
np.info(X_test)

class:  ndarray
shape:  (75147, 259)
strides:  (8, 601176)
itemsize:  8
aligned:  True
contiguous:  False
fortran:  True
data pointer: 0x7efccab9c010
byteorder:  little
byteswap:  False
type: float64


In [8]:
X_prediction.shape

(31354, 259)

In [9]:
X_prediction[:2][:4]

array([[ 1.73249853e+00, -2.35424924e+00,  1.15869439e+00,
         1.66725720e+09,  3.18830788e-01,  1.83171678e+00,
         1.64122438e+00,  1.84354532e+00,  1.83122528e+00,
         1.99074435e+00,  1.79053164e+00,  1.76269960e+00,
         1.87401891e+00,  1.46039569e+00,  1.78361952e+00,
         4.70474124e-01,  1.56262636e+00,  1.47284222e+00,
         1.57756865e+00,  1.48593760e+00,  1.80036592e+00,
         1.55793643e+00,  1.50246501e+00,  1.62771654e+00,
         1.28008270e+00,  1.55021274e+00,  5.32361209e-01,
        -2.55347162e-01, -1.66224828e-03, -6.80841982e-01,
         3.51670361e-03,  6.03631586e-02, -6.21231735e-01,
         1.35406196e-01,  4.19668317e-01,  3.38014483e-01,
        -1.05244897e-01,  1.50680697e+00,  4.98238564e-01,
        -5.69388270e-01, -5.96465290e-01, -9.96786833e-01,
        -8.39475244e-02, -9.86811221e-01, -1.08370769e+00,
        -1.12020671e+00, -1.38977438e-01, -6.82521582e-01,
        -4.73532349e-01,  4.48357701e-01, -7.67971635e-0

---------------

# Save dataset --version

In [10]:
# prepare folder and info file for dataset versions

version_note = '__'
timestamp = time.strftime("on %Y_%m_%d at %H_%M")

overview = pd.read_csv('../data/preprocessed/dataset_overview.csv')

if not os.path.exists(f'../data/preprocessed/{version_ID}'):
        os.mkdir(f'../data/preprocessed/{version_ID}')

#append to overview logs
overview = overview.append({'version_ID': version_ID, 'version_note': version_note, 'timestamp': timestamp}, 
                                ignore_index=True)
overview = overview[{'version_ID', 'version_note', 'timestamp'}]
overview.drop_duplicates(inplace=True)
overview.to_csv('../data/preprocessed/dataset_overview.csv')

# save preprocessed data
np.save(f'../data/preprocessed/{version_ID}/X_train.npy', X_train)
np.save(f'../data/preprocessed/{version_ID}/X_test.npy', X_test)
np.save(f'../data/preprocessed/{version_ID}/y_train.npy', y_train)
np.save(f'../data/preprocessed/{version_ID}/y_test.npy', y_test)
np.save(f'../data/preprocessed/{version_ID}/X_predict.npy', X_prediction)

  overview = overview.append({'version_ID': version_ID, 'version_note': version_note, 'timestamp': timestamp},
  overview = overview[{'version_ID', 'version_note', 'timestamp'}]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  overview.drop_duplicates(inplace=True)


In [11]:
# see overview of existant datasets

overview = pd.read_csv('../data/preprocessed/dataset_overview.csv')
overview

Unnamed: 0.1,Unnamed: 0,timestamp,version_note,version_ID
0,0,on 2023_01_06 at 13_15,first,first
1,1,on 2023_01_06 at 13_35,first,first
2,2,on 2023_01_06 at 15_27,first,first
3,3,on 2023_01_06 at 15_47,first,first


In [12]:
############# use to create new overview-version and DELETE OLD VERSIONS #############

#overview = pd.DataFrame(columns=['version_ID', 'version_note', 'timestamp'])
#overview.to_csv('../data/preprocessed/dataset_overview.csv')