In [1]:
import os
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf

In [12]:
# Working folders used by the preprocessing steps. The cells in this notebook
# read from and write to 'data/<name>/', so the folders are created under
# 'data/' rather than in the current working directory.
dirs = ['data_by_station', 'data_years', 'norm_data_01', 'norm_data_11', 'norm_data_o01', 'norm_data_o11']

for dir_name in dirs:
    # exist_ok makes the cell safe to re-run and avoids a check-then-create race.
    os.makedirs(os.path.join('data', dir_name), exist_ok=True)

# 1. Compress the data into year-long zip files

In [2]:
import os

# Folder holding the raw CSV files and folder receiving one archive per year.
stations_dir = 'data/stations_data/'
years_dir = 'data/data_years/'
os.makedirs(years_dir, exist_ok=True)

# Map each year to the list of CSV files belonging to it. The year is taken
# from the first four characters of the file name without its extension.
# (str.strip('.csv') removes a *set of characters* from both ends, not the
# suffix, so the extension is split off with os.path.splitext instead.)
data_years = {}

# Sorted so that the processing order, and therefore the row order inside each
# yearly archive, does not depend on the file system's listing order.
for file in sorted(os.listdir(stations_dir)):
    if file.endswith(".csv"):
        year = os.path.splitext(file)[0][:4]
        data_years.setdefault(year, []).append(file)

for year in data_years:
    print(year)
    # Years from 2020 onwards are listed but not archived.
    if int(year) >= 2020:
        continue

    # Concatenate every CSV of the year and store it as a single zipped CSV.
    frames = [pd.read_csv(stations_dir + file, sep=',') for file in data_years[year]]
    data = pd.concat(frames)
    data.to_csv(years_dir + year + '.zip', index=False, compression='zip')


2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023


Find which stations do not have any wind-related data.

In [3]:
# Collect the stations whose three wind-speed columns hold only the value -1
# within a yearly archive. A station is appended once for every archive in
# which that is the case, so the list can contain repeated entries; the set
# printed at the end shows the distinct ones.
no_data_stations = []
speed_columns = ['vel10', 'vel6', 'vel2']

for archive in os.listdir('data/data_years/'):
    yearly = pd.read_csv('data/data_years/' + archive, compression='zip')
    # sort=False visits the stations in order of first appearance.
    for code, rows in yearly.groupby('station', sort=False):
        only_placeholders = (rows[speed_columns] == -1).all()
        if only_placeholders.all():
            no_data_stations.append(code)

print(set(no_data_stations))

{'MQ', 'W3', 'MV', 'XZ', 'MS', 'KP', 'XB', 'WE', 'WF', 'W8', 'VY', 'XO', 'XH', 'XQ', 'VZ', 'Y4', 'UO', 'W2', 'WO', 'WW', 'VN', 'X2', 'W1', 'UI', 'V4', 'D2', 'UD', 'VX', 'U8', 'YM', 'UE', 'D1', 'UK'}


# 2. Split the yearly data into one file per station

In [4]:
# Split every yearly archive into one CSV per station, skipping the stations
# listed in no_data_stations.
#
# The first chunk written for a station during this run (re)creates its file
# with a header row; later chunks are appended without a header. Tracking this
# per run keeps the cell idempotent: re-running it no longer appends duplicate
# rows, or a second header line, to files left over from a previous run.
os.makedirs('data/data_by_station/', exist_ok=True)
written_stations = set()

# Sorted so that the yearly chunks are appended in a deterministic order.
for file in sorted(os.listdir('data/data_years/')):
    df = pd.read_csv('data/data_years/' + file, compression='zip', header=0, sep=',')
    for station, df_station in df.groupby('station', sort=False):
        if station in no_data_stations:
            continue

        first_chunk = station not in written_stations
        df_station.to_csv(
            'data/data_by_station/' + station + '.csv',
            mode='w' if first_chunk else 'a',
            index=False,
            header=first_chunk,
        )
        written_stations.add(station)

# 3. Convert the winds from (speed, angle) to (speed_x, speed_y)

In [5]:
import math

# Suffixes of the wind column groups. Each group has a 'vel<suffix>' speed
# column and an 'ang<suffix>' direction column.
# NOTE(review): the suffixes look like measurement heights - confirm.
wind_suffixes = ['2', '6', '10']

for file in os.listdir('data/data_by_station/'):
    if file.endswith(".csv"):
        df = pd.read_csv('data/data_by_station/' + file)

        # Drop rows that are repeated header lines (their 'date' cell is the
        # literal text 'date').
        df = df[df['date'] != 'date'].copy()
        # Keep the rows whose date string lies between the two bounds
        # (plain string comparison on the 'date' column).
        df = df[(df['date'] >= '2010-01-01') & (df['date'] <= '2020-01-01')].copy()
        # Non-positive HR and P values are replaced by 0.
        df.loc[df['HR'] <= 0, 'HR'] = 0
        df.loc[df['P'] <= 0, 'P'] = 0

        for suffix in wind_suffixes:
            vel_col, ang_col = 'vel' + suffix, 'ang' + suffix
            u_col, v_col = 'u' + suffix, 'v' + suffix

            # Start from float zeros so that assigning the float components
            # below does not change the column dtype.
            df[u_col] = 0.0
            df[v_col] = 0.0

            # Rows with speed -1 keep the 0.0 placeholder in both components.
            valid = df[vel_col] != -1
            speed = df.loc[valid, vel_col]
            # The angle column is treated as degrees and converted to radians.
            angle_rad = df.loc[valid, ang_col] * (math.pi / 180)

            df.loc[valid, u_col] = (speed * np.cos(angle_rad)).round(1)
            df.loc[valid, v_col] = (speed * np.sin(angle_rad)).round(1)

        df = df.drop(columns=['vel2', 'ang2', 'vel6', 'ang6', 'vel10', 'ang10'])

        # os.path.splitext removes exactly the extension; str.strip('.csv')
        # would strip any of the characters '.', 'c', 's', 'v' from both ends.
        station_name = os.path.splitext(file)[0]
        df.to_csv('data/data_by_station/' + station_name + '.zip', index=False, compression='zip')

# 4. Find the maxima and minima used for normalization

In [6]:
import json


def _to_builtin(value):
    """Return numpy scalars as plain Python scalars so json.dump can write them."""
    return value.item() if hasattr(value, 'item') else value


# Global [min, max] per column over all station archives, written to
# data/min_maxs.json. Values equal to 0 are left out of both extremes, and a
# column that holds a single distinct value in a file is skipped for that file.
min_maxs = {}

for file in os.listdir('data/data_by_station/'):
    if file.endswith(".zip"):
        df = pd.read_csv('data/data_by_station/' + file, compression='zip')
        for col in df.columns:
            if col in ['date', 'station', 'HR']:
                continue
            if df[col].unique().shape[0] == 1:
                continue

            # The same "!= 0" filter is used for every min and max; the
            # original filtered the very first max with "!= -1" instead.
            non_zero = df.loc[df[col] != 0, col]
            col_min, col_max = non_zero.min(), non_zero.max()
            # Nothing but zeros/NaN left: skip, so a NaN cannot get stuck in
            # the running extremes (min(nan, x) keeps nan).
            if pd.isna(col_min) or pd.isna(col_max):
                continue
            col_min, col_max = _to_builtin(col_min), _to_builtin(col_max)

            if col not in min_maxs:
                min_maxs[col] = [col_min, col_max]
            else:
                min_maxs[col][0] = min(min_maxs[col][0], col_min)
                min_maxs[col][1] = max(min_maxs[col][1], col_max)

with open('data/min_maxs.json', 'w') as fp:
    json.dump(min_maxs, fp, indent=4)

In [7]:
# Min-max normalisation of every station archive using the global extremes in
# min_maxs. Two variants are written per station:
#   * '_01': (x - min) / (max - min), floored at 0
#   * '_11': the same value mapped with * 2 - 1, floored at -1
# 'HR' is divided by 100 and floored at 0 in both variants; 'date', 'station',
# 'altitud', 'latitud' and 'longitud' are copied unchanged.
os.makedirs('data/norm_data_01/', exist_ok=True)
os.makedirs('data/norm_data_11/', exist_ok=True)

for file in os.listdir('data/data_by_station/'):
    if file.endswith(".zip"):
        df = pd.read_csv('data/data_by_station/' + file, compression='zip')
        df_01 = df.copy()
        df_11 = df.copy()

        for col in df.columns:
            if col not in ['date', 'station', 'HR', 'altitud', 'latitud', 'longitud']:
                col_min, col_max = min_maxs[col]
                scaled = (df[col] - col_min) / (col_max - col_min)

                df_01[col] = scaled.clip(lower=0)
                df_11[col] = (scaled * 2 - 1).clip(lower=-1)
            if col == 'HR':
                humidity = (df[col] / 100).clip(lower=0)
                df_01[col] = humidity
                df_11[col] = humidity

        # os.path.splitext removes exactly the extension; str.strip('.zip')
        # would strip any of the characters '.', 'z', 'i', 'p' from both ends.
        station_name = os.path.splitext(file)[0]
        df_01.to_csv('data/norm_data_01/' + station_name + '_01.zip', index=False, compression='zip')
        df_11.to_csv('data/norm_data_11/' + station_name + '_11.zip', index=False, compression='zip')

# 5. Compute the IQR and treat outliers, then normalize

In [8]:
import json

# For every measured column: [Q1, Q3, Q1 - 1.5 * IQR, Q3 + 1.5 * IQR], computed
# over the non-zero values of all station archives and saved to
# data/quantiles.json.
quantiles = {}

# Only the header of the reference file is needed to obtain the column names.
df = pd.read_csv('data/data_by_station/C6.zip', compression='zip', nrows=0)
cols = df.columns

station_files = [name for name in os.listdir('data/data_by_station/') if name.endswith(".zip")]

for col in cols:
    if col in ['date', 'station', 'HR', 'altitud', 'latitud', 'longitud']:
        continue

    parts = []
    for file in station_files:
        # Load just the column being processed instead of the whole file.
        values = pd.read_csv('data/data_by_station/' + file, compression='zip', usecols=[col])[col]
        # A column with a single distinct value in a file is skipped for it.
        if values.unique().shape[0] == 1:
            continue

        parts.append(values[values != 0.0])

    if not parts:
        raise ValueError("No usable data found for column '" + col + "'")

    data = pd.concat(parts)
    q1 = float(data.quantile(0.25))
    q3 = float(data.quantile(0.75))
    iqr = q3 - q1
    quantiles[col] = [q1, q3, q1 - 1.5 * iqr, q3 + 1.5 * iqr]

with open('data/quantiles.json', 'w') as fp:
    json.dump(quantiles, fp, indent=4)

In [10]:
# Outlier-aware normalisation: every measured column is scaled between the IQR
# fences stored in quantiles[col][2] (lower) and quantiles[col][3] (upper), and
# values outside the fences are clamped. Two variants are written per station:
#   * '_01': range [0, 1]
#   * '_11': range [-1, 1]
# 'HR' is divided by 100 and floored at 0 in both variants; 'date', 'station',
# 'altitud', 'latitud' and 'longitud' are copied unchanged.
#
# The results go to 'norm_data_o01' / 'norm_data_o11' so that they no longer
# overwrite the min-max normalised files stored in 'norm_data_01' /
# 'norm_data_11'.
# NOTE(review): the target folders are taken from the folder list at the top of
# the notebook - confirm that downstream code reads the IQR data from there.
os.makedirs('data/norm_data_o01/', exist_ok=True)
os.makedirs('data/norm_data_o11/', exist_ok=True)

for file in os.listdir('data/data_by_station/'):
    if file.endswith(".zip"):
        df = pd.read_csv('data/data_by_station/' + file, compression='zip')
        df_01 = df.copy()
        df_11 = df.copy()

        for col in df.columns:
            if col not in ['date', 'station', 'HR', 'altitud', 'latitud', 'longitud']:
                lower_fence, upper_fence = quantiles[col][2], quantiles[col][3]
                scaled = (df[col] - lower_fence) / (upper_fence - lower_fence)

                df_01[col] = scaled.clip(lower=0, upper=1)
                df_11[col] = (scaled * 2 - 1).clip(lower=-1, upper=1)
            if col == 'HR':
                humidity = (df[col] / 100).clip(lower=0)
                df_01[col] = humidity
                df_11[col] = humidity

        # os.path.splitext removes exactly the extension; str.strip('.zip')
        # would strip any of the characters '.', 'z', 'i', 'p' from both ends.
        station_name = os.path.splitext(file)[0]
        df_01.to_csv('data/norm_data_o01/' + station_name + '_01.zip', index=False, compression='zip')
        df_11.to_csv('data/norm_data_o11/' + station_name + '_11.zip', index=False, compression='zip')