In [None]:
import pandas as pd
import numpy as np
import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf

## Data Cleaning

0) See the raw data

In [None]:
def read_raw(path):
    """Load the seven raw station CSV files found under ``path``.

    Every file is read with its 'Fecha' column as the index and exactly one
    value column cast to float. No cleaning is applied; the per-file frames
    are placed side by side with ``pd.concat(axis=1)``.

    Parameters
    ----------
    path : str
        Prefix that is concatenated directly with each file name, so it must
        already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        One column per file, ordered: 'Valor (°)', 'Valor (%)',
        'Valor (hPa)', 'Valor (W/m2)', 'Valor (°C)', 'Valor (m/s)',
        'Valor (mm)'.
    """
    # (file name, value column), listed in the order the files are read.
    sources = (
        ('C02Preci.csv', 'Valor (mm)'),
        ('C02Dviento.csv', 'Valor (°)'),
        ('C02Humedad.csv', 'Valor (%)'),
        ('C02Presion.csv', 'Valor (hPa)'),
        ('C02RSolar.csv', 'Valor (W/m2)'),
        ('C02Temperatura.csv', 'Valor (°C)'),
        ('C02Vviento.csv', 'Valor (m/s)'),
    )

    frames = []
    for file_name, value_col in sources:
        frames.append(
            pd.read_csv(
                path + file_name,
                index_col=0,
                usecols=['Fecha', value_col],
                dtype={value_col: float},
            )
        )

    # Precipitation is read first but goes in the last column of the result.
    precipitation, *others = frames
    Dataset_r = pd.concat(others + [precipitation], axis=1)
    return Dataset_r

In [None]:
# Summary statistics of the raw (uncleaned) Rumihurco station data,
# rounded to one decimal and shown as the cell output.
path = '/home/juan/Desktop/TESIS/Codes/data/Rumihurco2021/'
raw = read_raw(path)
# Optional export of the same summary table:
#np.round(raw.describe(),1).to_csv(path+'Rumihurco_describe_raw.csv')
np.round(raw.describe(),1)

In [None]:
# Summary statistics of the raw (uncleaned) Rumipamba station data,
# rounded to one decimal and shown as the cell output.
path = '/home/juan/Desktop/TESIS/Codes/data/Rumipamba2021/'
raw = read_raw(path)
# Optional export of the same summary table:
# raw.describe().round(1).to_csv(path + 'Rumipamba_describe_raw.csv')
raw.describe().round(1)

1) Leer los datos, hacer preprocesamiento (sin data-scaling)

In [None]:
def interpolate(data):
    """Drop leading rows with missing values, then fill the remaining gaps.

    Interpolation cannot fill values that precede the first valid
    observation, so the leading incomplete rows are removed first and the
    rest is filled with pandas' default ``interpolate()``.

    Parameters
    ----------
    data : pandas.DataFrame or pandas.Series
        Values to clean. For a DataFrame with several columns, a row counts
        as missing when any of its columns is null.

    Returns
    -------
    pandas.DataFrame or pandas.Series
        The input from its first complete row onward, interpolated. An empty
        object of the same type is returned when the input is empty or has
        no complete row.
    """
    missing = data.isnull()
    if missing.ndim > 1:
        # Collapse a frame to one flag per row.
        missing = missing.any(axis=1)

    valid_positions = np.flatnonzero(~missing.to_numpy())
    if valid_positions.size == 0:
        # Nothing to anchor the interpolation on.
        return data.iloc[0:0]

    # A single positional slice replaces the row-by-row drop loop.
    return data.iloc[valid_positions[0]:].interpolate()

    

In [None]:
# This function reads the CSV file of each climate feature:
#   - usecols=[...] restricts reading to the listed columns
#   - dtype casts the value column
#   - na_values indicates which strings are treated as null values
# For each feature it reads the file, removes duplicated index entries
# (keeping the last one) and linearly interpolates the gaps.
# It returns the Dataset as a whole.


def read_csv(path):
    """Read, de-duplicate and interpolate the seven station CSV files.

    For every file the 'Fecha' column becomes the index and one value column
    is read as float (empty strings count as missing). Rows whose index label
    is duplicated are removed keeping the last occurrence, the frame is passed
    through ``interpolate`` and a uniqueness check is printed. The cleaned
    frames are joined on their shared index labels (inner join) and a
    'Date Time' column is parsed from the index.

    Parameters
    ----------
    path : str
        Prefix that is concatenated directly with each file name, so it must
        already end with a path separator.

    Returns
    -------
    pandas.DataFrame
        Value columns ordered: 'Valor (°)', 'Valor (%)', 'Valor (hPa)',
        'Valor (W/m2)', 'Valor (°C)', 'Valor (m/s)', 'Valor (mm)', followed
        by 'Date Time'.
    """
    # (file stem, value column), in the order files are read and reported.
    sources = (
        ('Preci', 'Valor (mm)'),
        ('Dviento', 'Valor (°)'),
        ('Humedad', 'Valor (%)'),
        ('Presion', 'Valor (hPa)'),
        ('RSolar', 'Valor (W/m2)'),
        ('Temperatura', 'Valor (°C)'),
        ('Vviento', 'Valor (m/s)'),
    )

    cleaned = []
    for stem, value_col in sources:
        frame = pd.read_csv(
            path + 'C02' + stem + '.csv',
            index_col=0,
            usecols=['Fecha', value_col],
            dtype={value_col: float},
            na_values=[''],
        )
        # Keep only the last row for every repeated index label.
        is_last = ~frame.index.duplicated(keep='last')
        frame = interpolate(frame.loc[is_last])
        print(stem + ".csv index are unique:", frame.index.is_unique)
        cleaned.append(frame)

    # Precipitation is read first but goes in the last value column.
    precipitation, *others = cleaned

    # Concatenate all the variables through an inner join on the index.
    Dataset = pd.concat(others + [precipitation], axis=1, join='inner')
    Dataset['Date Time'] = pd.to_datetime(Dataset.index, format='%Y/%m/%d %H:%M:%S')

    print("Null values:", Dataset.isnull().values.any())

    return Dataset

## Feature engineering

2) Convert the wind direction and velocity columns to a wind vector

In [None]:
def convert_wind(Dataset):
    """Replace the wind speed and direction columns with x/y components.

    The 'Valor (m/s)' and 'Valor (°)' columns are removed from ``Dataset``
    itself (the input is modified in place) and two new columns, 'Wind X' and
    'Wind Y', are appended.

    Parameters
    ----------
    Dataset : pandas.DataFrame
        Must contain 'Valor (m/s)' (speed) and 'Valor (°)' (direction in
        degrees).

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    speed = Dataset.pop('Valor (m/s)')
    direction_deg = Dataset.pop('Valor (°)')

    # Degrees to radians.
    direction_rad = direction_deg * np.pi / 180.0

    # Project the speed onto the x and y axes.
    Dataset['Wind X'] = speed * np.cos(direction_rad)
    Dataset['Wind Y'] = speed * np.sin(direction_rad)

    return Dataset

3) Use sin and cos to convert the time to clear "Time of day" and "Time of year" signals.

Since weather data has daily and yearly periodicity, this gives the model access to the most important frequency features. So, determine which frequencies are important using an fft

3.1. Plot the frequencies 


3.2. Convert using sin and cos 

**Plots the frequency of the data.**

*Note the obvious peaks at frequencies near 1/30 days (one month)*

In [None]:
def plot_fft(feature,col):
    """Plot the FFT magnitude of one feature against frequency in cycles/year.

    Parameters
    ----------
    feature : array-like
        1-D sequence of samples. The frequency axis is scaled on the
        assumption of one sample per hour.
    col : str
        Text used as the plot title.

    Returns
    -------
    None
        The figure is drawn and shown with ``plt.show()``.
    """
    days_per_year = 365.2425  # mean Gregorian year

    fft = tf.signal.rfft(feature)
    f_per_dataset = np.arange(0, len(fft))

    n_samples_h = len(feature)
    hours_per_year = 24 * days_per_year
    years_per_dataset = n_samples_h / hours_per_year

    # Bin index divided by the record length in years gives cycles per year.
    f_per_year = f_per_dataset / years_per_dataset
    plt.step(f_per_year, np.abs(fft))
    plt.xscale('log')
    plt.title(col)
    #plt.ylim([0, max(np.abs(fft))])
    plt.ylim([0, 2.2e6])
    plt.xlim([0.1, max(plt.xlim())])
    _ = plt.xlabel('Frequency [cycles/year]')
    _ = plt.ylabel('Count')

    # Tick positions are in cycles per year: a period of P days sits at
    # days_per_year / P, so the 30-day and 60-day periods are near 12.2 and 6.1.
    plt.xticks(
        [1, days_per_year / 60, days_per_year / 30, days_per_year],
        labels=[r'$Year^{-1}$', r'$2-Month^{-1}$', r'$Month^{-1}$', r'$Day^{-1}$'],
        rotation=45,
        fontsize=12,
    )
    plt.yticks(fontsize=12)

    plt.show()

In [None]:
 def convert_time(Dataset):
    date_time = pd.to_datetime(Dataset.pop('Date Time'), format='%d.%m.%Y %H:%M:%S')

    #in seconds
    timestamp_s = date_time.map(datetime.datetime.timestamp)

    day = 24*60*60
    thirty_days =30*day
    #year = (365.2425)*day

    Dataset['Monthly sin'] = np.sin(timestamp_s * (2 * np.pi / thirty_days))
    Dataset['Monthly cos'] = np.cos(timestamp_s * (2 * np.pi / thirty_days))

    bi_month = 60*day
    Dataset['Bi-monthly sin'] = np.sin(timestamp_s * (2 * np.pi / bi_month))
    Dataset['Bi-monthly cos'] = np.cos(timestamp_s * (2 * np.pi / bi_month))

    return (Dataset)


## Ejecutar para Rumihurco 

In [None]:
# Load and clean the Rumihurco station files, then save the summary
# statistics of the cleaned data (rounded to one decimal).
path = '/home/juan/Desktop/TESIS/Codes/data/Rumihurco2021/'
Dataset = read_csv(path)
Dataset.describe().round(1).to_csv(path + 'Describe_Rumihurco_clean.csv')
# Optional: plot the frequency spectrum of every column.
#for col in Dataset.columns:
#    plot_fft(Dataset[col],col)



In [None]:
# Feature engineering for Rumihurco: time signals first, then wind vectors.
Dataset = convert_wind(convert_time(Dataset))

# Save the engineered dataset, its summary statistics, and a second copy of
# the dataset next to the source files.
Dataset.to_csv("/home/juan/Desktop/TESIS/Codes/codesTesis/Paso1/Rumihurco.csv")
Dataset.describe().round(1).to_csv(path + 'Describe_Rumihurco_feature_eng.csv')
Dataset.to_csv(path + 'Rumihurco.csv')
Dataset

4) Correlation Analysis

In [None]:
# Pairwise correlation of the engineered columns, drawn as a heatmap.
correlations = Dataset.corr()
palette = sns.color_palette("vlag", as_cmap=True)
ax = sns.heatmap(correlations, cmap=palette)

## Repetimos para Rumipamba

In [None]:
# Load and clean the Rumipamba station files, then save the summary
# statistics of the cleaned data (rounded to one decimal).
path = '/home/juan/Desktop/TESIS/Codes/data/Rumipamba2021/'
Dataset = read_csv(path)
Dataset.describe().round(1).to_csv(path + 'Describe_Rumipamba_clean.csv')

# Feature engineering: time signals first, then wind vectors.
Dataset = convert_wind(convert_time(Dataset))

# Save the engineered dataset, its summary statistics, and a second copy of
# the dataset next to the source files.
Dataset.to_csv("/home/juan/Desktop/TESIS/Codes/codesTesis/Paso1/Rumipamba.csv")
Dataset.describe().round(1).to_csv(path + 'Describe_Rumipamba_feature_eng.csv')
Dataset.to_csv(path + 'Rumipamba.csv')
Dataset

4) Correlation analysis

In [None]:
# Pairwise correlation of the engineered columns, drawn as a heatmap.
correlations = Dataset.corr()
palette = sns.color_palette("vlag", as_cmap=True)
ax = sns.heatmap(correlations, cmap=palette)