In [51]:
import pandas as pd
import geopandas as gpd
import glob
import os
# Operations on geometries
import shapely
import numpy as np
import math
import xarray as xr
import xarray
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn import linear_model
import statsmodels.api as sm
import seaborn as sns
import pickle
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from tqdm import tqdm_notebook as tqdm

In [52]:
# Figure styling: render plots inline and set the label font sizes used by
# the axes and by the x/y ticks through matplotlib's rc settings.
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Figures are written to <PROJECT_ROOT_DIR>/images/<CHAPTER_ID>.
# NOTE(review): PROJECT_ROOT_DIR is an absolute path on one specific machine;
# change it before running the notebook anywhere else.
PROJECT_ROOT_DIR = "/Users/noeliaotero/Documents/CAS_ML/data_test/"
CHAPTER_ID = "ML"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
# Create the output directory up front; exist_ok avoids an error on re-runs.
os.makedirs(IMAGES_PATH, exist_ok=True)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure under IMAGES_PATH.

    Parameters
    ----------
    fig_id : str
        File name without extension; also echoed in the progress message.
    tight_layout : bool, default True
        Call ``plt.tight_layout()`` before saving.
    fig_extension : str, default "png"
        Used both as the file extension and as the ``format`` given to
        ``plt.savefig``.
    resolution : int, default 300
        Passed to ``plt.savefig`` as ``dpi``.
    """
    file_name = fig_id + "." + fig_extension
    target = os.path.join(IMAGES_PATH, file_name)
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Silence only the warnings whose message starts with "internal gelsd"
# (see SciPy issue #5998); every other warning is still reported.
import warnings
warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [113]:
def readcsv(csvfiles, DATE_START, DATE_END, l_vars):
    """Read CSV files and keep only the rows inside a date window.

    Parameters
    ----------
    csvfiles : sequence of str
        Paths of the CSV files to read.
    DATE_START, DATE_END : datetime.date
        Inclusive bounds of the period to keep.
    l_vars : sequence of str
        Variable name of each file, paired with ``csvfiles`` by position.
        A ``'precip'`` file must provide ``year``, ``month``, ``day`` and
        ``reg_tot`` columns; any other file must provide a ``date`` column
        (``%Y-%m-%d``) and is passed through ``convertUnits``.

    Returns
    -------
    list of pandas.DataFrame
        One frame per input file, in input order. The ``date`` column holds
        ``datetime.date`` objects restricted to the requested period.

    Raises
    ------
    IndexError
        If ``csvfiles`` has more entries than ``l_vars``.
    """
    dataframes = []  # one DataFrame per input file
    # Iterate over the files that were passed in, not over a notebook global.
    for ifile, csvfile in enumerate(csvfiles):
        df = pd.read_csv(csvfile)
        mvar = l_vars[ifile]
        print(mvar)
        if mvar == 'precip':
            # Build a single date column out of the year/month/day columns
            df_time = pd.to_datetime({
                'year': df.year,
                'month': df.month,
                'day': df.day})
            df.insert(0, "date", df_time, allow_duplicates=True)
            # Keep only the total column (CH). The copy makes the later
            # assignment to 'date' act on an independent frame instead of
            # on a slice of the frame that was read.
            df = df[['date', 'reg_tot']].copy()
        else:
            df = convertUnits(df, mvar)

        # Plain datetime.date objects, so the column can be compared with
        # the DATE_START / DATE_END bounds.
        df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.date
        in_period = (df['date'] >= DATE_START) & (df['date'] <= DATE_END)
        dataframes.append(df[in_period])

    return dataframes

In [114]:
def convertUnits(df, mvar):
    """Rescale the columns of ``df`` that belong to variable ``mvar``.

    The frame is modified in place and the same object is returned.

    * ``'T2MMEAN'``: subtracts 273.15 from column ``T2MMEAN``
      (presumably Kelvin to degrees Celsius; confirm the source units).
    * ``'Z'``: divides the columns at positions 1 to 5 by the module-level
      constant ``G`` (presumably geopotential to geopotential height; confirm).
    * ``'MSL'``: divides column ``MSL`` by 100
      (presumably Pa to hPa; confirm).

    Any other ``mvar`` leaves the frame untouched.
    """
    if mvar == 'Z':
        # Positional selection: columns 1..5 are rescaled together
        z_columns = df.columns[1:6]
        df[z_columns] = df[z_columns] / G
        return df

    if mvar == 'MSL':
        df[mvar] = df[mvar] / 100
        return df

    if mvar == 'T2MMEAN':
        df[mvar] = df[mvar] - 273.15

    return df
    

In [115]:
# Configuration for loading the data
# Divisor applied to the 'Z' columns in convertUnits.
# NOTE(review): 9.80665 is the standard gravity value (m/s^2), so this
# presumably turns geopotential into geopotential height; confirm.
G = 9.80665
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
DIR_DATA = "/Users/noeliaotero/Documents/CAS_ML/data_test/"
# The period bounds are datetime.date objects rather than strings so that they
# can be compared with the 'date' column, which readcsv converts with `.dt.date`.
DATE_START = pd.to_datetime('1979-01-01').date()
DATE_END = pd.to_datetime('2020-12-31').date()
l_files  = glob.glob(os.path.join(DIR_DATA, '*.csv'))
# NOTE(review): readcsv pairs file i with l_vars[i] by position, but the order
# returned by glob.glob depends on the file system; confirm that the files
# really come back in this variable order.
l_vars  = ['Z', 'MSL', 'T2MMEAN', 'precip']

In [127]:
# Quick check on the second CSV: parse 'date' into datetime.date objects and
# keep the rows inside [DATE_START, DATE_END] (both ends inclusive).
df = pd.read_csv(l_files[1])
df['date'] = pd.to_datetime(df['date'], format='%Y-%m-%d').dt.date
df = df[df['date'].between(DATE_START, DATE_END)]

In [128]:
# Read every CSV, convert units and clip to [DATE_START, DATE_END];
# the result is a list with one DataFrame per file.
l_all = readcsv(l_files, DATE_START, DATE_END, l_vars)


Z
MSL
T2MMEAN
precip


In [126]:
# Inspect the second frame, the one paired with l_vars[1] ('MSL')
l_all[1]

Unnamed: 0,date,MSL
0,1979-01-01,1009.09700
1,1979-01-02,1026.47890
2,1979-01-03,1024.08980
3,1979-01-04,1009.13220
4,1979-01-05,1015.91220
...,...,...
15305,2020-12-27,1004.52890
15306,2020-12-28,986.39984
15307,2020-12-29,996.18420
15308,2020-12-30,1007.27516


In [52]:
# Create exceedances of precipitation
def prec_ex(dd):
    """Flag the values of each column that exceed its own 95th/99th percentile.

    Parameters
    ----------
    dd : pandas.DataFrame
        Precipitation data, one series per column. Columns named ``year``,
        ``month`` or ``day`` and non-numeric columns are copied unchanged.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Copies of ``dd`` in which every processed column is replaced by a
        boolean series: ``True`` where the value is strictly greater than the
        column's 0.95 quantile (first frame) or 0.99 quantile (second frame).
        ``dd`` itself is not modified.
    """
    exceed_p95 = dd.copy()
    exceed_p99 = dd.copy()

    for key, ts in dd.items():
        # Calendar columns are identifiers, not measurements
        if key in ('year', 'month', 'day'):
            continue
        # A quantile is only defined for numeric data (e.g. skip a 'date' column)
        if not pd.api.types.is_numeric_dtype(ts):
            continue
        exceed_p95[key] = ts > ts.quantile(0.95)
        exceed_p99[key] = ts > ts.quantile(0.99)

    return exceed_p95, exceed_p99

# Exceedance flags: every column except the calendar ones is replaced by a
# boolean series that is True where the value is above that column's own
# 95th / 99th percentile.
# NOTE(review): `precip` is not defined in the cells visible here; presumably
# it is the precipitation DataFrame. Confirm where it is created.
precip_p95 = precip.copy()
precip_p99 = precip.copy()

# DataFrame.items() replaces DataFrame.iteritems(), which was removed in pandas 2.0
for key, ts in precip.items():
    if key in ['year', 'month', 'day']: continue
    precip_p95[key] = ts > ts.quantile(0.95)
    precip_p99[key] = ts > ts.quantile(0.99)

In [None]:
def split_data(df, yy_train, yy_test, attributes, ylabel):
    """Split the data into a training and a test period by calendar year.

    Parameters
    ----------
    df : pandas.DataFrame
        Data with a ``date`` column. The column may hold ``datetime64``
        values or plain ``datetime.date`` objects.
    yy_train, yy_test : sequence of two ints
        ``(first_year, last_year)`` of each period, both inclusive.
    attributes : list of str
        Names of the predictor (covariate) columns.
    ylabel : str
        Name of the target column.

    Returns
    -------
    tuple
        ``(train_dataset, train_labels, test_dataset, test_labels,
        train_dates, test_dates)``
    """
    # Going through pd.to_datetime makes the year extraction work for object
    # columns of datetime.date values as well; the `.dt` accessor alone
    # raises on those.
    years = pd.to_datetime(df['date']).dt.year

    train_rows = df[(years >= yy_train[0]) & (years <= yy_train[1])]
    test_rows = df[(years >= yy_test[0]) & (years <= yy_test[1])]

    # Dates of each subset
    train_dates = train_rows['date']
    test_dates = test_rows['date']
    # Target variable
    train_labels = train_rows[ylabel].copy()
    test_labels = test_rows[ylabel].copy()
    # Predictors
    train_dataset = train_rows[attributes]
    test_dataset = test_rows[attributes]

    return (train_dataset, train_labels, test_dataset, test_labels, train_dates, test_dates)

In [None]:
def prepareData(train_dataset):
    """Impute and standardise every column of ``train_dataset``.

    Each column goes through a median ``SimpleImputer`` followed by a
    ``StandardScaler``; both are fitted on ``train_dataset`` itself.

    Parameters
    ----------
    train_dataset : pandas.DataFrame
        Predictors; all columns are treated as numeric attributes.

    Returns
    -------
    The output of ``ColumnTransformer.fit_transform`` for ``train_dataset``.

    NOTE(review): the fitted pipeline is discarded, so the statistics learned
    here cannot be re-applied to a test set. Confirm that this is intended.
    """
    # Imported locally because the notebook's import cell does not bring
    # these scikit-learn classes into scope.
    from sklearn.compose import ColumnTransformer
    from sklearn.impute import SimpleImputer
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    # Iterating over a DataFrame yields its column labels
    num_attribs = list(train_dataset)

    num_pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")),
        ('std_scaler', StandardScaler()),
    ])

    full_pipeline = ColumnTransformer([
        ("num", num_pipeline, num_attribs),
    ])

    df_prepared = full_pipeline.fit_transform(train_dataset)
    return df_prepared
    