This notebook tests linear regression models on the merged data set to predict soil moisture.  

Ridge, Lasso, SVR (RBF/Linear Kernels) and Kernel Ridge Regression (RBF/Linear) are test.

In [1]:
import numpy as np
%matplotlib inline  
import matplotlib.pyplot as plt  
import pandas as pd
import sklearn.linear_model
import sklearn.preprocessing

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import time

In [3]:
# Tell the notebook to display all of the results.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Data Import

In [4]:
ndre_data = pd.read_csv('../data/final_join_subbed_missing_soilM.csv')
ndre_data_sub = pd.read_csv('../data/final_join_subbed_bare_soil_40.csv')

In [5]:
ndre_data.columns

Index(['field', 'average_adjacent', 'stdev_adjacent', 'sensor', 'date',
       'sensor_full_name', 'depth_1', 'depth_2', 'depth_3', 'precip.cm',
       'tair.C', 'rh.pct', 'wind_sp.m_per_s', 'irradiance.w_per_m.2', 'year',
       'sand_1', 'sand_2', 'sand_3', 'silt_1', 'silt_2', 'silt_3', 'clay_1',
       'clay_2', 'clay_3', 'avg_soilM'],
      dtype='object')

In [6]:
ndre_data_sub.columns

Index(['field', 'average_adjacent', 'stdev_adjacent', 'sensor', 'date',
       'sensor_full_name', 'depth_1', 'depth_2', 'depth_3', 'precip.cm',
       'tair.C', 'rh.pct', 'wind_sp.m_per_s', 'irradiance.w_per_m.2', 'year',
       'sand_1', 'sand_2', 'sand_3', 'silt_1', 'silt_2', 'silt_3', 'clay_1',
       'clay_2', 'clay_3', 'avg_soilM'],
      dtype='object')

In [7]:
keep_cols = ['avg_soilM', 'average_adjacent', 'precip.cm', 'tair.C', 'rh.pct', 'wind_sp.m_per_s', 
             'irradiance.w_per_m.2', 'sand_1', 'sand_2', 'sand_3', 'silt_1', 'silt_2', 'silt_3', 'clay_1', 
             'clay_2', 'clay_3']

In [8]:
dataset = ndre_data_sub[keep_cols]

In [9]:
dataset.columns.shape

(16,)

In [10]:
X = dataset.iloc[:,1:]
X = X.values

y = dataset.iloc[:,0]
y = y.values

In [11]:
# split data into train and test sets
seed = 7
test_size = 0.15
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

Normalization

In [12]:
mean_X = X_train.mean(axis=0)
std_X = X_train.std(axis=0)

X_train = (X_train - mean_X) / std_X
X_test = (X_test - mean_X) / std_X

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

mean_y = y_train.mean(axis=0)
std_y = y_train.std(axis=0)

y_train = (y_train - mean_y) / std_y
y_test = (y_test - mean_y) / std_y

y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

In [13]:
print(X_train.shape)
print(X_test.shape)
print('')
print(y_train.shape)
print(y_test.shape)

(317, 15)
(57, 15)

(317,)
(57,)


Ridge model

In [14]:
from sklearn.linear_model import RidgeCV

rcv = RidgeCV(alphas=(0.1, 1.0, 10.0, 100, 1000), fit_intercept = False)
rcv.fit(X_train,y_train)
beta_star = rcv.coef_
beta_star


RidgeCV(alphas=(0.1, 1.0, 10.0, 100, 1000), cv=None, fit_intercept=False,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

array([-0.15774445, -0.17805534, -0.1978774 ,  0.28073458,  0.19540679,
       -0.02521539,  0.00349261, -0.06454815,  0.22211722,  0.01454951,
       -0.06173585, -0.02864329, -0.01193507,  0.08305963, -0.0574851 ])

In [15]:
from sklearn import metrics

In [16]:
y_pred_rcv = rcv.predict(X_test)

MAE Ridge

In [17]:
y_pred_inv = (y_pred_rcv * std_y) + mean_y
y_test_inv = (y_test * std_y) + mean_y

In [18]:
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.036381323515726177

Support Vector Machine for Regression

In [19]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

Linear Kernel

Method 1

Linear SVR

In [26]:
svrl = GridSearchCV(SVR(kernel='linear', gamma=0.1), cv=5,
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "gamma": np.logspace(-2, 2, 5)})

In [29]:
t0 = time.time()
svrl.fit(X_train, y_train)
svrl_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s"
      % svrl_fit)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'gamma': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

SVR complexity and bandwidth selected and model fitted in 168.538 s


In [30]:
t0 = time.time()
y_svrl = svrl.predict(X_test)
svrl_predict = time.time() - t0

In [31]:
y_pred_inv = (y_svrl * std_y) + mean_y
y_test_inv = (y_test * std_y) + mean_y
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.037499005560150997

Gaussian SVR

In [32]:
svrg = GridSearchCV(SVR(kernel='rbf', gamma=0.1, epsilon = 0.1), cv=5,
                   param_grid={"C": [1e0, 1e1, 1e2, 1e3],
                               "gamma": np.logspace(-2, 2, 5)})

In [33]:
t0 = time.time()
svrg.fit(X_train, y_train)
svrg_fit = time.time() - t0
print("SVR complexity and bandwidth selected and model fitted in %.3f s"
      % svrg_fit)

GridSearchCV(cv=5, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.1,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1.0, 10.0, 100.0, 1000.0], 'gamma': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

SVR complexity and bandwidth selected and model fitted in 1.754 s


In [34]:
t0 = time.time()
y_svrg = svrg.predict(X_test)
svrg_predict = time.time() - t0

In [35]:
y_pred_inv = (y_svrg * std_y) + mean_y
y_test_inv = (y_test * std_y) + mean_y
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.028481344069534505

#### Kernel Ridge Regression

In [61]:
from sklearn.kernel_ridge import KernelRidge
import time

Gaussian RBF w/ KRR

In [62]:
kr = GridSearchCV(KernelRidge(kernel='rbf', gamma=0.1), cv=5,
                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                              "gamma": np.logspace(-2, 2, 5)})

In [63]:
t0 = time.time()
kr.fit(X_train, y_train)
kr_fit = time.time() - t0
print("KRR complexity and bandwidth selected and model fitted in %.3f s"
      % kr_fit)

GridSearchCV(cv=5, error_score='raise',
       estimator=KernelRidge(alpha=1, coef0=1, degree=3, gamma=0.1, kernel='rbf',
      kernel_params=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [1.0, 0.1, 0.01, 0.001], 'gamma': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

KRR complexity and bandwidth selected and model fitted in 0.796 s


In [64]:
t0 = time.time()
y_kr = kr.predict(X_test)
kr_predict = time.time() - t0

In [65]:
y_pred_inv = (y_kr * std_y) + mean_y
y_test_inv = (y_test * std_y) + mean_y
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.026006548

Linear KRR

In [66]:
krl = GridSearchCV(KernelRidge(kernel='linear', gamma=0.1), cv=5,
                  param_grid={"alpha": [1e0, 0.1, 1e-2, 1e-3],
                              "gamma": np.logspace(-2, 2, 5)})

In [67]:
t0 = time.time()
krl.fit(X_train, y_train)
krl_fit = time.time() - t0
print("KRR complexity and bandwidth selected and model fitted in %.3f s"
      % krl_fit)

GridSearchCV(cv=5, error_score='raise',
       estimator=KernelRidge(alpha=1, coef0=1, degree=3, gamma=0.1, kernel='linear',
      kernel_params=None),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'alpha': [1.0, 0.1, 0.01, 0.001], 'gamma': array([  1.00000e-02,   1.00000e-01,   1.00000e+00,   1.00000e+01,
         1.00000e+02])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

KRR complexity and bandwidth selected and model fitted in 0.289 s


In [68]:
t0 = time.time()
y_krl = krl.predict(X_test)
krl_predict = time.time() - t0

In [69]:
y_pred_inv = (y_krl * std_y) + mean_y
y_test_inv = (y_test * std_y) + mean_y
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.036473174

### Lasso model

In [70]:
dataset.columns

Index(['avg_soilM', 'average_adjacent', 'precip.cm', 'tair.C', 'rh.pct',
       'wind_sp.m_per_s', 'irradiance.w_per_m.2', 'sand_1', 'sand_2', 'sand_3',
       'silt_1', 'silt_2', 'silt_3', 'clay_1', 'clay_2', 'clay_3'],
      dtype='object')

In [71]:
from sklearn.linear_model import LassoCV

lcv = LassoCV(alphas=(0.1, 1.0, 10.0, 100, 1000), fit_intercept = False)
lcv.fit(X_train,y_train)
beta_lcv = lcv.coef_
beta_lcv

LassoCV(alphas=(0.1, 1.0, 10.0, 100, 1000), copy_X=True, cv=None, eps=0.001,
    fit_intercept=False, max_iter=1000, n_alphas=100, n_jobs=1,
    normalize=False, positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

array([-0.        , -0.00462254, -0.13206047,  0.08615105,  0.16936393,
        0.        ,  0.        ,  0.        ,  0.05607328, -0.        ,
       -0.0105944 ,  0.        ,  0.        ,  0.        , -0.        ], dtype=float32)

In [72]:
y_pred_lcv = lcv.predict(X_test)

In [73]:
lasso_coef = pd.DataFrame()
lasso_coef['feature'] = dataset.columns[1:]
lasso_coef['lasso coef'] = beta_lcv
lasso_coef['ridge coef'] = beta_star
lasso_coef

Unnamed: 0,feature,lasso coef,ridge coef
0,average_adjacent,-0.0,-0.157744
1,precip.cm,-0.004623,-0.178055
2,tair.C,-0.13206,-0.197877
3,rh.pct,0.086151,0.280735
4,wind_sp.m_per_s,0.169364,0.195407
5,irradiance.w_per_m.2,0.0,-0.025215
6,sand_1,0.0,0.003493
7,sand_2,0.0,-0.064548
8,sand_3,0.056073,0.222117
9,silt_1,-0.0,0.01455


MAE Lasso

In [74]:
y_pred_inv = (y_pred_lcv * std_y) + mean_y
y_test_inv = (y_test * std_y) + mean_y

In [75]:
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.03906586

Subsetting data with features from Lasso

In [76]:
X = dataset.iloc[:,[2,3,4,5,9,11]]
X = X.values

y = dataset.iloc[:,0]
y = y.values

In [77]:
# split data into train and test sets
seed = 7
test_size = 0.15
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

Normalization

In [78]:
mean_X = X_train.mean(axis=0)
std_X = X_train.std(axis=0)

X_train = (X_train - mean_X) / std_X
X_test = (X_test - mean_X) / std_X

X_train = X_train.astype('float32')
X_test = X_test.astype('float32')

mean_y = y_train.mean(axis=0)
std_y = y_train.std(axis=0)

y_train = (y_train - mean_y) / std_y
y_test = (y_test - mean_y) / std_y

y_train = y_train.astype('float32')
y_test = y_test.astype('float32')

In [79]:
print(X_train.shape)
print(X_test.shape)
print('')
print(y_train.shape)
print(y_test.shape)

(317, 6)
(57, 6)

(317,)
(57,)


Ridge model with reduced features (post Lasso)

In [80]:
from sklearn.linear_model import RidgeCV

rcv = RidgeCV(alphas=(0.1, 1.0, 10.0, 100, 1000), fit_intercept = False)
rcv.fit(X_train,y_train)
beta_star = rcv.coef_
beta_star


RidgeCV(alphas=(0.1, 1.0, 10.0, 100, 1000), cv=None, fit_intercept=False,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

array([-0.11989806, -0.20862479,  0.23599008,  0.19058765,  0.18942144,
       -0.11714893])

In [81]:
from sklearn import metrics

In [82]:
y_pred_rcv = rcv.predict(X_test)

MAE Ridge

In [83]:
y_pred_inv = (y_pred_rcv * std_y) + mean_y
y_test_inv = (y_test * std_y) + mean_y

In [84]:
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.037880783713376824