In [167]:
import numpy as np
%matplotlib inline  
import matplotlib.pyplot as plt  
import pandas as pd
import sklearn.linear_model
import sklearn.preprocessing

In [168]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [169]:
# Tell the notebook to display the result of every expression statement in a
# cell, not only the last one (IPython's default is "last_expr"). This is why
# cells below that call .fit(...) and then name a variable show two outputs.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

Data Import

In [170]:
# Two versions of the joined dataset are loaded. In this notebook:
#   - ndre_data     is only used by the exploratory plots / correlations at the end
#   - ndre_data_sub is the frame the regression models are trained on
ndre_data = pd.read_csv('../data/final_join_subbed_missing_soilM.csv')
ndre_data_sub = pd.read_csv('../data/final_join_subbed_bare_soil_40.csv')

In [171]:
# Columns used for modelling. Order matters: the cells below split the frame
# positionally, taking column 0 as the target and the remaining 15 as predictors.
keep_cols = [
    'avg_soilM',
    'average_adjacent',
    'precip.cm',
    'tair.C',
    'rh.pct',
    'wind_sp.m_per_s',
    'irradiance.w_per_m.2',
]
# sand_1..3, silt_1..3, clay_1..3 (in that order)
keep_cols += [
    '{}_{}'.format(texture, idx)
    for texture in ('sand', 'silt', 'clay')
    for idx in (1, 2, 3)
]

In [172]:
# Restrict the modelling frame to keep_cols; column order follows keep_cols,
# so position 0 is 'avg_soilM' (the target).
dataset = ndre_data_sub[keep_cols]

In [173]:
# Target: first column of `dataset`. Predictors: every remaining column.
# Both are converted to plain NumPy arrays.
y = dataset.iloc[:, 0].values
X = dataset.iloc[:, 1:].values

In [174]:
# Hold out 15% of the rows as a test set. The fixed random_state makes the
# split repeatable across runs.
seed = 7
test_size = 0.15
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

Normalization

In [175]:
# Standardise predictors and target with statistics taken from the TRAINING
# split only, then apply the same shift/scale to the test split.
# NOTE: this cell overwrites X_train/X_test/y_train/y_test in place, so running
# it twice without re-running the split standardises already-standardised data.
mean_X, std_X = X_train.mean(axis=0), X_train.std(axis=0)
X_train = ((X_train - mean_X) / std_X).astype('float32')
X_test = ((X_test - mean_X) / std_X).astype('float32')

# mean_y / std_y are kept so predictions can be mapped back to original units.
mean_y, std_y = y_train.mean(axis=0), y_train.std(axis=0)
y_train = ((y_train - mean_y) / std_y).astype('float32')
y_test = ((y_test - mean_y) / std_y).astype('float32')

In [176]:
# Sanity check of the split sizes: predictors, a blank line, then targets.
print(X_train.shape, X_test.shape, sep='\n')
print('')
print(y_train.shape, y_test.shape, sep='\n')

(317, 15)
(57, 15)

(317,)
(57,)


Ridge model

In [177]:
from sklearn.linear_model import RidgeCV

# Ridge regression with the L2 penalty chosen among five candidate alphas by
# RidgeCV. No intercept is fitted; X and y were centred on training-split
# statistics in the normalisation cell above.
rcv = RidgeCV(alphas=(0.1, 1.0, 10.0, 100, 1000), fit_intercept = False)
rcv.fit(X_train,y_train)
# Fitted coefficients, one per predictor column.
beta_star = rcv.coef_
beta_star


RidgeCV(alphas=(0.1, 1.0, 10.0, 100, 1000), cv=None, fit_intercept=False,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

array([-0.15774445, -0.17805534, -0.1978774 ,  0.28073458,  0.19540679,
       -0.02521539,  0.00349261, -0.06454815,  0.22211722,  0.01454951,
       -0.06173585, -0.02864329, -0.01193507,  0.08305963, -0.0574851 ])

In [178]:
from sklearn import metrics

In [179]:
# Ridge predictions for the held-out rows (still in standardised target units).
y_pred_rcv = rcv.predict(X_test)

MAE Ridge

In [180]:
# Undo the target standardisation so the error is reported in original units.
y_test_inv = mean_y + std_y * y_test
y_pred_inv = mean_y + std_y * y_pred_rcv

In [181]:
# Test-set mean absolute error of the Ridge model, in original target units.
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.036381323515726177

Support Vector Machine for Regression

In [182]:
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV

Linear Kernel

In [183]:
# Linear-kernel support vector regression with hand-picked C.
# epsilon is expressed in standardised target units, because y_train was
# z-scored in the normalisation cell above.
clf = SVR(kernel = 'linear', C=10, epsilon = 0.2)
clf.fit(X_train, y_train)

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [184]:
# Predictions of the fixed-C linear SVR on the held-out rows (standardised units).
y_pred_svr = clf.predict(X_test)

In [185]:
# Back-transform to original units, then report the test-set MAE.
y_test_inv = mean_y + std_y * y_test
y_pred_inv = mean_y + std_y * y_pred_svr
mean_absolute_error(y_true=y_test_inv, y_pred=y_pred_inv)

0.036836827455762673

In [186]:
# Tune C for the linear-kernel SVR by cross-validated grid search.
svr = SVR(kernel = 'linear', epsilon = 0.2)
# Candidate C values: 10^-3 ... 10^2 (six powers of ten).
param_grid = {'C':[10**i for i in range(-3, 3)]}
# No cv / scoring given, so GridSearchCV falls back to its defaults.
clfCV = GridSearchCV(svr, param_grid)
clfCV.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [0.001, 0.01, 0.1, 1, 10, 100]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [187]:
# Predict with the grid-searched linear SVR (standardised target units).
y_pred_svr = clfCV.predict(X_test)

In [188]:
# Back-transform to original units, then report the test-set MAE.
y_test_inv = mean_y + std_y * y_test
y_pred_inv = mean_y + std_y * y_pred_svr
mean_absolute_error(y_true=y_test_inv, y_pred=y_pred_inv)

0.036836827455762673

RBF Kernel

In [135]:
# Tune C for an RBF-kernel SVR by cross-validated grid search.
# Only C is searched; the kernel width (gamma) is left at the SVR default.
svr = SVR(kernel = 'rbf', epsilon = 0.2)
# Candidate C values: 10^-5 ... 10^4 (ten powers of ten).
param_grid = {'C':[10**i for i in range(-5, 5)]}
clfCV = GridSearchCV(svr, param_grid)
clfCV.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'C': [1e-05, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=0)

In [136]:
# Predict with the grid-searched RBF SVR (standardised target units).
y_pred_svr = clfCV.predict(X_test)

In [137]:
# Back-transform to original units, then report the test-set MAE.
y_test_inv = mean_y + std_y * y_test
y_pred_inv = mean_y + std_y * y_pred_svr
mean_absolute_error(y_true=y_test_inv, y_pred=y_pred_inv)

0.029321173974082914

Poly Kernel

In [95]:
# Polynomial-kernel SVR with hand-picked C; the polynomial degree is left at
# the SVR default. epsilon is in standardised target units.
clf = SVR(kernel = 'poly', C=1, epsilon = 0.2)
clf.fit(X_train, y_train)

SVR(C=1, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='poly', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [96]:
# Predictions of the polynomial-kernel SVR on the held-out rows (standardised units).
y_pred_svr = clf.predict(X_test)

In [97]:
# Back-transform to original units, then report the test-set MAE.
y_test_inv = mean_y + std_y * y_test
y_pred_inv = mean_y + std_y * y_pred_svr
mean_absolute_error(y_true=y_test_inv, y_pred=y_pred_inv)

0.032584283989566247

Lasso model

In [138]:
from sklearn.linear_model import LassoCV

# L1-penalised linear regression with the penalty chosen among five candidate
# alphas by cross-validation. No intercept is fitted; X and y were centred on
# training-split statistics in the normalisation cell above.
# NOTE(review): with predictors and target both standardised, any alpha >= 1
# should shrink every coefficient to zero, so 0.1 is effectively the only
# usable candidate in this grid. Check lcv.alpha_ and consider a finer grid
# below 0.1 before relying on the selected features.
lcv = LassoCV(alphas=(0.1, 1.0, 10.0, 100, 1000), fit_intercept = False)
lcv.fit(X_train,y_train)
# Coefficients, one per predictor column; zeros mark predictors Lasso dropped.
beta_lcv = lcv.coef_
beta_lcv

LassoCV(alphas=(0.1, 1.0, 10.0, 100, 1000), copy_X=True, cv=None, eps=0.001,
    fit_intercept=False, max_iter=1000, n_alphas=100, n_jobs=1,
    normalize=False, positive=False, precompute='auto', random_state=None,
    selection='cyclic', tol=0.0001, verbose=False)

array([-0.        , -0.00462254, -0.13206047,  0.08615105,  0.16936393,
        0.        ,  0.        ,  0.        ,  0.05607328, -0.        ,
       -0.0105944 ,  0.        ,  0.        ,  0.        , -0.        ], dtype=float32)

In [139]:
# Lasso predictions for the held-out rows (standardised target units).
y_pred_lcv = lcv.predict(X_test)

MAE Lasso

In [140]:
# Undo the target standardisation so the error is reported in original units.
y_test_inv = mean_y + std_y * y_test
y_pred_inv = mean_y + std_y * y_pred_lcv

In [141]:
# Test-set mean absolute error of the Lasso model, in original target units.
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.03906586

Subsetting data with features from Lasso

In [142]:
# Keep only the predictors that the Lasso fit above left with a non-zero
# coefficient, instead of hard-coding their positions by hand.
# lcv.coef_ is indexed over the predictor columns (dataset columns 1..15), so
# shift by one to get positions in `dataset`, where column 0 is the target.
# For the coefficients displayed above this evaluates to [2, 3, 4, 5, 9, 11].
lasso_cols = np.flatnonzero(lcv.coef_) + 1
if lasso_cols.size == 0:
    # Every coefficient was shrunk to zero: there is nothing to model with.
    raise ValueError("Lasso selected no features; cannot build a reduced design matrix")

X = dataset.iloc[:, lasso_cols]
X = X.values

# Target is unchanged: first column of `dataset`.
y = dataset.iloc[:,0]
y = y.values

In [143]:
# Re-split the reduced feature matrix with the same seed and test fraction as
# the full-feature split above. This overwrites X_train/X_test/y_train/y_test.
seed = 7
test_size = 0.15
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

Normalization

In [144]:
# Standardise the reduced predictors and the target with statistics taken from
# the TRAINING split only, then apply the same shift/scale to the test split.
# NOTE: this cell overwrites its inputs in place, so running it twice without
# re-running the split standardises already-standardised data.
mean_X, std_X = X_train.mean(axis=0), X_train.std(axis=0)
X_train = ((X_train - mean_X) / std_X).astype('float32')
X_test = ((X_test - mean_X) / std_X).astype('float32')

# mean_y / std_y are kept so predictions can be mapped back to original units.
mean_y, std_y = y_train.mean(axis=0), y_train.std(axis=0)
y_train = ((y_train - mean_y) / std_y).astype('float32')
y_test = ((y_test - mean_y) / std_y).astype('float32')

In [145]:
# Sanity check of the split sizes: predictors, a blank line, then targets.
print(X_train.shape, X_test.shape, sep='\n')
print('')
print(y_train.shape, y_test.shape, sep='\n')

(317, 6)
(57, 6)

(317,)
(57,)


Ridge model with reduced features (post Lasso)

In [146]:
from sklearn.linear_model import RidgeCV

# Same RidgeCV setup as for the full feature set, now fitted on the reduced
# predictors. This rebinds `rcv` and `beta_star`.
rcv = RidgeCV(alphas=(0.1, 1.0, 10.0, 100, 1000), fit_intercept = False)
rcv.fit(X_train,y_train)
# Fitted coefficients, one per retained predictor column.
beta_star = rcv.coef_
beta_star


RidgeCV(alphas=(0.1, 1.0, 10.0, 100, 1000), cv=None, fit_intercept=False,
    gcv_mode=None, normalize=False, scoring=None, store_cv_values=False)

array([-0.11989806, -0.20862479,  0.23599008,  0.19058765,  0.18942144,
       -0.11714893])

In [147]:
from sklearn import metrics

In [148]:
# Reduced-feature Ridge predictions for the held-out rows (standardised units).
y_pred_rcv = rcv.predict(X_test)

MAE Ridge

In [149]:
# Undo the target standardisation so the error is reported in original units.
y_test_inv = mean_y + std_y * y_test
y_pred_inv = mean_y + std_y * y_pred_rcv

In [150]:
# Test-set mean absolute error of the reduced-feature Ridge model, in original units.
mean_absolute_error(y_pred=y_pred_inv, y_true=y_test_inv)

0.037880783713376824

Linear Kernel

In [151]:
# Linear-kernel SVR on the reduced predictors with hand-picked C.
# epsilon is in standardised target units.
clf = SVR(kernel = 'linear', C=100, epsilon = 0.2)
clf.fit(X_train, y_train)

SVR(C=100, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='linear', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [152]:
# Predictions of the reduced-feature linear SVR on the held-out rows (standardised units).
y_pred_svr = clf.predict(X_test)

In [153]:
# Back-transform to original units, then report the test-set MAE.
y_test_inv = mean_y + std_y * y_test
y_pred_inv = mean_y + std_y * y_pred_svr
mean_absolute_error(y_true=y_test_inv, y_pred=y_pred_inv)

0.038697225329442274

RBF Kernel

In [154]:
# RBF-kernel SVR on the reduced predictors with hand-picked C.
# epsilon is in standardised target units.
clf = SVR(kernel = 'rbf', C=10, epsilon = 0.2)
clf.fit(X_train, y_train)

SVR(C=10, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [155]:
# Predictions of the reduced-feature RBF SVR on the held-out rows (standardised units).
y_pred_svr = clf.predict(X_test)

In [156]:
# Back-transform to original units, then report the test-set MAE.
y_test_inv = mean_y + std_y * y_test
y_pred_inv = mean_y + std_y * y_pred_svr
mean_absolute_error(y_true=y_test_inv, y_pred=y_pred_inv)

0.029769808291912732

### Exploratory plots and correlations

In [None]:
# Peek at the first five rows (head() defaults to n=5).
ndre_data.head()

In [None]:
# List the available column names before picking variables to plot.
ndre_data.columns

#### Precip

In [None]:
import seaborn as sns
sns.set(color_codes=True)
# Soil moisture against precipitation, with seaborn's fitted regression line.
ax = sns.regplot(data=ndre_data, x="precip.cm", y="avg_soilM")

#### T.Air C

In [None]:
import seaborn as sns
sns.set(color_codes=True)
# Soil moisture against air temperature, with seaborn's fitted regression line.
ax = sns.regplot(data=ndre_data, x="tair.C", y="avg_soilM")

In [None]:
import seaborn as sns
sns.set(color_codes=True)
# Soil moisture against the rh.pct column, with seaborn's fitted regression line.
ax = sns.regplot(data=ndre_data, x="rh.pct", y="avg_soilM")

In [None]:
import seaborn as sns
sns.set(color_codes=True)
# Soil moisture against wind speed, with seaborn's fitted regression line.
ax = sns.regplot(data=ndre_data, x="wind_sp.m_per_s", y="avg_soilM")

In [None]:
# Rank every column by its correlation with the target 'avg_soilM'.
#
# Non-numeric columns are dropped explicitly before calling corr(): older
# pandas skipped them silently, but pandas >= 2.0 raises on them.
# select_dtypes works on both, and keeping bool columns matches the older
# implicit behaviour.
c = ndre_data.select_dtypes(include=['number', 'bool']).corr()
# Flatten the square matrix into a Series keyed by (column, row) pairs ...
s = c.unstack()
# ... ordered by correlation value ...
so = s.sort_values(kind="quicksort")
corrmat = pd.DataFrame(so, columns = ['corr'], index=None)
# ... and keep only the pairs whose first key is the target, strongest
# positive correlation first.
corr_rr = corrmat.loc['avg_soilM'].sort_values(by = 'corr', ascending = False)

In [None]:
# Magnitude of each correlation, so strong negative relationships rank highly too.
corr_rr['corr_abs'] = corr_rr['corr'].abs()

In [None]:
# Show the columns ordered by absolute correlation with the target, largest
# first. The sorted frame is only displayed; corr_rr itself is not reordered.
corr_rr.sort_values(by = 'corr_abs', ascending = False)

QQ Plot

In [None]:
import pylab 
import scipy.stats as stats

# Normal Q-Q plot: sample quantiles against the quantiles of a normal
# distribution.
# NOTE(review): the original cell left `qqdata` unassigned (a syntax error) and
# passed an undefined `measurements` to probplot. The target column is used
# here as the most likely intent; confirm which variable was meant.
# Missing values are dropped first so they do not enter the quantile ordering.
qqdata = ndre_data["avg_soilM"].dropna()
stats.probplot(qqdata, dist="norm", plot=pylab)
pylab.show()
pylab.show()