In [1]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.model_selection import KFold
from sklearn.cluster import KMeans
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.losses import mean_squared_error



In [2]:
data = pd.read_csv("../../Downloads/AOD_series_IPW.csv")
data

Unnamed: 0,cell,week,aod,elev,temp,mslp
0,1,Wk_1_2014,,,,
1,1,Wk_2_2014,,,,
2,1,Wk_3_2014,,,,
3,1,Wk_4_2014,,,,
4,1,Wk_5_2014,,,,
...,...,...,...,...,...,...
15575995,59000,Wk_49_2018,,,,
15575996,59000,Wk_50_2018,,,,
15575997,59000,Wk_51_2018,,,,
15575998,59000,Wk_52_2018,,,,


In [3]:
data_cull = pd.DataFrame(data[~np.isnan(data["temp"])], copy=True)

In [4]:
data_cull["exists"] = -np.isnan(data_cull["aod"]).astype(int) + 1
data_cull

Unnamed: 0,cell,week,aod,elev,temp,mslp,exists
111408,423,Wk_1_2014,,265.192496,10.066939,1022.410950,0
111409,423,Wk_2_2014,,265.192496,10.081455,1018.204102,0
111410,423,Wk_3_2014,,265.192496,21.171682,1007.017456,0
111411,423,Wk_4_2014,,265.192496,8.754344,1015.050781,0
111412,423,Wk_5_2014,,265.192496,8.272011,1019.083618,0
...,...,...,...,...,...,...,...
15561475,58945,Wk_49_2018,,223.712208,33.953037,1010.413391,0
15561476,58945,Wk_50_2018,,223.712208,27.300304,1025.005981,0
15561477,58945,Wk_51_2018,,223.712208,34.047974,1019.825012,0
15561478,58945,Wk_52_2018,,223.712208,35.349667,1014.067566,0


In [5]:
X = data_cull[["elev", "temp", "mslp"]]
y = data_cull["exists"]

In [6]:
ipw_model = LogisticRegression(random_state = 1312).fit(X, y)

In [7]:
mean_absolute_error(y, ipw_model.predict_proba(X)[:, 1])

0.3829273097032682

In [8]:
ipw_model.classes_

array([0, 1], dtype=int64)

In [9]:
ipw = 1 / ipw_model.predict_proba(X)[:, 1]
ipw

array([5.32604128, 5.29728399, 3.4146381 , ..., 2.18013573, 2.09603664,
       2.17449365])

Now, adding a model that considers week.

In [8]:
week = np.char.partition(np.char.partition(np.array(data_cull["week"], str), sep="_")[:, 2], sep="_")[:, 0]
year = np.char.partition(np.char.partition(np.array(data_cull["week"], str), sep="_")[:, 2], sep="_")[:, 2]
year = year.astype(int) - 2000

In [11]:
X = pd.concat([data_cull[["elev", "temp", "mslp"]].reset_index(drop=True), pd.Series(week)], axis=1)
y = data_cull["exists"]

In [12]:
X.index = data_cull.index

In [13]:
ipw_model = LogisticRegression(random_state = 1312).fit(X, y)

In [14]:
mean_absolute_error(y, ipw_model.predict_proba(X)[:, 1])

0.38244091297258237

Results are slightly better; as this model is in line with the literature we'll keep it regardless.

In [15]:
ipw_model.classes_

array([0, 1], dtype=int64)

In [16]:
ipw = 1 / ipw_model.predict_proba(X)[:, 1]
ipw

array([5.04411525, 5.03525883, 3.24770256, ..., 2.32838869, 2.23740766,
       2.33302117])

For now, we need this model to be monthly.

In [9]:
data_cull["month"] = (week.astype(int) / (53 / 12)).astype(int).astype(str)
data_cull["year"] = year
data_month = data_cull.groupby(["month", "year", "cell"]).mean().reset_index()
data_month["exists"] = -np.isnan(data_month["aod"]).astype(int) + 1
data_month

Unnamed: 0,month,year,cell,aod,elev,temp,mslp,exists
0,0,14,423,,265.192496,12.518605,1015.670822,0
1,0,14,424,,268.061731,12.529349,1015.670273,0
2,0,14,425,,267.379445,12.539673,1015.669708,0
3,0,14,426,,267.551459,12.549627,1015.669189,0
4,0,14,427,,273.196258,12.559257,1015.668625,0
...,...,...,...,...,...,...,...,...
1992505,9,18,58941,0.180513,227.276341,54.864787,1018.436792,1
1992506,9,18,58942,0.214781,227.543436,54.865610,1018.436548,1
1992507,9,18,58943,0.142192,226.309607,54.866460,1018.436328,1
1992508,9,18,58944,0.151084,224.323911,54.867332,1018.436084,1


In [10]:
X = data_month[["elev", "temp", "mslp", "month"]]
y = data_month["exists"]

In [11]:
ipw_model = LogisticRegression(random_state = 1312).fit(X, y)

In [12]:
mean_absolute_error(y, ipw_model.predict_proba(X)[:, 1])

0.18401509148112435

Now, we'll pull in data for the old neural net, and create IPW weights.

In [46]:
df = pd.read_csv("../../Downloads/first_stage_20210601.csv")
df

Unnamed: 0,Raster.Cell,Sen2.5,month,year,AOD,Temp,Elev,MSLP,Vsby,WdVl,...,LC_LowDev,LC_HighDev,PS,relh,Popd,x,y,primary,secondary,motorway
0,3798,14.300000,3,14,0.102649,27.214394,202.651108,1018.851624,8.472521,8.406699,...,0.247,8.406699,13.146420,68.591485,0.002802,-87.914368,43.056945,83.388515,8613.328516,1269.449331
1,3798,6.800000,4,14,0.062412,42.552032,202.651108,1014.221252,8.791099,8.715958,...,0.247,8.715958,13.146420,67.143758,0.002802,-87.914368,43.056945,83.388515,8613.328516,1269.449331
2,3798,5.600000,5,14,0.181493,55.138329,202.651108,1015.420776,9.179679,7.012032,...,0.247,7.012032,13.146420,68.574740,0.002802,-87.914368,43.056945,83.388515,8613.328516,1269.449331
3,3798,7.571429,6,14,0.165752,63.757431,202.651108,1013.547974,7.342515,6.381063,...,0.247,6.381063,13.146420,74.554604,0.002802,-87.914368,43.056945,83.388515,8613.328516,1269.449331
4,3798,8.471429,7,14,0.299681,66.077019,202.651108,1015.308838,9.463487,6.096744,...,0.247,6.096744,13.146420,69.644172,0.002802,-87.914368,43.056945,83.388515,8613.328516,1269.449331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1490,46895,6.649239,6,18,0.150367,71.891273,180.495621,1012.823486,8.968293,5.528109,...,0.000,5.528109,9.735624,77.871421,0.000037,-88.185768,41.224345,0.000000,0.000000,0.000000
1491,46895,7.206279,7,18,0.203455,73.688965,180.495621,1017.014465,9.619825,4.504867,...,0.000,4.504867,9.735624,74.239351,0.000037,-88.185768,41.224345,0.000000,0.000000,0.000000
1492,46895,9.570144,8,18,0.287290,73.940506,180.495621,1015.084167,9.030977,4.548747,...,0.000,4.548747,9.735624,77.673073,0.000037,-88.185768,41.224345,0.000000,0.000000,0.000000
1493,46895,5.248297,10,18,0.102493,52.086712,180.495621,1017.470093,9.208525,6.414171,...,0.000,6.414171,9.735624,75.775710,0.000037,-88.185768,41.224345,0.000000,0.000000,0.000000


In [14]:
# Remove the outliers
df=df[df['Sen2.5']<20].reset_index()
df = df.drop(["index"],axis=1)

In [47]:
ipw_weights = 1 / ipw_model.predict_proba(df[["Elev", "Temp", "MSLP", "month"]])[:, 1]
ipw_weights

array([1.50436348, 1.06891261, 1.01377333, ..., 1.00125491, 1.02824869,
       1.44869803])

Now, replicate the scikit-learn MLPRegressor framework in keras.

In [48]:
# transform data as in original nn
df2 = df.join(pd.get_dummies(df['month'],drop_first=True)).join(pd.get_dummies(df['year'],drop_first=True))
df2 = df2.drop(["month","year"], axis=1)

In [24]:
stopper = EarlyStopping(monitor="loss", patience=10, restore_best_weights=True)

In [160]:
mlp = Sequential()
mlp.add(Dense(10, activation="relu"))
mlp.add(Dense(10, activation="relu"))
mlp.add(Dense(1))
# note: batches default to 200

In [161]:
mlp.compile(loss="mean_squared_error", optimizer="adam")
mlp.fit(df2.drop('Sen2.5', axis=1).drop(["Raster.Cell", "x", "y"], axis=1), df2['Sen2.5'], epochs=2000, batch_size=200, callbacks=[stopper])

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

<tensorflow.python.keras.callbacks.History at 0x7ff3f986a640>

Fitting seems reasonable; some changes, like removing outliers, scaling variables are missing, so needs work yet to match the base model. We will do that work as we implement the IPW loss function.

In [18]:
def weighted_mse(y_true, y_pred, weight):
    return weight * mean_squared_error(y_true, y_pred)

In [19]:
def get_model():
    """
    resets the neural net model for each fold
    """
    # input tensors; the last 2 are psuedo-inputs to use for the loss
    mlp = Input(shape=(31,))
    y_true = Input(shape=(1,))
    weight = Input(shape=(1,))
    
    # here we use the Model framework, instead of Sequential, as the model has multiple pseudo-inputs
    x = Dense(10, activation="relu")(mlp)
    x = Dense(10, activation="relu")(x)
    y_pred = Dense(1)(mlp)
    model = Model(inputs=[mlp, y_true, weight], outputs=y_pred)
    model.add_loss(weighted_mse(y_true, y_pred, weight))
    model.compile(loss=None, optimizer="adam")
    return model

In [194]:
model = get_model()
stopper = EarlyStopping(monitor="loss", patience=10, restore_best_weights=True)
model.fit([df2.drop('Sen2.5', axis=1).drop(["Raster.Cell", "x", "y"], axis=1), df2['Sen2.5'], ipw_weights], epochs=2000, batch_size=200, callbacks=[stopper])

Epoch 1/2000
Epoch 2/2000
Epoch 3/2000
Epoch 4/2000
Epoch 5/2000
Epoch 6/2000
Epoch 7/2000
Epoch 8/2000
Epoch 9/2000
Epoch 10/2000
Epoch 11/2000
Epoch 12/2000
Epoch 13/2000
Epoch 14/2000
Epoch 15/2000
Epoch 16/2000
Epoch 17/2000
Epoch 18/2000
Epoch 19/2000
Epoch 20/2000
Epoch 21/2000
Epoch 22/2000
Epoch 23/2000
Epoch 24/2000
Epoch 25/2000
Epoch 26/2000
Epoch 27/2000
Epoch 28/2000
Epoch 29/2000
Epoch 30/2000
Epoch 31/2000
Epoch 32/2000
Epoch 33/2000
Epoch 34/2000
Epoch 35/2000
Epoch 36/2000
Epoch 37/2000
Epoch 38/2000
Epoch 39/2000
Epoch 40/2000
Epoch 41/2000
Epoch 42/2000
Epoch 43/2000
Epoch 44/2000
Epoch 45/2000
Epoch 46/2000
Epoch 47/2000
Epoch 48/2000
Epoch 49/2000
Epoch 50/2000
Epoch 51/2000
Epoch 52/2000
Epoch 53/2000
Epoch 54/2000
Epoch 55/2000
Epoch 56/2000
Epoch 57/2000
Epoch 58/2000
Epoch 59/2000
Epoch 60/2000
Epoch 61/2000
Epoch 62/2000
Epoch 63/2000
Epoch 64/2000
Epoch 65/2000
Epoch 66/2000
Epoch 67/2000
Epoch 68/2000
Epoch 69/2000
Epoch 70/2000
Epoch 71/2000
Epoch 72/2000
E

<tensorflow.python.keras.callbacks.History at 0x7ff39cd9fca0>

In [195]:
# this model copies weights and removes the other 2 inputs for easier use
# as shown below, this does not appear to actually work??
pred_model = Model(inputs=mlp, outputs=y_pred)

In [196]:
pred_model.predict(df2.drop('Sen2.5', axis=1).drop(["Raster.Cell", "x", "y"], axis=1))

array([[2389.9194  ],
       [2389.7512  ],
       [2389.8975  ],
       ...,
       [  98.87438 ],
       [  98.93127 ],
       [  99.875916]], dtype=float32)

In [197]:
model.predict([df2.drop('Sen2.5', axis=1).drop(["Raster.Cell", "x", "y"], axis=1), df2['Sen2.5'], ipw_weights])

array([[13.507674],
       [12.085395],
       [10.660815],
       ...,
       [ 9.463694],
       [12.019959],
       [10.612422]], dtype=float32)

Let's put it all together.

In [49]:
#Train the neural network model
# random CV
Xvars = df2.drop('Sen2.5', axis=1).drop(["Raster.Cell", "x", "y"], axis=1)
yvars = df2['Sen2.5']
weights = pd.Series(ipw_weights)
weights.index = Xvars.index
model_weights = []

kf = KFold(n_splits=10, random_state=10, shuffle=True)

MSE_vec_kf = np.zeros(10)
r2_vec_kf_train =  np.zeros(10)
r2_vec_kf = np.zeros(10)

k_ind = int(0)
for train_index, test_index in kf.split(Xvars):
    X_train, X_test = Xvars.loc[train_index], Xvars.loc[test_index]
    y_train, y_test = yvars.loc[train_index], yvars.loc[test_index]
    w_train, w_test = weights.loc[train_index], weights.loc[test_index]

    # Standarize the input variables
    scaler = MinMaxScaler()
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    
    # split again for validation data
    X_t, X_val, y_t, y_val, w_t, w_val = train_test_split(X_train, y_train, w_train)
    
    stopper = EarlyStopping(patience=10, restore_best_weights=True)
    mlp = get_model()
    # TODO: need to work this; stopper is too greedy/training too volatile
    # TODO: also consider transform, inverse-transform for y vars
    mlp.fit([X_t, y_t, w_t], validation_data=[X_val, y_val, w_val], epochs=2000, batch_size=200, callbacks=[stopper], verbose=0)
    y_pred_train = mlp.predict([X_train, y_train, w_train])
    y_pred = mlp.predict([X_test, y_test, w_test]).reshape(len(test_index))
    r2_vec_kf_train[k_ind] =  r2_score(y_train, y_pred_train)
    r2_vec_kf[k_ind] = r2_score(y_test, y_pred)
    MSE_vec_kf[k_ind] = ((y_test - y_pred) ** 2).mean()
    print('MSE for test set', k_ind, ' is', MSE_vec_kf[k_ind])
    print('r2 for test set', k_ind, ' is', r2_vec_kf[k_ind])
    k_ind += 1
    model_weights.append(mlp.get_weights())

MSE_kf = MSE_vec_kf.mean()
MSE_kf_std = MSE_vec_kf.std()
print('test estimate MSE k-fold=', MSE_kf,
      'test estimate MSE standard err=', MSE_kf_std)

r2_kf =r2_vec_kf.mean()
r2_kf_std = r2_vec_kf.std()
print('test estimate r2 k-fold=', r2_kf,
      'test estimate r2 standard err=', r2_kf_std)

r2_kf_train =r2_vec_kf_train.mean()
r2_kf_std_train = r2_vec_kf_train.std()
print('train set r2 k-fold=', r2_kf_train,
      'train set r2 standard err=', r2_kf_std_train)

MSE for test set 0  is 4.97306749010275
r2 for test set 0  is 0.44555728404877404
MSE for test set 1  is 6.719892444654097
r2 for test set 1  is 0.4204209414719755
MSE for test set 2  is 18.473919086310914
r2 for test set 2  is 0.21748450141151243
MSE for test set 3  is 6.953672565375391
r2 for test set 3  is 0.37752298498041137
MSE for test set 4  is 3.1921411571914615
r2 for test set 4  is 0.3747639556506792
MSE for test set 5  is 3.1377784409639458
r2 for test set 5  is 0.41296982295697404
MSE for test set 6  is 3.2579540564146687
r2 for test set 6  is 0.3595505906255253
MSE for test set 7  is 3.8944088601121685
r2 for test set 7  is 0.24748725774273883
MSE for test set 8  is 4.219436028704856
r2 for test set 8  is 0.3554339863526096
MSE for test set 9  is 3.238709277780715
r2 for test set 9  is 0.4165866925696836
test estimate MSE k-fold= 5.806097940761097 test estimate MSE standard err= 4.433157477549257
test estimate r2 k-fold= 0.36277780178108837 test estimate r2 standard err= 0

In [50]:
# average the weights of the 10 models
mean_weights = [np.full(model_weights[0][0].shape, 0.0), np.full(model_weights[0][1].shape, 0.0)]
for weight in model_weights:
    for i in range(len(weight)):
        mean_weights[i] += weight[i] / len(model_weights)

In [51]:
model_mean = get_model()
model_mean.set_weights(mean_weights)

In [52]:
y_pred = model_mean.predict([scaler.transform(Xvars), yvars, weights]).reshape(len(yvars))
y_test = yvars
print(r2_score(y_test, y_pred))
print(((y_test - y_pred) ** 2).mean())

0.37672402177127207
5.549890946624801


In [53]:
stats = pd.DataFrame(np.array(["MSE", ((y_test - y_pred) ** 2).mean(), MSE_kf_std, "R2", r2_score(y_test, y_pred), r2_kf_std]).reshape(2, 3),\
            columns=["statistic", "mean", "sd"])
stats

Unnamed: 0,statistic,mean,sd
0,MSE,5.549890946624801,4.433157477549257
1,R2,0.376724021771272,0.0710051388714566


In [55]:
stats.to_csv("../../Downloads/nn_outlier_stats_ipw.csv")

### Spatial CV version

In [37]:
kMeansVars = df2.drop('Sen2.5', axis=1)
Xvars = df2.drop('Sen2.5', axis=1).drop(["Raster.Cell", "x", "y"], axis=1)
yvars = df2['Sen2.5']

In [38]:
kmeans = KMeans(10).fit(np.array(kMeansVars[["x", "y"]]).reshape(-1, 2))

In [39]:
# check fold sizes
fold_sizes = np.zeros(10)
for fold in np.unique(kmeans.labels_):
    fold_sizes[fold] = len(np.unique(df2[kmeans.labels_ == fold]["Raster.Cell"]))
    if (len(np.unique(df2[kmeans.labels_ == fold]["Raster.Cell"])) == 1):
        print(len(np.unique(df2[kmeans.labels_ == fold]["Raster.Cell"])), df2[kmeans.labels_ == fold]["Raster.Cell"].iloc[0])
        continue
    print(len(np.unique(df2[kmeans.labels_ == fold]["Raster.Cell"])))

8
1 16743
6
1 4557
3
5
1 35637
1 46895
2
4


In [40]:
#Train the neural network model

MSE_vec_kf = np.zeros(len(np.unique(df2["Raster.Cell"])))
r2_vec_kf_train =  np.zeros(len(np.unique(df2["Raster.Cell"])))
r2_vec_kf = np.zeros(len(np.unique(df2["Raster.Cell"])))

k_ind = int(0)
for fold in np.unique(kmeans.labels_):
    X_train, X_test = Xvars.loc[kmeans.labels_ != fold], Xvars.loc[kmeans.labels_ == fold]
    y_train, y_test = yvars.loc[kmeans.labels_ != fold], yvars.loc[kmeans.labels_ == fold]
    w_train, w_test = weights.loc[kmeans.labels_ != fold], weights.loc[kmeans.labels_ == fold]

    # Standarize the input variables
    scaler = MinMaxScaler()
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # split again for validation data
    X_t, X_val, y_t, y_val, w_t, w_val = train_test_split(X_train, y_train, w_train)
    
    stopper = EarlyStopping(patience=10, restore_best_weights=True)
    mlp = get_model()
    # TODO: need to work this; stopper is too greedy/training too volatile
    # TODO: also consider transform, inverse-transform for y vars
    mlp.fit([X_t, y_t, w_t], validation_data=[X_val, y_val, w_val], epochs=2000, batch_size=200, callbacks=[stopper], verbose=0)
    y_pred_train = mlp.predict([X_train, y_train, w_train])
    y_pred = mlp.predict([X_test, y_test, w_test]).reshape((kmeans.labels_ == fold).sum())
    r2_train =  r2_score(y_train, y_pred_train)
    r2 = r2_score(y_test, y_pred)
    MSE = ((y_test - y_pred) ** 2).mean()
    print('MSE for test set', fold, ' is', MSE)
    print('r2 for test set', fold, ' is', r2)
    
    # repeats the test statistic values to weight by fold size
    for idx in range(int(fold_sizes[fold])):
        MSE_vec_kf[idx + k_ind] = MSE
        r2_vec_kf[idx + k_ind] = r2
        r2_vec_kf_train[idx + k_ind] = r2_train
    
    k_ind += int(fold_sizes[fold])
    
MSE_kf = MSE_vec_kf.mean()
MSE_kf_std = MSE_vec_kf.std()
print('test estimate MSE k-fold=', MSE_kf,
      'test estimate MSE standard err=', MSE_kf_std)

r2_kf =r2_vec_kf.mean()
r2_kf_std = r2_vec_kf.std()
print('test estimate r2 k-fold=', r2_kf,
      'test estimate r2 standard err=', r2_kf_std)

r2_kf_train =r2_vec_kf_train.mean()
r2_kf_std_train = r2_vec_kf_train.std()
print('train set r2 k-fold=', r2_kf_train,
      'train set r2 standard err=', r2_kf_std_train)

MSE for test set 0  is 7.2280016030554055
r2 for test set 0  is -0.3047400291194615
MSE for test set 1  is 2.0205824601539257
r2 for test set 1  is 0.4602967908121287
MSE for test set 2  is 4.855087012000389
r2 for test set 2  is 0.15528351697640796
MSE for test set 3  is 4.9928763361146355
r2 for test set 3  is -0.1732171682799104
MSE for test set 4  is 2.6741662972756965
r2 for test set 4  is 0.3873175491213702
MSE for test set 5  is 2.701626809709115
r2 for test set 5  is 0.3800076999049452
MSE for test set 6  is 1.9144212973830554
r2 for test set 6  is 0.46122937513707374
MSE for test set 7  is 3.8924698342915405
r2 for test set 7  is -0.058392437697892374
MSE for test set 8  is 3.211952716515134
r2 for test set 8  is 0.4776645911028863
MSE for test set 9  is 6.179079172750569
r2 for test set 9  is -0.4454307635823993
test estimate MSE k-fold= 4.763929371524811 test estimate MSE standard err= 1.8731259658433548
test estimate r2 k-fold= 0.04435295948872399 test estimate r2 standard 

In [41]:
# average the weights of the 10 models
mean_weights = [np.full(model_weights[0][0].shape, 0.0), np.full(model_weights[0][1].shape, 0.0)]
for weight in model_weights:
    for i in range(len(weight)):
        mean_weights[i] += weight[i] / len(model_weights)

In [42]:
model_mean = get_model()
model_mean.set_weights(mean_weights)

In [43]:
y_pred = model_mean.predict([scaler.transform(Xvars), yvars, weights]).reshape(len(yvars))
y_test = yvars
print(r2_score(y_test, y_pred))
print(((y_test - y_pred) ** 2).mean())

0.45900755157054074
2.8947936433060732


In [44]:
stats = pd.DataFrame(np.array(["MSE", ((y_test - y_pred) ** 2).mean(), MSE_kf_std, "R2", r2_score(y_test, y_pred), r2_kf_std]).reshape(2, 3),\
            columns=["statistic", "mean", "sd"])
stats

Unnamed: 0,statistic,mean,sd
0,MSE,2.894793643306073,1.8731259658433548
1,R2,0.4590075515705407,0.3402533255587575


In [45]:
stats.to_csv("nn_spatialcv_stats_ipw.csv")

### Stage 2

In [112]:
# Read in the second stage data
# df_second = pd.read_csv("NN_second_stage.csv")
df_second = pd.read_csv("../../Downloads/second_stage_20210601.csv")
df_second.head()

Unnamed: 0,Raster.Cell,Sen2.5,month,year,AOD,Temp,Elev,MSLP,Vsby,WdVl,...,LC_LowDev,LC_HighDev,PS,relh,Popd,x,y,primary,secondary,motorway
0,610,,4,14,0.1652,42.3448,266.5581,1014.2213,8.9448,8.2791,...,0.0,8.2791,10.5829,68.6514,0.0,-88.528,43.1902,0.0,0.0,0.0
1,610,,5,14,0.1203,55.062,266.5581,1015.4208,9.2196,6.6575,...,0.0,6.6575,10.5829,70.345,0.0,-88.528,43.1902,0.0,0.0,0.0
2,610,,6,14,0.1341,63.9627,266.5581,1013.548,7.6623,6.0854,...,0.0,6.0854,10.5829,76.1868,0.0,-88.528,43.1902,0.0,0.0,0.0
3,610,,7,14,0.4217,65.6833,266.5581,1015.3088,9.5372,5.7522,...,0.0,5.7522,10.5829,72.9863,0.0,-88.528,43.1902,0.0,0.0,0.0
4,610,,8,14,0.2284,67.5053,266.5581,1015.605,8.5467,4.4955,...,0.0,4.4955,10.5829,80.3597,0.0,-88.528,43.1902,0.0,0.0,0.0


In [113]:
# Drop the cells vith NA values
df_second=df_second[(~df_second['NDVI'].isna())& (~df_second['Popd'].isna())]
df_second = df_second.drop("Sen2.5", axis=1)
df_second.shape

(1536690, 21)

In [114]:
# generate IPW weights for 2nd stage
ipw_weights_2 = 1 / ipw_model.predict_proba(df_second[["Elev", "Temp", "MSLP", "month"]])[:, 1]
ipw_weights_2

array([1.09792518, 1.01922982, 1.0062817 , ..., 1.03234357, 1.36567035,
       1.50930033])

In [115]:
cell = df_second[["Raster.Cell","month","year"]]

In [116]:
pd.options.display.float_format = "{:.4f}".format
df_second.drop(["Raster.Cell","month","year"], axis=1).describe()

Unnamed: 0,AOD,Temp,Elev,MSLP,Vsby,WdVl,NDVI,LC_MedDev,LC_LowDev,LC_HighDev,PS,relh,Popd,x,y,primary,secondary,motorway
count,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0,1536690.0
mean,0.1663,53.8314,221.2532,1016.7156,8.8666,6.7942,0.4902,0.0953,0.0407,6.7942,12.1723,73.3439,0.0005,-87.807,41.7993,723.4965,884.6209,599.1359
std,0.0938,15.3615,32.0532,2.6933,0.5477,1.4628,0.2027,0.1378,0.0805,1.4628,8.5076,5.5602,0.0012,0.5991,0.6392,1576.0544,1773.8034,2240.8581
min,0.0,20.9046,149.3274,1011.2849,6.5153,2.9493,-0.1403,0.0,0.0,2.9493,1.0,38.7312,0.0,-88.7758,40.7412,0.0,0.0,0.0
25%,0.1027,39.6843,199.5573,1014.7415,8.5675,5.6362,0.3217,0.002,0.0,5.6362,8.9648,69.7168,0.0,-88.2802,41.2827,0.0,0.0,0.0
50%,0.1425,56.1608,213.3082,1016.3637,8.9549,6.8828,0.4623,0.021,0.0,6.8828,11.2097,74.0631,0.0001,-87.9498,41.6325,0.0,0.0,0.0
75%,0.2056,68.3399,239.6572,1018.6949,9.2581,7.8992,0.6532,0.149,0.039,7.8992,13.1616,77.1286,0.0004,-87.3598,42.3156,475.0318,1007.549,0.0
max,1.7891,75.3651,351.6488,1023.1313,9.8292,10.5263,0.9456,0.835,0.56,10.5263,382.0,93.1307,0.0333,-86.4748,43.1902,18537.3689,23797.0901,40774.7948


In [117]:
df_2 = df_second
df_2['month_str']=df_2[["month"]].astype(str)
df_2['year_str'] =df_2[["year"]].astype(str)
df_2['time_str']= df_2['year_str']+'-'+df_2['month_str']

In [118]:
df_2 = df_2.join(pd.get_dummies(df_2['month'],drop_first=True)).join(pd.get_dummies(df_2['year'],drop_first=True))
df_2 = df_2.drop(['month_str','year_str','time_str'], axis=1)
df_2 = df_2.drop(["month","year","Raster.Cell","x","y"], axis=1)

In [119]:
# generate empty predictions (needed for structure)
y_2 = np.empty(df_2.shape[0])

In [120]:
# Use the model got from stage one to get the predictions
data = scaler.transform(df_2)
data_pred = model_mean.predict([data, y_2, ipw_weights_2])
df_2['pred'] = data_pred

In [121]:
df_2 = df_2.join(cell)

In [122]:
df_2=df_2.drop([2,3,4,5,6,7,8,9,10,11,12,15,16,17,18],axis=1)
df_2.head()

Unnamed: 0,AOD,Temp,Elev,MSLP,Vsby,WdVl,NDVI,LC_MedDev,LC_LowDev,LC_HighDev,PS,relh,Popd,primary,secondary,motorway,pred,Raster.Cell,month,year
0,0.1652,42.3448,266.5581,1014.2213,8.9448,8.2791,0.2914,0.0,0.0,8.2791,10.5829,68.6514,0.0,0.0,0.0,0.0,7.9458,610,4,14
1,0.1203,55.062,266.5581,1015.4208,9.2196,6.6575,0.4495,0.0,0.0,6.6575,10.5829,70.345,0.0,0.0,0.0,0.0,8.3105,610,5,14
2,0.1341,63.9627,266.5581,1013.548,7.6623,6.0854,0.547,0.0,0.0,6.0854,10.5829,76.1868,0.0,0.0,0.0,0.0,9.9541,610,6,14
3,0.4217,65.6833,266.5581,1015.3088,9.5372,5.7522,0.8203,0.0,0.0,5.7522,10.5829,72.9863,0.0,0.0,0.0,0.0,9.7878,610,7,14
4,0.2284,67.5053,266.5581,1015.605,8.5467,4.4955,0.8328,0.0,0.0,4.4955,10.5829,80.3597,0.0,0.0,0.0,0.0,8.836,610,8,14


In [123]:
# Merge the dataframe from stage one and stage two to get the data for stage three
df_3 = df
df_3['pred'] = df_3['Sen2.5']
df_3=df_3.drop('Sen2.5', axis=1)
df_3.head()

Unnamed: 0,Raster.Cell,month,year,AOD,Temp,Elev,MSLP,Vsby,WdVl,NDVI,...,LC_HighDev,PS,relh,Popd,x,y,primary,secondary,motorway,pred
0,3798,3,14,0.1026,27.2144,202.6511,1018.8516,8.4725,8.4067,0.1605,...,8.4067,13.1464,68.5915,0.0028,-87.9144,43.0569,83.3885,8613.3285,1269.4493,14.3
1,3798,4,14,0.0624,42.552,202.6511,1014.2213,8.7911,8.716,0.2352,...,8.716,13.1464,67.1438,0.0028,-87.9144,43.0569,83.3885,8613.3285,1269.4493,6.8
2,3798,5,14,0.1815,55.1383,202.6511,1015.4208,9.1797,7.012,0.3894,...,7.012,13.1464,68.5747,0.0028,-87.9144,43.0569,83.3885,8613.3285,1269.4493,5.6
3,3798,6,14,0.1658,63.7574,202.6511,1013.548,7.3425,6.3811,0.419,...,6.3811,13.1464,74.5546,0.0028,-87.9144,43.0569,83.3885,8613.3285,1269.4493,7.5714
4,3798,7,14,0.2997,66.077,202.6511,1015.3088,9.4635,6.0967,0.4372,...,6.0967,13.1464,69.6442,0.0028,-87.9144,43.0569,83.3885,8613.3285,1269.4493,8.4714


In [124]:
df_4 = df_2.append(df_3, ignore_index=True)

In [125]:
df_4

Unnamed: 0,AOD,Temp,Elev,MSLP,Vsby,WdVl,NDVI,LC_MedDev,LC_LowDev,LC_HighDev,...,Popd,primary,secondary,motorway,pred,Raster.Cell,month,year,x,y
0,0.1652,42.3448,266.5581,1014.2213,8.9448,8.2791,0.2914,0.0000,0.0000,8.2791,...,0.0000,0.0000,0.0000,0.0000,7.9458,610,4,14,,
1,0.1203,55.0620,266.5581,1015.4208,9.2196,6.6575,0.4495,0.0000,0.0000,6.6575,...,0.0000,0.0000,0.0000,0.0000,8.3105,610,5,14,,
2,0.1341,63.9627,266.5581,1013.5480,7.6623,6.0854,0.5470,0.0000,0.0000,6.0854,...,0.0000,0.0000,0.0000,0.0000,9.9541,610,6,14,,
3,0.4217,65.6833,266.5581,1015.3088,9.5372,5.7522,0.8203,0.0000,0.0000,5.7522,...,0.0000,0.0000,0.0000,0.0000,9.7878,610,7,14,,
4,0.2284,67.5053,266.5581,1015.6050,8.5467,4.4955,0.8328,0.0000,0.0000,4.4955,...,0.0000,0.0000,0.0000,0.0000,8.8360,610,8,14,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1538180,0.1504,71.8913,180.4956,1012.8235,8.9683,5.5281,0.7966,0.0300,0.0000,5.5281,...,0.0000,0.0000,0.0000,0.0000,6.6492,46895,6,18,-88.1858,41.2243
1538181,0.2035,73.6890,180.4956,1017.0145,9.6198,4.5049,0.7862,0.0300,0.0000,4.5049,...,0.0000,0.0000,0.0000,0.0000,7.2063,46895,7,18,-88.1858,41.2243
1538182,0.2873,73.9405,180.4956,1015.0842,9.0310,4.5487,0.6050,0.0300,0.0000,4.5487,...,0.0000,0.0000,0.0000,0.0000,9.5701,46895,8,18,-88.1858,41.2243
1538183,0.1025,52.0867,180.4956,1017.4701,9.2085,6.4142,0.3524,0.0300,0.0000,6.4142,...,0.0000,0.0000,0.0000,0.0000,5.2483,46895,10,18,-88.1858,41.2243


In [80]:
df_4.to_csv('../../Downloads/Third_stage_spatialcv_ipw.csv')

In [101]:
df_4.to_csv('../../Downloads/Third_stage_base_ipw.csv')

In [126]:
df_4.to_csv('../../Downloads/Third_stage_outlier_ipw.csv')