In [2]:
import numpy as np

BigX = np.load('data/soybean_data_compressed.npz') ## order: locID, year, yield, W(52*6), S(6*11), P(14)
X=BigX['data']

np.set_printoptions(suppress=True, precision=2)
print(X[0:3, 0:5])

[[   0.   1980.     32.5     0.27    0.  ]
 [   0.   1981.     36.      0.6     0.  ]
 [   0.   1982.     37.      2.1     0.38]]


In [19]:
X_train = X[X[:,1]<=2017]
print(X_train[0:3, 0:5])

X_train = X_train[:,3:]
print(X_train[0:3, 0:5])

[[   0.   1980.     32.5     0.27    0.  ]
 [   0.   1981.     36.      0.6     0.  ]
 [   0.   1982.     37.      2.1     0.38]]
[[0.27 0.   1.62 0.4  0.97]
 [0.6  0.   0.04 0.   0.86]
 [2.1  0.38 1.68 0.53 6.34]]


In [20]:
M=np.mean(X_train, axis=0, keepdims=True)
S=np.std(X_train, axis=0, keepdims=True)
X[:,3:] = (X[:,3:]-M)/S

  X[:,3:] = (X[:,3:]-M)/S


In [21]:
# Remove low yield observations
X=np.nan_to_num(X)
index_low_yield=X[:,2]<5
print('low yield observations',np.sum(index_low_yield))
print(X[index_low_yield][:,1])
X=X[np.logical_not(index_low_yield)]
del BigX

low yield observations 2
[1988. 2003.]


In [22]:
Index=X[:,1]==2018

print('Std %.2f and mean %.2f  of test ' %(np.std(X[Index][:,2]), np.mean(X[Index][:,2])))
print("train data",np.sum(np.logical_not(Index)))
print("test data",np.sum(Index))

Std 10.55 and mean 53.94  of test 
train data 24871
test data 472


In [23]:
print(X.shape)

(25343, 395)


In [42]:
years = np.arange(1980, 2019)
avg_values = {str(year): np.mean(X[X[:, 1] == year][:, 2]) for year in years}
print(avg_values) # --> AVERAGE YIELD FOR EACH YEAR (INCLUDING 2018)

{'1980': np.float64(28.896072507552866), '1981': np.float64(33.37415565345081), '1982': np.float64(32.86419019316493), '1983': np.float64(26.874306569343062), '1984': np.float64(27.47536231884058), '1985': np.float64(34.75277777777778), '1986': np.float64(35.56026200873362), '1987': np.float64(35.70191176470588), '1988': np.float64(26.670444763271156), '1989': np.float64(32.388294797687855), '1990': np.float64(34.28272327964861), '1991': np.float64(34.376106194690266), '1992': np.float64(37.20850439882698), '1993': np.float64(33.805044510385756), '1994': np.float64(41.85523952095809), '1995': np.float64(35.0006015037594), '1996': np.float64(38.595927601809954), '1997': np.float64(39.79051094890511), '1998': np.float64(39.97238372093023), '1999': np.float64(37.414410480349346), '2000': np.float64(36.899146514935985), '2001': np.float64(39.67424023154848), '2002': np.float64(37.19884393063584), '2003': np.float64(32.673932253313694), '2004': np.float64(42.45685131195336), '2005': np.floa

In [43]:
avg2 = list(avg_values.values())
mm = np.mean(avg2)
ss = np.std(avg2)

avg = {str(year): (value - mm) / ss for year, value in avg_values.items()}
avg['2018'] = avg['2017']
print(avg) # --> NORMALIZED AVERAGE YIELD FOR EACH YEAR (2018 REPLACED BY 2017)

{'1980': np.float64(-1.4582864093819747), '1981': np.float64(-0.8110332591297735), '1982': np.float64(-0.8847426432425297), '1983': np.float64(-1.7505084003420137), '1984': np.float64(-1.6636330086477271), '1985': np.float64(-0.6117699842181856), '1986': np.float64(-0.49505783412246773), '1987': np.float64(-0.4745840626898213), '1988': np.float64(-1.7799741766344916), '1989': np.float64(-0.9535276085675289), '1990': np.float64(-0.6797107179633866), '1991': np.float64(-0.6662133388457581), '1992': np.float64(-0.2568241871334897), '1993': np.float64(-0.7487534485560939), '1994': np.float64(0.4148055812735497), '1995': np.float64(-0.5759500402549351), '1996': np.float64(-0.05628882197700863), '1997': np.float64(0.11637385744519654), '1998': np.float64(0.142661382889246), '1999': np.float64(-0.22706293537032865), '2000': np.float64(-0.30153815477994156), '2001': np.float64(0.09956832131751261), '2002': np.float64(-0.2582204918169693), '2003': np.float64(-0.9122421440148659), '2004': np.flo

In [None]:
# add normalized average yield to the data based on year
X = np.concatenate((X, np.array([avg[str(int(year))] for year in X[:, 1]]).reshape(-1, 1)), axis=1)
print(X.shape)


(25343, 396)


In [None]:
print(np.mean(X[X[:, 1] == 2015][:, -1]))
print(np.mean(X[X[:, 1] == 2016][:, -1]))
print(np.mean(X[X[:, 1] == 2017][:, -1]))
print(np.mean(X[X[:, 1] == 2018][:, -1]))

1.6528674736360225
2.2986134626530217
1.75394127659999
1.75394127659999


In [98]:
batch_size = 3
time_steps = 2

In [None]:
out = np.zeros(shape=[batch_size, time_steps, 396])

In [113]:
for i in range(batch_size):
    r1 = np.random.randint(0, 35)
    years = np.array([(r1 + i) for i in range(time_steps)]) + 1980
    print(years)
    
    for j, y in enumerate(years):
        r2 = np.random.randint(X[X[:, 1] == y].shape[0])
        out[i, j, :] = X[X[:, 1] == y][r2, :]

print(out)
print(out.shape)

[1997 1998]
[2001 2002]
[2001 2002]
[[[ 363.   1997.     37.   ...   -0.07   -0.06    0.12]
  [  30.   1998.     46.   ...   -0.07   -0.06    0.14]]

 [[ 575.   2001.     38.   ...   -0.07   -0.06    0.1 ]
  [ 653.   2002.     36.6  ...   -0.07   -0.06   -0.26]]

 [[ 736.   2001.     32.2  ...   -0.07   -0.06    0.1 ]
  [1029.   2002.     38.3  ...   -0.07   -0.06   -0.26]]]
(3, 2, 396)


In [91]:
BigX = np.load('data/soybean_data_compressed.npz') ## order: locID, year, yield, W(52*6), S(6*11), P(14)
X=BigX['data']

np.set_printoptions(suppress=True, precision=2)
print(X[0:3, -5:])

[[ 0.  0.  0.  0.  0.]
 [10.  6.  4.  0.  0.]
 [ 3.  1.  1.  0.  0.]]


In [None]:
def preprocess_data(X, time_steps = 5):
    print("--- Preprocessing ---")
    # 1. remove low yield observations
    X = np.nan_to_num(X)
    index_low_yield = X[:,2] < 5
    print("Remove low yield observations: ", np.sum(index_low_yield))
    print("of years: ", X[index_low_yield][:, 1])
    X = X[np.logical_not(index_low_yield)]
    
    # 2. calculate average yield of each year and standardize it
    years = np.arange(1980, 2017)  # Exclude the last two years (2017 and 2018) for standardization
    _avg = {str(year): np.mean(X[X[:, 1] == year][:, 2]) for year in years}
    avg_m = np.mean(list(_avg.values()))
    avg_s = np.std(list(_avg.values()))
    
    years = np.arange(1980, 2019)
    avg = {str(year): np.mean(X[X[:, 1] == year][:, 2]) for year in years}
    avg = {str(year): (value - avg_m) / avg_s for year, value in avg.items()}
    
    # 3. standardize the data on the training data only
    X_train = X[X[:,1] <= 2016][:, 2:]
    print("Full train data available: ", X_train.shape)

    M=np.mean(X_train, axis=0, keepdims=True)
    S=np.std(X_train, axis=0, keepdims=True)
    epsilon = 1e-8
    
    X[:,2:] = (X[:,2:] - M) / (S + epsilon)
    
    # 4. add time steps  
    for i in range(time_steps):
        avg_prev = np.array([avg[str(int(year - i))] if (year - i) > 1979 else np.nan for year in X[:, 1] ])
        X = np.concatenate((X, avg_prev.reshape(-1, 1)), axis=1)
    
    return X, M[0, 0], S[0, 0]

In [93]:
X, M, S = preprocess_data(X)

n_batches = 3
time_steps = 5

--- Preprocessing ---
Remove low yield observations:  2
of years:  [1988. 2003.]
Full train data available:  (24311, 393)
(25343, 395)
(25343, 400)


In [90]:
print(X.shape)

(25343, 400)


In [None]:
def get_sample(X, batch_size = 1000):
    X_train = X[X[:, 1] <= 2016]
    sample = np.zeros(shape = [batch_size, X.shape[1]])

    for i in range(batch_size):
        r = np.random.randint(len(X_train))   # random index
        obs = X_train[r]
        sample[i] = obs

    return sample.reshape(-1, X.shape[1])      # shape (n_batches*time_steps, 396 + time_steps)

In [89]:
get_sample(X, batch_size = 3).shape

(3, 400)

In [None]:
def get_sample_test(X, time_steps):
    sample = []
    X_test = X[X[:, 1] == 2018]

    for obs in X_test:
        avg_yield_values = []
        
        for k in range(time_steps):
            prev_y = 2018 - k - 1
            prev_data = X[X[:, 1] == prev_y]
            avg_yield_values.append(prev_data[0, -1])
        
        avg_yield_values = np.array(avg_yield_values[::-1])
        obs_with_avg_yield = np.concatenate((obs, avg_yield_values))
        sample.append(obs_with_avg_yield)

    return np.array(sample).reshape(-1, X.shape[1] + time_steps)    # shape (n_batches*time_steps, 396 + time_steps)

-0.6928976489992175
[-1.81 -0.91 -0.62 -0.6  -0.16]
0.5680691631734047
[-0.91 -0.62 -0.6  -0.16 -0.69]
-0.5056279382996418
[-0.62 -0.6  -0.16 -0.69  0.57]
0.05753691455496316
[-0.6  -0.16 -0.69  0.57 -0.51]
0.24465411520448935
[-0.16 -0.69  0.57 -0.51  0.06]
-0.15978634421108592
[-0.4  -1.81 -0.91 -0.62 -0.6 ]
-0.6928976489992175
[-1.81 -0.91 -0.62 -0.6  -0.16]
0.5680691631734047
[-0.91 -0.62 -0.6  -0.16 -0.69]
-0.5056279382996418
[-0.62 -0.6  -0.16 -0.69  0.57]
0.05753691455496316
[-0.6  -0.16 -0.69  0.57 -0.51]
0.7333201149479038
[ 0.23 -0.16 -0.87  0.66  0.82]
0.6843489785936109
[-0.16 -0.87  0.66  0.82  0.73]
0.5096516153631387
[-0.87  0.66  0.82  0.73  0.68]
1.2184480939192959
[0.66 0.82 0.73 0.68 0.51]
1.0300765844659818
[0.82 0.73 0.68 0.51 1.22]


array([[ 655.  , 1993.  ,   29.7 , ...,   -0.62,   -0.6 ,   -0.16],
       [ 383.  , 1994.  ,   35.  , ...,   -0.6 ,   -0.16,   -0.69],
       [  30.  , 1995.  ,   41.5 , ...,   -0.16,   -0.69,    0.57],
       ...,
       [ 582.  , 2008.  ,   43.5 , ...,    0.82,    0.73,    0.68],
       [ 103.  , 2009.  ,   51.5 , ...,    0.73,    0.68,    0.51],
       [ 721.  , 2010.  ,   33.1 , ...,    0.68,    0.51,    1.22]])