In [1]:
import torch 
import torch.nn as nn
import torch.optim as optim
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# set the device to default as cpu, cuda if it is runnning on nvidia gpu and mps if it is running on a macbook
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
print(f'Device: {device}')

Device: cuda


In [3]:
sim_scores = pd.read_csv('../../refactor/data/pecan/pecans_similarity/dataframes/combined_df_similarity_scores.csv')
weather_data = pd.read_csv('../../refactor/data/pecan/weather.csv')
start_date = '2016-01-01'
end_date = '2022-12-31'

weather_data = weather_data.loc[weather_data['dt'].between(start_date, end_date)]
sim_scores = sim_scores.drop(columns=['Unnamed: 0', 'Similarity Type'])
print(sim_scores.head())
sim_scores = sim_scores.to_numpy().astype(np.float32)
print(f'Weather data shape: {weather_data.shape}')
print(f'Similarity scores shape: {sim_scores.shape}')

sim_scores = sim_scores.flatten()
print(f'Similarity scores shape: {sim_scores.shape}')

repeated_sim_scores = np.repeat(sim_scores[np.newaxis, :], 245376, axis=0)

print(f'Repeated similarity scores shape: {repeated_sim_scores.shape}')
sim_scores = pd.DataFrame(repeated_sim_scores)
print(sim_scores.head())
combined_data = pd.concat([weather_data.reset_index(drop=True), sim_scores.reset_index(drop=True)], axis=1)
print(f'Combined data shape: {combined_data.shape}')

print(combined_data.head())


      sim_lat      sim_lon     sim_temp  sim_dew_point  sim_feels_like  \
0  681.651657  1475.458784  1953.988272    3106.781056     2171.000483   
1  151.793161   769.031368  4539.150712    2890.086334     4881.962673   
2  466.864816   179.698079  1357.260359    2585.302017     1593.666166   
3  529.858496   706.427416  4014.002074    3137.782978     4317.279583   
4  214.786840  1655.156863  1858.861100    3763.668512     2065.357310   

   sim_temp_min  sim_temp_max  sim_pressure  sim_humidity  sim_wind_speed  \
0   1916.711339   2039.767348   1921.502017  10872.091565     1592.502058   
1   4688.181265   4339.046602   1988.776760  11818.156540     1196.956873   
2   1334.880177   1418.370131    621.962218   6646.864524     1414.385875   
3   4291.811025   3715.637905   2202.130559  10875.214343     1447.905650   
4   1873.332613   1880.476383   1810.585817  11639.092362     1542.782991   

   sim_wind_deg  sim_clouds_all  sim_weather_id  
0  81069.632255    23900.096924    55292.4

In [4]:

crop_yield = pd.read_csv('../../refactor/data/pecan/yield.csv')
print(f'Crop yield shape: {crop_yield.shape}')
crop_yield.head()

df = combined_data.reindex(columns=combined_data.columns.tolist() + ['yield'])
print(df.shape)
df.head()

Crop yield shape: (22, 8)
(245376, 565)


Unnamed: 0.1,Unnamed: 0,dt,city_name,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,...,537,538,539,540,541,542,543,544,545,yield
0,324336,2016-01-01 00:00:00+00:00,Doña Ana,32.393081,-106.815781,8.99,1.42,7.93,7.6,9.46,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,
1,324337,2016-01-01 01:00:00+00:00,Doña Ana,32.393081,-106.815781,8.25,2.07,7.63,7.44,8.68,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,
2,324338,2016-01-01 02:00:00+00:00,Doña Ana,32.393081,-106.815781,7.37,1.44,6.04,6.1,7.8,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,
3,324339,2016-01-01 03:00:00+00:00,Doña Ana,32.393081,-106.815781,5.58,1.32,3.13,4.3,6.02,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,
4,324340,2016-01-01 04:00:00+00:00,Doña Ana,32.393081,-106.815781,5.43,0.79,5.43,4.75,5.9,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,


In [5]:
for index, row in crop_yield.iterrows():
    year = row['Year']
    county = row['County']
    yield_value = row['Value']
    df.loc[(df['dt'].str.contains(str(year))) & (df['city_name'].str.contains(str(county))), 'yield'] = yield_value
    
df.drop(columns=['dt', 'city_name', 'weather_main', 'weather_description', 'Unnamed: 0'], inplace=True)
df.head(20)

Unnamed: 0,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,...,537,538,539,540,541,542,543,544,545,yield
0,32.393081,-106.815781,8.99,1.42,7.93,7.6,9.46,1019,59,2.1,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
1,32.393081,-106.815781,8.25,2.07,7.63,7.44,8.68,1020,65,1.52,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
2,32.393081,-106.815781,7.37,1.44,6.04,6.1,7.8,1021,66,2.1,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
3,32.393081,-106.815781,5.58,1.32,3.13,4.3,6.02,1022,74,3.1,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
4,32.393081,-106.815781,5.43,0.79,5.43,4.75,5.9,1023,72,1.17,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
5,32.393081,-106.815781,4.68,0.45,4.68,3.7,5.29,1024,74,0.0,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
6,32.393081,-106.815781,3.89,0.23,3.89,3.27,4.51,1025,77,0.0,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
7,32.393081,-106.815781,3.06,-1.15,0.1,2.44,3.7,1025,73,3.1,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
8,32.393081,-106.815781,2.65,-1.5,0.02,2.09,3.36,1026,73,2.6,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
9,32.393081,-106.815781,2.25,-1.36,0.05,1.87,2.94,1027,76,2.1,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837


In [6]:
df = df.astype('float', errors='ignore')


In [7]:
print(df.dtypes)
df.head()

lat           float64
lon           float64
temp          float64
dew_point     float64
feels_like    float64
               ...   
542           float64
543           float64
544           float64
545           float64
yield          object
Length: 560, dtype: object


Unnamed: 0,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,...,537,538,539,540,541,542,543,544,545,yield
0,32.393081,-106.815781,8.99,1.42,7.93,7.6,9.46,1019.0,59.0,2.1,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
1,32.393081,-106.815781,8.25,2.07,7.63,7.44,8.68,1020.0,65.0,1.52,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
2,32.393081,-106.815781,7.37,1.44,6.04,6.1,7.8,1021.0,66.0,2.1,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
3,32.393081,-106.815781,5.58,1.32,3.13,4.3,6.02,1022.0,74.0,3.1,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837
4,32.393081,-106.815781,5.43,0.79,5.43,4.75,5.9,1023.0,72.0,1.17,...,inf,inf,inf,5e-06,0.065748,inf,inf,inf,0.008113,1837


In [8]:
x = df.drop(columns=['yield']).values.astype(np.float32)
df['yield'] = df['yield'].str.replace(',', '').astype(float)
y = df['yield'].values.astype(np.float32)


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(f'x_train.shape: {x_train.shape}, y_train.shape: {y_train.shape}')
print(f'x_test.shape: {x_test.shape}, y_test.shape: {y_test.shape}')
print(f'x_train.dtype: {x_train.dtype}, y_train.dtype: {y_train.dtype}')


x_train.shape: (196300, 559), y_train.shape: (196300,)
x_test.shape: (49076, 559), y_test.shape: (49076,)
x_train.dtype: float32, y_train.dtype: float32


In [9]:

# Define model
model = nn.Sequential(
    nn.Linear(559, 1024),
    nn.ReLU(),
    nn.Linear(1024, 2048),
    nn.ReLU(),
    nn.Linear(2048, 1)
).to(device)

# Loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Assuming x_train and y_train are numpy arrays
x_train = np.nan_to_num(x_train, nan=np.nanmean(x_train), posinf=np.nanmax(x_train[np.isfinite(x_train)]), neginf=np.nanmin(x_train[np.isfinite(x_train)]))
y_train = np.nan_to_num(y_train, nan=np.nanmean(y_train), posinf=np.nanmax(y_train[np.isfinite(y_train)]), neginf=np.nanmin(y_train[np.isfinite(y_train)]))

# Data conversion to tensors and moving them to the correct device
x_train_tensor = torch.from_numpy(x_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).float().to(device)


# Initial loss check
with torch.no_grad():
    initial_predictions = model(x_train_tensor).squeeze()
    initial_loss = loss_fn(initial_predictions, y_train_tensor)
    print(f'Initial loss: {initial_loss.item()}')

# Training loop
start = time.time()
loss_history = []
for epoch in range(200):
    optimizer.zero_grad()
    predictions = model(x_train_tensor).squeeze()  # Adjust the output dimension
    loss_val = loss_fn(predictions, y_train_tensor)
    loss_val.backward()
    optimizer.step()
    loss_history.append(loss_val.item())
    if epoch % 10 == 0:
        print(f'Epoch {epoch}: Loss = {loss_val.item()}')

end = time.time()

print(f'Training time: {end - start:.2f}s')

# Check gradients
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f'{name} gradient: {param.grad.norm().item()}')
    else:
        print(f'{name} gradient: None')


Initial loss: 3773652.75
Epoch 0: Loss = 3773652.75
Epoch 10: Loss = 3743778.0
Epoch 20: Loss = 3651423.0
Epoch 30: Loss = 3477364.25
Epoch 40: Loss = 3210980.25
Epoch 50: Loss = 2854241.5
Epoch 60: Loss = 2423487.25
Epoch 70: Loss = 1948538.375
Epoch 80: Loss = 1469185.875
Epoch 90: Loss = 1028974.9375
Epoch 100: Loss = 666432.375
Epoch 110: Loss = 405116.28125
Epoch 120: Loss = 246155.40625
Epoch 130: Loss = 168718.8125
Epoch 140: Loss = 140899.078125
Epoch 150: Loss = 134742.390625
Epoch 160: Loss = 134471.125
Epoch 170: Loss = 134658.453125
Epoch 180: Loss = 134570.90625
Epoch 190: Loss = 134447.515625
Training time: 759.42s
0.weight gradient: nan
0.bias gradient: 0.0
2.weight gradient: 0.0
2.bias gradient: 311.7840576171875
4.weight gradient: 311.53363037109375
4.bias gradient: 7.130643367767334


In [10]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=np.arange(len(loss_history)), y=loss_history, mode='lines', name='Loss'))
fig.update_layout(title='Loss over time', xaxis_title='Epoch', yaxis_title='Loss')
fig.show()

In [3]:
x_train = pd.read_csv('../../data/pecan/dataset4/x_train_05_27_24.csv')
y_train = pd.read_csv('../../data/pecan/dataset4/y_train_05_27_24.csv')
x_test = pd.read_csv('../../data/pecan/dataset4/x_test_05_27_24.csv')
y_test = pd.read_csv('../../data/pecan/dataset4/y_test_05_27_24.csv')

In [4]:
print(x_train.shape)
x_train.head()

(16, 122976)


Unnamed: 0,0.000000000000000000e+00,1.085713633614417972e-01,4.535353535353534582e-01,1.130776794493609239e-01,4.399071925754061141e-01,4.655004859086491753e-01,4.549902152641878694e-01,2.727272727272698205e-01,1.199999999999999956e-01,4.781341107871720508e-01,...,0.000000000000000000e+00.23654,0.000000000000000000e+00.23655,0.000000000000000000e+00.23656,0.000000000000000000e+00.23657,0.000000000000000000e+00.23658,0.000000000000000000e+00.23659,0.000000000000000000e+00.23660,0.000000000000000000e+00.23661,0.000000000000000000e+00.23662,0.000000000000000000e+00.23663
0,0.684904,0.0,0.399495,0.0,0.487239,0.452867,0.363503,0.272727,0.08,0.145773,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1.0,1.0,0.416667,0.358899,0.458005,0.443149,0.371331,0.484848,0.32,0.252672,...,0.697214,0.753897,0.620712,1.0,0.289474,0.0,0.0,0.0,0.0,1.0
2,0.0,0.108571,0.463636,0.410029,0.435267,0.477648,0.44863,0.272727,0.333333,0.564626,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.108571,0.5,0.254179,0.566125,0.504373,0.509295,0.484848,0.186667,0.204082,...,0.543255,0.683347,0.698382,0.999011,0.289474,0.724473,1.0,0.0,0.0,1.0
4,0.684904,0.0,0.564646,0.531465,0.591647,0.612731,0.608121,0.242424,0.373333,0.400389,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
print(y_train.shape)
y_train.head()

(16, 1)


Unnamed: 0,1.850000000000000000e+03
0,2300.0
1,2300.0
2,2010.0
3,1780.0
4,1340.0


In [8]:
# Define model
model = nn.Sequential(
    nn.Linear(122976, 4096), 
    nn.ReLU(),
    nn.Linear(4096, 2048),
    nn.ReLU(),
    nn.Linear(2048, 1)
).to(device)


# Loss function and optimizer
loss_fn = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Assuming x_train and y_train are numpy arrays
x_train = np.nan_to_num(x_train, nan=np.nanmean(x_train), posinf=np.nanmax(x_train[np.isfinite(x_train)]), neginf=np.nanmin(x_train[np.isfinite(x_train)]))
y_train = np.nan_to_num(y_train, nan=np.nanmean(y_train), posinf=np.nanmax(y_train[np.isfinite(y_train)]), neginf=np.nanmin(y_train[np.isfinite(y_train)]))

# Data conversion to tensors and moving them to the correct device
x_train_tensor = torch.from_numpy(x_train).float().to(device)
y_train_tensor = torch.from_numpy(y_train).float().to(device)


# Initial loss check
with torch.no_grad():
    initial_predictions = model(x_train_tensor).squeeze()
    initial_loss = loss_fn(initial_predictions, y_train_tensor)
    print(f'Initial loss: {initial_loss.item()}')

# Training loop
start = time.time()
loss_history = []
for epoch in range(1000):
    optimizer.zero_grad()
    predictions = model(x_train_tensor).squeeze()  # Adjust the output dimension
    loss_val = loss_fn(predictions, y_train_tensor)
    loss_val.backward()
    optimizer.step()
    loss_history.append(loss_val.item())
    if epoch % 10 == 0:
        print(f'Epoch {epoch}: Loss = {loss_val.item()}')

end = time.time()

print(f'Training time: {end - start:.2f}s')

# Check gradients
for name, param in model.named_parameters():
    if param.grad is not None:
        print(f'{name} gradient: {param.grad.norm().item()}')
    else:
        print(f'{name} gradient: None')


Initial loss: 3684446.5
Epoch 0: Loss = 3684446.5
Epoch 10: Loss = 3683655.0
Epoch 20: Loss = 3677844.25
Epoch 30: Loss = 3664586.5
Epoch 40: Loss = 3642159.0
Epoch 50: Loss = 3609177.0
Epoch 60: Loss = 3564826.5
Epoch 70: Loss = 3508757.0
Epoch 80: Loss = 3440968.75
Epoch 90: Loss = 3361738.0
Epoch 100: Loss = 3271559.75
Epoch 110: Loss = 3171105.75
Epoch 120: Loss = 3061195.0
Epoch 130: Loss = 2942764.75
Epoch 140: Loss = 2816853.0
Epoch 150: Loss = 2684578.0
Epoch 160: Loss = 2547120.5
Epoch 170: Loss = 2405706.75
Epoch 180: Loss = 2261588.0
Epoch 190: Loss = 2116027.0
Epoch 200: Loss = 1970278.75
Epoch 210: Loss = 1825569.25
Epoch 220: Loss = 1683083.25
Epoch 230: Loss = 1543941.75
Epoch 240: Loss = 1409187.75
Epoch 250: Loss = 1279769.0
Epoch 260: Loss = 1156523.25
Epoch 270: Loss = 1040164.6875
Epoch 280: Loss = 931274.9375
Epoch 290: Loss = 830293.375
Epoch 300: Loss = 737512.5
Epoch 310: Loss = 653078.75
Epoch 320: Loss = 576994.1875
Epoch 330: Loss = 509123.21875
Epoch 340: Lo