In [32]:
# load libraries
import lstm
import torch
import queue
import threading
import numpy as np 
import pandas as pd
import torch.nn as nn
import plotly.express as px
import plotly.graph_objects as go
from sklearn.preprocessing import MinMaxScaler
from torch.utils.data import DataLoader, TensorDataset, random_split

print("Libraries loaded successfully")


# load data
df = pd.read_csv('../../../data/pecan/combined_05_23_24.csv')

# city name is not needed
df = df.drop(columns='city_name')

# Collapse the readings to one row per calendar day (column-wise mean).
# The calendar day becomes the group index and is restored as `date` below.
df['date'] = pd.to_datetime(df['date'])
df = (
    df.groupby(df['date'].dt.date)
      .mean()
      .drop(columns=['date'], errors='ignore')
      .reset_index()
)

# Swap the date for numeric year / month / day features.
dates = pd.to_datetime(df['date'])
df = df.assign(year=dates.dt.year, month=dates.dt.month, day=dates.dt.day)
df = df.drop(columns='date')

# The target is kept aside unscaled; every remaining column is min-max scaled.
yield_df = df['yield']
df = df.drop(columns='yield')
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

# add the similarity scores to the x dataset
sim_df = pd.read_csv('../../../data/pecan/dataframes/combined_df_similarity_scores.csv')
sim_df.head()

Libraries loaded successfully


Unnamed: 0.1,Unnamed: 0,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_id,Similarity Type
0,Doña Ana-Chaves County,681.651657,1475.458784,1953.988272,3106.781056,2171.000483,1916.711339,2039.767348,1921.502017,10872.091565,1592.502058,81069.632255,23900.096924,55292.409633,Euclidean
1,Doña Ana-Otero County,151.793161,769.031368,4539.150712,2890.086334,4881.962673,4688.181265,4339.046602,1988.77676,11818.15654,1196.956873,73866.389468,22137.119528,59219.870736,Euclidean
2,Doña Ana-Sierra County,466.864816,179.698079,1357.260359,2585.302017,1593.666166,1334.880177,1418.370131,621.962218,6646.864524,1414.385875,81426.845745,19018.533224,47412.686129,Euclidean
3,Chaves County-Otero County,529.858496,706.427416,4014.002074,3137.782978,4317.279583,4291.811025,3715.637905,2202.130559,10875.214343,1447.90565,75529.217082,22826.946795,59970.008679,Euclidean
4,Chaves County-Sierra County,214.78684,1655.156863,1858.8611,3763.668512,2065.35731,1873.332613,1880.476383,1810.585817,11639.092362,1542.782991,83923.732138,23804.896576,54332.471184,Euclidean


In [33]:
# Keep only the numeric similarity columns and neutralise non-finite entries.
sim_df = sim_df.drop(['Unnamed: 0', 'Similarity Type'], axis=1)
sim_df = sim_df.replace([np.inf, -np.inf, np.nan], 0)

# Scale with a dedicated scaler: re-fitting the shared `scaler` here would
# overwrite the min/max it learned from the weather features.
sim_scaler = MinMaxScaler()
sim_df = pd.DataFrame(sim_scaler.fit_transform(sim_df))

# Flatten the scaled table row by row into one long column of scores.
flat_sim = sim_df.to_numpy().flatten().tolist()
sim_df = pd.Series(flat_sim)

# Append the scores as one extra feature column (labelled 0). The concat
# aligns on the index, so positions present in only one of the two inputs
# come out as NaN.
# NOTE(review): this cell is not re-run safe -- `sim_df` ends up as a Series
# and `df` gains a column on every execution; reload the data before re-running.
df = pd.concat([df, sim_df], axis=1)
df.head()



Unnamed: 0,lat,lon,temp,dew_point,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_main,weather_description,year,month,day,0
0,0.718798,0.98755,0.197174,0.46009,0.214945,0.20654,0.19026,0.85546,0.819764,0.153835,0.075982,0.907083,0.318493,0.439759,0.0,0.0,0.0,0.001588
1,0.718798,0.98755,0.183798,0.448994,0.237162,0.184969,0.182947,0.860814,0.83308,0.012551,0.348466,0.78125,0.151541,0.450602,0.0,0.0,0.033333,0.001416
2,0.718798,0.98755,0.167266,0.397118,0.224221,0.168201,0.165925,0.772484,0.755519,0.00427,0.20316,0.2,0.151541,0.708434,0.0,0.0,0.066667,0.000714
3,0.718798,0.98755,0.210058,0.400153,0.254032,0.203974,0.210598,0.686831,0.65483,0.095313,0.297485,0.422292,0.066781,0.70241,0.0,0.0,0.1,0.001792
4,0.718798,0.98755,0.243078,0.524657,0.291816,0.246923,0.238099,0.578694,0.867656,0.049822,0.029049,0.935,0.42637,0.408434,0.0,0.0,0.133333,0.000738


In [34]:
# Zero out every non-finite entry in the feature table, then report the
# remaining NaN counts for the features and for the (untouched) target.
non_finite = [np.inf, -np.inf, np.nan]
df = df.replace(non_finite, 0)
for frame in (df, yield_df):
    print(frame.isna().sum())


lat                    0
lon                    0
temp                   0
dew_point              0
feels_like             0
temp_min               0
temp_max               0
pressure               0
humidity               0
wind_speed             0
wind_deg               0
clouds_all             0
weather_main           0
weather_description    0
year                   0
month                  0
day                    0
0                      0
dtype: int64
0


In [35]:
sequence_length = 32  # arbitrary number chosen

# Features and target as float32 tensors (copied out of the pandas objects).
X = torch.tensor(df.to_numpy(), dtype=torch.float32)
Y = torch.tensor(yield_df.to_numpy(), dtype=torch.float32)


def create_sequences(X, Y, seq_length):
    """Slice a series into overlapping windows paired with the next target.

    Window ``i`` is ``X[i:i + seq_length]`` and its target is
    ``Y[i + seq_length]``, i.e. the target of the step right after the window.

    Args:
        X: Tensor whose first dimension indexes time steps.
        Y: Tensor of targets indexed the same way as ``X``.
        seq_length: Number of consecutive steps per window.

    Returns:
        Tuple ``(xs, ys)`` of stacked windows and stacked targets. When the
        series holds no complete window (``len(X) <= seq_length``) both
        tensors are empty along the first dimension instead of
        ``torch.stack`` raising on an empty list.
    """
    n_windows = len(X) - seq_length
    if n_windows <= 0:
        # Not enough steps for even one window + target: return empty tensors
        # that keep the trailing dimensions, dtype and device of the inputs.
        return (
            X.new_empty((0, seq_length, *X.shape[1:])),
            Y.new_empty((0, *Y.shape[1:])),
        )

    xs = [X[start:start + seq_length] for start in range(n_windows)]
    ys = [Y[start + seq_length] for start in range(n_windows)]
    return torch.stack(xs), torch.stack(ys)

# Window the series: every sample is `sequence_length` consecutive rows and
# its label is the yield of the row that follows the window.
# NOTE(review): `X` is overwritten here, so this cell cannot be re-run
# without first re-running the cell that builds `X` and `Y`.
X, y = create_sequences(X, Y, sequence_length)
print(X.shape, y.shape)

dataset = TensorDataset(X, y)
torch.save(dataset, '../../../data/pecan/lstm_weather_dataset.pt')

# 80% of the samples go to training; the remaining 20% is split again 80/20
# into validation and test.
n_samples = len(dataset)
train_size = int(0.8 * n_samples)
val_size = n_samples - train_size
new_val_size = int(0.8 * val_size)
test_size = val_size - new_val_size

# One seeded generator drives both splits so the partition is reproducible.
# NOTE(review): consecutive windows overlap, so a random split places
# near-identical windows in different subsets -- confirm this is intended.
generator = torch.Generator().manual_seed(42)
train_dataset, holdout_dataset = random_split(dataset, [train_size, val_size], generator=generator)
val_dataset, test_dataset = random_split(holdout_dataset, [new_val_size, test_size], generator=generator)

print(f'Train Dataset size: {len(train_dataset)} \nValidation Dataset size {len(val_dataset)}\nTest Dataset size {len(test_dataset)}')

BATCH_SIZE = 32
train_loader, val_loader, test_loader = [
    DataLoader(subset, batch_size=BATCH_SIZE, shuffle=False)
    for subset in (train_dataset, val_dataset, test_dataset)
]





class LSTM(nn.Module):
    """Stacked LSTM regressor with a built-in early-stopping tracker.

    A multi-layer ``nn.LSTM`` (``batch_first=True``) is run over the input and
    the output of the last time step is mapped by a linear layer to a single
    value per sequence.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state of each LSTM layer.
        num_layers: Number of stacked LSTM layers.
        patience: Number of counted validation checks after which
            ``early_stop`` returns True.
        min_delta: Margin used in the ``early_stop`` comparison.
    """

    def __init__(self, input_size=18, hidden_size=128, num_layers=5, patience=10, min_delta=10):
        super().__init__()

        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

        # One entry is appended per forward call; the list is never cleared
        # here, so callers that inspect it should empty it themselves.
        self.activations = []

        # member vars for the early stopper
        self.patience = patience
        self.min_delta = min_delta
        self.min_validation_loss = float('inf')
        self.counter = 0

    def forward(self, x):
        """Predict one value per sequence.

        Args:
            x: Input indexed as (batch, time step, feature).

        Returns:
            Tensor of shape ``(batch,)``. Only the trailing size-1 dimension
            is squeezed, so a batch of one still yields shape ``(1,)`` and
            keeps matching its target tensor.
        """
        batch_size = x.size(0)
        state_shape = (self.num_layers, batch_size, self.hidden_size)
        h0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)
        c0 = torch.zeros(state_shape, device=x.device, dtype=x.dtype)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :]).squeeze(-1)

        # Store a detached copy: keeping the attached tensor would hold the
        # whole autograd graph of every training step alive.
        self.activations.append(out.detach())
        return out

    def early_stop(self, validation_loss):
        """Update the early-stopping state and report whether to stop.

        A loss below the best one seen so far becomes the new best and resets
        the counter; otherwise the counter is incremented when the loss is
        above ``best - min_delta``.

        NOTE(review): for a non-negative ``min_delta`` the second condition
        holds for every non-improving loss except an exact tie at
        ``min_delta == 0``, so ``min_delta`` currently has no effect. The
        comparison is left as found -- confirm the intended tolerance
        semantics before changing it.

        Args:
            validation_loss: Validation loss of the latest epoch.

        Returns:
            True once the counter has reached ``patience``, else False.
        """
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss - self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

class Trainer():
    """Runs the train / validate loop for a model and records epoch losses.

    The model must provide ``early_stop(validation_loss)``; it is called after
    every epoch and training ends as soon as it returns a truthy value.

    Args:
        model: Module to optimise; its parameters are handed to Adam.
        train_loader: Iterable of ``(data, targets)`` batches supporting ``len``.
        validation_loader: Iterable of ``(data, targets)`` batches supporting ``len``.
        device: Device every batch is moved to before the forward pass.
        criterion: Loss callable taking ``(outputs, targets)``.
        lr: Learning rate for the Adam optimiser.
        num_epochs: Maximum number of epochs to run.
    """

    def __init__(self, model, train_loader, validation_loader, device, criterion=nn.MSELoss(), lr=0.001, num_epochs=1000):
        self.model = model
        self.train_loader = train_loader
        self.validation_loader = validation_loader
        self.criterion = criterion
        self.lr = lr
        self.num_epochs = num_epochs
        self.device = device
        self.optimizer = torch.optim.Adam(model.parameters(), lr=self.lr)
        # Per-epoch history, appended to by train().
        self.train_losses = []
        self.val_losses = []
        self.loss = None

    def train_one_epoch(self):
        """Run one optimisation pass over the training loader.

        Returns:
            Mean of the per-batch losses (each batch weighted equally).
        """
        self.model.train()
        epoch_loss = 0
        for data, targets in self.train_loader:
            data = data.to(self.device)
            targets = targets.to(self.device)

            self.optimizer.zero_grad()
            outputs = self.model(data)
            loss = self.criterion(outputs, targets)
            loss.backward()
            self.optimizer.step()
            epoch_loss += loss.item()
        return epoch_loss / len(self.train_loader)

    def val_one_epoch(self):
        """Evaluate the model on the validation loader without gradients.

        Returns:
            Mean of the per-batch losses (each batch weighted equally).
        """
        self.model.eval()
        epoch_loss = 0
        with torch.no_grad():
            for data, targets in self.validation_loader:
                data = data.to(self.device)
                targets = targets.to(self.device)
                outputs = self.model(data)
                loss = self.criterion(outputs, targets)
                epoch_loss += loss.item()
        return epoch_loss / len(self.validation_loader)

    def train(self):
        """Train until ``num_epochs`` is reached or the model asks to stop.

        Progress is printed every 100 epochs and a summary of the last epoch
        is printed at the end. The summary is skipped when no epoch ran
        (``num_epochs`` of 0), since there is nothing to report.

        Returns:
            Tuple ``(model, train_losses, val_losses)``.
        """
        last_epoch = None
        for epoch in range(self.num_epochs):
            train_loss = self.train_one_epoch()
            val_loss = self.val_one_epoch()
            last_epoch = epoch

            self.train_losses.append(train_loss)
            self.val_losses.append(val_loss)

            if epoch % 100 == 0:
                print(f'Epoch: {epoch} Train Loss: {train_loss} Validation Loss: {val_loss}')

            if self.model.early_stop(val_loss):
                break

        if last_epoch is not None:
            print('#'*100)
            print(f'Final Epoch: {last_epoch} Train Loss: {train_loss} Validation Loss: {val_loss}')
        return self.model, self.train_losses, self.val_losses



# Pick the GPU when one is available, otherwise fall back to the CPU.
device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(device_name)
print("Device set to:", device)

# Build the model with its default hyper-parameters and train it.
model = LSTM().to(device)
trainer = Trainer(model, train_loader, val_loader, device)
model, train_losses, val_losses = trainer.train()

torch.Size([2525, 32, 18]) torch.Size([2525])
Train Dataset size: 2020 
Validation Dataset size 404
Test Dataset size 101
Device set to: cuda
Epoch: 0 Train Loss: 3662249.98828125 Validation Loss: 3656313.403846154
Epoch: 100 Train Loss: 1293151.349609375 Validation Loss: 1294664.7115384615
Epoch: 200 Train Loss: 196534.99584960938 Validation Loss: 198162.2764423077
Epoch: 300 Train Loss: 20097.862915039062 Validation Loss: 21155.13311298077
####################################################################################################
Final Epoch: 347 Train Loss: 20086.14385986328 Validation Loss: 21115.64152644231
