In [106]:
# Set TF_CPP_MIN_LOG_LEVEL=3 in the kernel environment before TensorFlow is imported
# (presumably to quiet TensorFlow's native log output -- confirm against the TensorFlow docs).
%env TF_CPP_MIN_LOG_LEVEL=3

env: TF_CPP_MIN_LOG_LEVEL=3


In [107]:
# Imports: libraries required by this notebook
import joblib
import sklearn
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split  # Import for splitting data  # Import for splitting data
from sklearn.preprocessing import StandardScaler  # Import for data scaling  # Import for data scaling
from tensorflow.keras.models import Sequential  # Import for model creation  # Import for model creation
from tensorflow.keras.layers import Dense, Dropout  # Import for adding layers  # Import for adding layers
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint  # Callbacks for training  # Callbacks for training
import warnings
warnings.filterwarnings('ignore')  # Ignoring warnings  # Ignoring warnings

## Loading dataset

In [108]:
# Read the raw CSV into a DataFrame and preview its first five rows
df = pd.read_csv('dataset.csv')
df.head(n=5)

Unnamed: 0,data,indice_vegetacao,capacidade_solo,concentracao_co2,nivel_nutrientes,indice_fertilizantes,profundidade_raiz,radiacao_solar,precipitacao,estagio_crescimento,historico_rendimento,umidade
0,2012-12-01,323,455,3102.61,423.45,844.0,468.0,578.0,28.67,207.70504,117.7,79.261905
1,2013-01-01,345,546,3100.45,415.85,799.0,485.0,557.0,24.49,228.94287,4.5,82.193548
2,2013-02-01,362,595,3199.41,410.77,718.0,466.0,552.0,22.06,238.41747,25.1,74.839286
3,2013-03-01,376,636,3281.67,414.82,614.0,442.0,574.0,21.64,218.47599,53.6,77.935484
4,2013-04-01,383,738,3261.65,451.04,619.0,429.0,595.0,22.3,226.1501,166.0,80.45


In [109]:
# Dimensions of the loaded frame as (rows, columns)
df.shape

(124, 12)

In [110]:
# Preview the last five rows to see where the records end
df.tail(n=5)

Unnamed: 0,data,indice_vegetacao,capacidade_solo,concentracao_co2,nivel_nutrientes,indice_fertilizantes,profundidade_raiz,radiacao_solar,precipitacao,estagio_crescimento,historico_rendimento,umidade
119,2022-11-01,362,363,2626.91,1252.78,738.07,427.49,1430.48,60.18,186.68326,38.2,77.95
120,2022-12-01,310,322,2736.64,1287.68,749.57,385.09,1472.27,62.25,210.72987,33.7,76.177419
121,2023-01-01,277,307,2842.81,1289.12,761.6,373.03,1525.43,63.04,244.41912,4.6,74.774194
122,2023-02-01,323,330,2936.19,1303.59,759.59,390.69,1572.25,71.52,223.31732,6.9,66.910714
123,2023-03-01,360,339,2847.84,1234.88,771.62,396.87,1302.61,74.8,228.56676,41.5,69.0


## Exploratory Data Analysis - EDA

In [111]:
# Show the dtype pandas inferred for each column so it can be compared with the
# expected types (presumably the dataset's data dictionary -- confirm)
df.dtypes

data                     object
indice_vegetacao          int64
capacidade_solo           int64
concentracao_co2        float64
nivel_nutrientes        float64
indice_fertilizantes    float64
profundidade_raiz       float64
radiacao_solar          float64
precipitacao            float64
estagio_crescimento     float64
historico_rendimento    float64
umidade                 float64
dtype: object

In [112]:
# Display the column labels to spot naming problems such as stray spaces
# or other irregularities in the names
df.columns

Index(['data', 'indice_vegetacao', 'capacidade_solo', 'concentracao_co2',
       'nivel_nutrientes', 'indice_fertilizantes', 'profundidade_raiz',
       'radiacao_solar', 'precipitacao', 'estagio_crescimento',
       'historico_rendimento', 'umidade'],
      dtype='object')

In [113]:
# Collect the labels of every object-dtype (non-numerical) column; these need handling
# before the data can be fed to the model
non_numerical_columns = df.select_dtypes(include='object').columns
print(f'Non-numerical columns: {non_numerical_columns}')

Non-numerical columns: Index(['data'], dtype='object')


In [114]:
# Count the missing values in each column
df.isna().sum()

data                    0
indice_vegetacao        0
capacidade_solo         0
concentracao_co2        0
nivel_nutrientes        0
indice_fertilizantes    0
profundidade_raiz       0
radiacao_solar          0
precipitacao            0
estagio_crescimento     0
historico_rendimento    0
umidade                 0
dtype: int64

## Data cleaning and transformation

In [115]:
# The date in this dataset is being used as an ID. Since I am not performing a time series
# analysis, I will remove this column.
# The columns to drop are the object-dtype columns collected earlier in
# `non_numerical_columns`; the previous spelling `non_numeric_columns` is never defined in
# this notebook and raised NameError on a fresh kernel.
df = df.drop(columns = non_numerical_columns)

## Data Standardization

In [116]:
# Separate the response variable ('umidade') from the predictors (every other column)
y = df['umidade']
X = df.drop('umidade', axis=1)

In [117]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 42,
    test_size = 0.2,
)

In [118]:
# Define the scaler
# StandardScaler is created with its default settings; it is fitted on the training
# features in a later cell.
scaler = StandardScaler()

In [119]:
# Standardize the features.
# The scaler is fitted on the training split only; the test split is transformed with
# those same fitted parameters.
X_train_scaled = scaler.fit(X_train).transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [120]:
# Persist the fitted scaler to 'scaler.joblib' so new data can be scaled the same way
# after deployment
joblib.dump(value=scaler, filename='scaler.joblib')

['scaler.joblib']

In [121]:
# Define the model architecture.
# A typical architecture for regression problems where the goal is to predict a single
# continuous value from multiple input features: three ReLU hidden layers
# (64 -> 32 -> 16 units), Dropout(0.3) after the first two, and one output unit with no
# activation.
# The input width comes from X_train, the training frame produced by train_test_split;
# the previous reference to X_treino is never defined in this notebook and raised NameError.
model = Sequential([Dense(64, activation = 'relu', input_shape = (X_train.shape[1],)),
                         Dropout(0.3),
                         Dense(32, activation = 'relu'),
                         Dropout(0.3),
                         Dense(16, activation = 'relu'),
                         Dense(1)])

In [122]:
# Configure training: mean squared error as the loss, Adam as the optimizer,
# and mean absolute error reported as an additional metric
model.compile(
    loss = 'mse',
    optimizer = 'adam',
    metrics = ['mae'],
)

In [123]:
# early_stopping watches the validation loss (val_loss), halts training once it has failed
# to improve for 10 consecutive epochs, and restores the best weights.
# model_checkpoint writes the model to 'model.keras' and, because save_best_only is True,
# only overwrites that file when the monitored quantity improves.
early_stopping = EarlyStopping(
    monitor = 'val_loss',
    patience = 10,
    restore_best_weights = True,
)
model_checkpoint = ModelCheckpoint(
    'model.keras',
    save_best_only = True,
)

## Model training

In [124]:
# Print the layer-by-layer summary of the network.
# The model object is named `model` everywhere else in this notebook; the previous
# `modelo` is never defined and raised NameError.
model.summary()

In [125]:
# Fit the network on the standardized training data.
# 20% of the training rows are set aside for validation; training runs for at most
# 100 epochs in batches of 32, with the early-stopping and checkpoint callbacks attached.
history = model.fit(
    x = X_train_scaled,
    y = y_train,
    batch_size = 32,
    epochs = 100,
    validation_split = 0.2,
    callbacks = [early_stopping, model_checkpoint],
)

Epoch 1/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 86ms/step - loss: 5236.8613 - mae: 72.2318 - val_loss: 5207.2036 - val_mae: 72.0575
Epoch 2/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 5199.6670 - mae: 71.9695 - val_loss: 5190.2344 - val_mae: 71.9405
Epoch 3/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - loss: 5200.2153 - mae: 71.9681 - val_loss: 5172.9248 - val_mae: 71.8208
Epoch 4/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 5162.4868 - mae: 71.7037 - val_loss: 5154.5278 - val_mae: 71.6933
Epoch 5/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 29ms/step - loss: 5200.8691 - mae: 71.9785 - val_loss: 5134.8135 - val_mae: 71.5564
Epoch 6/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 5166.1260 - mae: 71.7447 - val_loss: 5113.4941 - val_mae: 71.4080
Epoch 7/100
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0

## Model Evaluation

In [126]:
# Evaluate the model on the test set.
# Uses the standardized test features and test targets created earlier (X_test_scaled,
# y_test); the previous names X_teste_scaled / y_teste are never defined in this notebook
# and raised NameError. The two returned values are unpacked as the test loss and test MAE.
teste_loss, teste_mae = model.evaluate(X_test_scaled, y_test)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 26ms/step - loss: 170.9769 - mae: 10.7771


In [127]:
# Report the test-set loss and MAE, one per line
print(
    f'Test Loss: {teste_loss}',
    f'Test MAE: {teste_mae}',
    sep = '\n',
)

Test Loss: 170.97691345214844
Test MAE: 10.77706527709961
