# Train the model

In [None]:
import numpy as np #importing libraries
import pandas as pd

In [None]:
# Read the training and test sets from CSV, then report each frame's (rows, columns)
train = pd.read_csv("model_train.csv")
test = pd.read_csv('model_test_data.csv')
for frame in (train, test):
    print(frame.shape)



(12269895, 12)
(12599384, 13)


In [None]:
# Preview the first rows of the test set to inspect its columns
test.head()

Unnamed: 0,date,id,farm_area,temp_obs,cloudiness,wind_direction,dew_temp,pressure_sea_level,precipitation,wind_speed,year,month,day
0,2017-01-01 00:00:00,0,690.4551,17.8,4.0,100.0,11.7,1021.4,0.0,3.6,2017,1,1
1,2017-01-01 00:00:00,1,252.69617,17.8,4.0,100.0,11.7,1021.4,0.0,3.6,2017,1,1
2,2017-01-01 00:00:00,2,499.44653,17.8,4.0,100.0,11.7,1021.4,0.0,3.6,2017,1,1
3,2017-01-01 00:00:00,3,2200.4075,17.8,4.0,100.0,11.7,1021.4,0.0,3.6,2017,1,1
4,2017-01-01 00:00:00,4,10833.14,17.8,4.0,100.0,11.7,1021.4,0.0,3.6,2017,1,1


Store the `test['id']` values in `ids`; they are needed later to build the submission files.

In [None]:
# Keep the test ids aside: every submission file pairs them with a model's predictions
ids = test.loc[:, 'id']
ids.shape

(12599384,)

Dropping 'id' and 'date' from the test dataset

In [None]:
# Feature matrix for the final predictions: the test set without its 'id' and 'date' columns
test_df = test.drop(columns=['id', 'date'])

In [None]:
test_df.shape  # shape of the test features once 'id' and 'date' are removed

(12599384, 11)

**Split the data into training and validation sets, and train the model on the training set.**

In [None]:
# Separate the target column ('yield') from the remaining feature columns
y = train['yield']
x = train.drop(columns=['yield'])

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for validation; the fixed random_state makes the split repeatable
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=50)
for part in (x_train, x_test, y_train, y_test):
    print(part.shape)

(8588926, 11)
(3680969, 11)
(8588926,)
(3680969,)


# Linear Regression

In [None]:
%%time
from sklearn.linear_model import LinearRegression
lr= LinearRegression()
lr.fit(x_train,y_train)
predtrain=lr.predict(x_train)
predtest=lr.predict(x_test)

Wall time: 3.33 s


In [None]:
# print(lr.coef_)

In [None]:
from sklearn.metrics import mean_squared_error

# Root-mean-squared error of the linear regression on each split
for label, actual, predicted in (
    ("RMSE for train:", y_train, predtrain),
    ("RMSE for test:", y_test, predtest),
):
    print(label, np.sqrt(mean_squared_error(actual, predicted)))



RMSE for train: 320.9381155406899
RMSE for test: 317.9513720347462


# DecisionTreeRegressor

In [None]:
%%time
from sklearn.tree import DecisionTreeRegressor
d=DecisionTreeRegressor()
d.fit(x_train,y_train)

Wall time: 1min 48s


DecisionTreeRegressor()

In [None]:
# Predictions of the decision tree `d` on the training and validation splits
d_predtrain, d_predtest = d.predict(x_train), d.predict(x_test)

In [None]:
# Root-mean-squared error of the decision tree on each split
for label, actual, predicted in (
    ("RMSE for train:", y_train, d_predtrain),
    ("RMSE for test:", y_test, d_predtest),
):
    print(label, np.sqrt(mean_squared_error(actual, predicted)))

RMSE for train: 46.506993233336075
RMSE for test: 142.1944527858935


# Ridge

In [None]:
%%time
from sklearn.linear_model import Ridge
rig=Ridge()
rig.fit(x_train,y_train)
lr_predtrain=rig.predict(x_train)
lr_predtest=rig.predict(x_test)

Wall time: 2.14 s


In [None]:
# Root-mean-squared error of the Ridge model on each split
for label, actual, predicted in (
    ("RMSE for train:", y_train, lr_predtrain),
    ("RMSE for test:", y_test, lr_predtest),
):
    print(label, np.sqrt(mean_squared_error(actual, predicted)))

RMSE for train: 320.93811554413406
RMSE for test: 317.9513710501007


In [None]:
# Search space for tuning the decision tree:
#   max_depth             - candidate tree depths 1 through 9
#   min_impurity_decrease - a node is split only if the split induces a decrease of
#                           the impurity greater than or equal to this value
param_grid = dict(
    max_depth=range(1, 10),
    min_impurity_decrease=[0.0001, 0.01],
)

# Hyper-parameter tuning of DecisionTreeRegressor with GridSearchCV


In [None]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `param_grid` with 3-fold cross-validation,
# using the decision tree `d` as the base estimator
gdt = GridSearchCV(d, param_grid, cv=3)


In [None]:
# Run the grid search on the training split and time it
%time gdt.fit(x_train,y_train)

Wall time: 22min 18s


GridSearchCV(cv=3, estimator=DecisionTreeRegressor(),
             param_grid={'max_depth': range(1, 10),
                         'min_impurity_decrease': [0.0001, 0.01]})

In [None]:
# Predictions of the fitted grid search on the training and validation splits
gdt_predtrain, gdt_predtest = gdt.predict(x_train), gdt.predict(x_test)

In [None]:
# Root-mean-squared error of the grid-searched decision tree on each split
for label, actual, predicted in (
    ("RMSE for train:", y_train, gdt_predtrain),
    ("RMSE for test:", y_test, gdt_predtest),
):
    print(label, np.sqrt(mean_squared_error(actual, predicted)))

RMSE for train: 193.5450690914143
RMSE for test: 191.69329455432023


# RandomForestRegressor

RandomForestRegressor is trained on only the first few chunks of the training file, read chunk by chunk, to keep its running time manageable.

In [None]:
%%time
from sklearn.ensemble import RandomForestRegressor

# Define the chunk size and number of iterations
chunk_size = 50000
n_iterations = 5

# Create an empty Random Forest model
rf = RandomForestRegressor(n_estimators=100)

# Load the data in chunks
for i, chunk in enumerate(pd.read_csv('/content/model_train.csv', chunksize=chunk_size)):
    # Split the data into features and target
    X_chunk = chunk.drop('yield', axis=1)
    Y_chunk = chunk['yield']
    
    # Concatenate the chunks into X and Y
    if i == 0:
        X = X_chunk
        Y = Y_chunk
    else:
        X = pd.concat([X, X_chunk], axis=0)
        Y = pd.concat([Y, Y_chunk], axis=0)
    
    # If we've processed enough chunks, split into train and test sets
    if i == n_iterations:
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3)
        
        # Train the Random Forest model on the training set
        rf.fit(X_train, Y_train)
        
       
        # Break out of the loop
        break


Wall time: 3min 47s


In [None]:
# Predictions of the random forest on its own training / validation subsample
r_predtrain, r_predtest = rf.predict(X_train), rf.predict(X_test)


In [None]:
# Root-mean-squared error of the random forest, measured on the split it was given
# (Y_train / Y_test), not on the y_train / y_test used by the other models
for label, actual, predicted in (
    ("RMSE for train:", Y_train, r_predtrain),
    ("RMSE for test:", Y_test, r_predtest),
):
    print(label, np.sqrt(mean_squared_error(actual, predicted)))

RMSE for train: 6.344596419857986
RMSE for test: 15.200913402054711


# GradientBoostingRegressor

In [None]:
%%time
from sklearn.ensemble import  GradientBoostingRegressor
grad = GradientBoostingRegressor(n_estimators=100)
grad.fit(x_train,y_train)


Wall time: 22min 27s


GradientBoostingRegressor()

In [None]:
# Predictions of the gradient boosting model on the training and validation splits
grad_predtrain, grad_predtest = grad.predict(x_train), grad.predict(x_test)

In [None]:
# Root-mean-squared error of the gradient boosting model on each split
for label, actual, predicted in (
    ("RMSE for train:", y_train, grad_predtrain),
    ("RMSE for test:", y_test, grad_predtest),
):
    print(label, np.sqrt(mean_squared_error(actual, predicted)))

RMSE for train: 241.93764879047143
RMSE for test: 239.24481632092332


# Neural Networks

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Dense, Dropout


In [None]:
# Fully connected regression network, declared as a list of layers:
#   - three hidden layers of 64 ReLU units each (the first one also fixes the
#     input width at 11 values per row)
#   - one linear output unit producing the predicted value
# Every layer uses the 'normal' kernel initializer.
model = Sequential([
    Dense(64, activation = 'relu', input_dim = 11, kernel_initializer='normal'),
    Dense(units = 64, activation = 'relu', kernel_initializer='normal'),
    Dense(units = 64, activation = 'relu', kernel_initializer='normal'),
    Dense(units = 1, activation='linear', kernel_initializer='normal'),
])

In [None]:
%%time
model.compile(optimizer = 'adam', loss = 'mean_squared_error')

# Fitting the ANN to the Training set
model.fit(x_train, y_train, batch_size = 1000, epochs = 50)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Wall time: 25min 47s


<keras.callbacks.History at 0x1ce60e67730>

In [None]:
# Predictions of the neural network on the training and validation splits
nn_predtrain, nn_predtest = model.predict(x_train), model.predict(x_test)



In [None]:
# Root-mean-squared error of the neural network on each split
for label, actual, predicted in (
    ("RMSE for train:", y_train, nn_predtrain),
    ("RMSE for test:", y_test, nn_predtest),
):
    print(label, np.sqrt(mean_squared_error(actual, predicted)))

RMSE for train: 310.61114637703633
RMSE for test: 307.81549174129094


# Evaluate the model

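Evaluating the performance of each model on the validation set by calculating its RMSE on the training and validation splits. Note that the RandomForestRegressor row is computed on its own, smaller split (`X_train`/`X_test`, built from the first chunks of the training file), so it is not directly comparable with the other rows.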

In [None]:
# One entry per model:
# (label, train targets, train predictions, validation targets, validation predictions).
# The random forest uses Y_train / Y_test because it was fitted and evaluated on its
# own split; every other model uses y_train / y_test.
rmse_inputs = [
    ('LinearRegressor', y_train, predtrain, y_test, predtest),
    ('Ridge', y_train, lr_predtrain, y_test, lr_predtest),
    ('DecisionTreeRegressor', y_train, d_predtrain, y_test, d_predtest),
    ('RandomForestRegressor', Y_train, r_predtrain, Y_test, r_predtest),
    ('Decision_grid_search', y_train, gdt_predtrain, y_test, gdt_predtest),
    ('Gradient_Boosting', y_train, grad_predtrain, y_test, grad_predtest),
    ('Neural Networks', y_train, nn_predtrain, y_test, nn_predtest),
]

# Summary table of train / validation RMSE, one row per model
data = pd.DataFrame({
    "Model": [row[0] for row in rmse_inputs],
    "RMSE_score_train": [np.sqrt(mean_squared_error(row[1], row[2])) for row in rmse_inputs],
    "RMSE_score_test": [np.sqrt(mean_squared_error(row[3], row[4])) for row in rmse_inputs],
})
data

Unnamed: 0,Model,RMSE_score_train,RMSE_score_test
0,LinearRegressor,320.938116,317.951372
1,Ridge,320.938116,317.951371
2,DecisioTreeRegressor,46.506993,142.194453
3,RandomForestRegressor,6.344596,15.200913
4,Decision_grid_search,193.545069,191.693295
5,Gradient_Boosting,241.937649,239.244816
6,Neural Networks,310.611146,307.815492


# LSTM (Long Short-Term Memory)

It was not run because of its long running time and the memory issues it caused.

In [None]:
# import pandas as pd
# import numpy as np
# from sklearn.preprocessing import MinMaxScaler
# from keras.models import Sequential
# from keras.layers import Dense, LSTM, BatchNormalization, Dropout
# from sklearn.metrics import mean_squared_error
# import matplotlib.pyplot as plt

# # Load the data
# # df = pd.read_csv('data.csv')
# # df = df.set_index('date')

# # # Scale the data
# # scaler = MinMaxScaler(feature_range=(0, 1))
# # scaled_data = scaler.fit_transform(df.values)

# # Define the time steps and features
# timesteps = 3
# features = train.shape[1]

# # Split the data into train and test sets
# # train_size = int(len(scaled_data) * 0.8)
# train_data = x
# test_data =  Y_chunk


# # Reshape the training data to have the same shape as the output
# train_data = np.array(train_data).reshape(-1, timesteps, features)

# # Create the time-series generator objects
# from keras.preprocessing.sequence import TimeseriesGenerator
# train_generator = TimeseriesGenerator(train_data, train_data, length=timesteps, batch_size=32)

# # Define the LSTM model
# model = Sequential()
# model.add(LSTM(64, return_sequences=True, input_shape=(timesteps, features)))
# model.add(BatchNormalization())
# model.add(LSTM(64, return_sequences=True))
# model.add(BatchNormalization())
# model.add(Dropout(0.2))
# model.add(Dense(features))

# # Compile the model
# model.compile(loss='mean_squared_error', optimizer='adam')

# # Fit the model
# model.fit(train_generator, epochs=50, verbose=2)

# # Predict the next year's yield using the last 3 months of data
# last_3_months = Y_chunk.values.reshape(1, timesteps, features)
# predicted_yield = []
# for i in range(12):
#     next_month_yield = model.predict(last_3_months)[0][-1]
#     predicted_yield.append(next_month_yield[0])
#     last_3_months = np.append(last_3_months[:, 1:, :], [[next_month_yield]], axis=1)

# # Inverse scale the predicted yield values
# predicted_yield = scaler.inverse_transform(np.array(predicted_yield).reshape(-1, 1)).reshape(-1)

# # Calculate RMSE value
# actual_yield = train['yield'].values
# rmse = np.sqrt(mean_squared_error(actual_yield, predicted_yield))
# print(f'RMSE value: {rmse}')




# Predictions on Test data

In [None]:
# Predict on the test features with the linear regression model
test_perd_lr = lr.predict(test_df)

# Pair every prediction with its id and write the table without the index column
sub_lr = pd.DataFrame(data={"id": ids, 'yield': test_perd_lr})
sub_lr.to_csv("submission_lr.csv", index=False)


In [None]:
# Read the saved linear-regression submission back and show its first rows
sub_lr=pd.read_csv("submission_lr.csv")
sub_lr.head()

In [None]:
# Predict on the test features with the decision tree `d`
test_perd_d = d.predict(test_df)

# Pair every prediction with its id and write the table without the index column
sub_d = pd.DataFrame(data={"id": ids, 'yield': test_perd_d})
sub_d.to_csv("submission_d.csv", index=False)


In [None]:
# Read the saved decision-tree submission back and show its first rows
sub_d=pd.read_csv("submission_d.csv")
sub_d.head()

In [None]:
# Predict on the test features with the random forest
test_perd_rf = rf.predict(test_df)

# Pair every prediction with its id and write the table without the index column
sub_rf = pd.DataFrame(data={"id": ids, 'yield': test_perd_rf})
sub_rf.to_csv("submission_rf.csv", index=False)


In [None]:
# Read the saved random-forest submission back and show its first rows
sub_rf=pd.read_csv("submission_rf.csv")
sub_rf.head()

In [None]:
# Predict on the test features with the gradient boosting model
test_perd_grad = grad.predict(test_df)

# Pair every prediction with its id and write the table without the index column
sub_grad = pd.DataFrame(data={"id": ids, 'yield': test_perd_grad})
sub_grad.to_csv("submission_grad.csv", index=False)


In [None]:
# Read the saved gradient-boosting submission back and show its first rows
sub_grad=pd.read_csv("submission_grad.csv")
sub_grad.head()

In [None]:
# test_perd_nn=model.predict(test_df)


In [None]:
# Predict on the test features with the neural network. The prediction is computed
# in this cell because its only other assignment in the notebook is commented out,
# which left `test_perd_nn` undefined on a fresh kernel (NameError).
test_perd_nn = model.predict(test_df)

# The network returns one column per row; flatten it to 1-D before pairing with the ids,
# then write the table without the index column
sub_nn = pd.DataFrame({"id": ids, 'yield': test_perd_nn.reshape(-1)})
sub_nn.to_csv("submission_nn.csv", index=False)
# print(test_df.shape)
# print(ids.shape)


In [None]:
# Read the saved neural-network submission back and show its first rows
sub_nn=pd.read_csv("submission_nn.csv")
sub_nn.head()

# Conclusion

The successful implementation of Random Forest Regression in the development of a Robust Yield Prediction model using a dataset that includes various parameters related to yield has demonstrated the efficacy of Artificial Intelligence in the agricultural sector. This highlights the current trend of using AI techniques and methodologies to enhance accuracy and identify solutions in agriculture. Furthermore, the versatile nature of machine learning algorithms allows for their implementation in various fields, including agriculture.