In [1]:
import numpy as np # for array operations
import pandas as pd # for working with DataFrames
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline

from sklearn.model_selection import train_test_split # for splitting the data
from sklearn.metrics import mean_squared_error # for calculating the cost function
from sklearn.ensemble import RandomForestRegressor # Import the model we are using


In [2]:
# Read the dissertation dataset into a DataFrame and preview its first rows.
# NOTE(review): this is an absolute, machine-specific path; the notebook only
# runs where this exact file location exists.
df = pd.read_csv(
    r"C:\Users\TOLS\OneDrive - Cardiff Metropolitan University\Dissertation\DISSERTATION DATA\DISSERTATION.csv"
)
df.head()

Unnamed: 0,year,country,maize_yield,pot_yield,wheat_yield,rice_yield,Production(Maize)Tonnes,Production Potatoes(Tonnes),Production Wheat(Tonnes),Production Rice(Tonnes),solar_rad,co2,temp,precip
0,1980,United States of America,57118,295245,22513,49461,168647008,13785000,64799504,6629250,894031,2713.57,4030.5,592.2
1,1981,United States of America,68380,308199,23229,54015,206222000,15450000,75806304,8289040,894031,2723.25,4784.5,629.3
2,1982,United States of America,71082,312544,23859,52791,209180000,16108500,75251296,6968900,851000,2734.78,4563.0,681.0
3,1983,United States of America,50893,299754,26509,51532,106030000,15137600,65857904,4523200,957440,2748.52,4722.5,941.0
4,1984,United States of America,66981,311082,26073,55523,194880000,16422000,70618000,6296300,800268,2761.06,4546.8,982.0


In [3]:
# Display the column labels of the loaded frame.
df.columns

Index(['year', 'country', 'maize_yield', 'pot_yield', 'wheat_yield',
       'rice_yield', 'Production(Maize)Tonnes', 'Production Potatoes(Tonnes)',
       'Production Wheat(Tonnes)', 'Production Rice(Tonnes)', 'solar_rad',
       'co2', 'temp', 'precip'],
      dtype='object')

## RANDOM FOREST REGRESSION FOR CROP MAIZE YIELD

In [4]:
# Maize subset: the yield target plus the four predictor columns
# (solar_rad, co2, temp, precip).
dfm = df[
    ['maize_yield', 'solar_rad', 'co2', 'temp', 'precip']
]
dfm.head(5)

Unnamed: 0,maize_yield,solar_rad,co2,temp,precip
0,57118,894031,2713.57,4030.5,592.2
1,68380,894031,2723.25,4784.5,629.3
2,71082,851000,2734.78,4563.0,681.0
3,50893,957440,2748.52,4722.5,941.0
4,66981,800268,2761.06,4546.8,982.0


In [5]:
# Split the maize frame into a float feature matrix and a one-column float
# target array (the double brackets keep the target 2-D).
X = dfm.drop(columns='maize_yield').to_numpy().astype(float)  # Features
y = dfm[['maize_yield']].to_numpy().astype(float)  # Target
X

array([[8.940310e+05, 2.713570e+03, 4.030500e+03, 5.922000e+02],
       [8.940310e+05, 2.723250e+03, 4.784500e+03, 6.293000e+02],
       [8.510000e+05, 2.734780e+03, 4.563000e+03, 6.810000e+02],
       [9.574400e+05, 2.748520e+03, 4.722500e+03, 9.410000e+02],
       [8.002680e+05, 2.761060e+03, 4.546800e+03, 9.820000e+02],
       [9.674170e+05, 2.773180e+03, 4.747400e+03, 6.544000e+02],
       [1.014520e+06, 2.783230e+03, 4.689800e+03, 4.707000e+02],
       [1.277547e+06, 2.797010e+03, 4.622600e+03, 5.948000e+02],
       [1.063344e+06, 2.815860e+03, 4.525900e+03, 5.231000e+02],
       [1.260174e+06, 2.827570e+03, 4.573100e+03, 1.060100e+03],
       [1.279369e+06, 2.836930e+03, 4.663600e+03, 7.700000e+02],
       [1.147922e+06, 2.848460e+03, 4.877400e+03, 5.254000e+02],
       [1.106325e+06, 2.854980e+03, 4.273000e+03, 6.694000e+02],
       [1.252046e+06, 2.859600e+03, 4.590500e+03, 9.044000e+02],
       [1.273503e+06, 2.873380e+03, 4.634500e+03, 8.351000e+02],
       [1.217411e+06, 2.8

In [6]:
### Standardization of data ###
from sklearn.preprocessing import StandardScaler

# Two independent scaler objects: one intended for the features, one for the target.
scaledX, scaledy = StandardScaler(), StandardScaler()

In [7]:
# Standardize features and target to zero mean / unit variance.
# Each array gets its own scaler: scaledX stays fitted on the features and
# scaledy on the target, so either can be reused later (e.g.
# scaledy.inverse_transform to map predictions back to original yield units).
# Fitting the target with scaledX would overwrite the feature statistics and
# leave scaledy unfitted.
# NOTE(review): both scalers are fitted on all rows before the train/test
# split, so test-row statistics influence the scaling.
X = scaledX.fit_transform(X)
y = scaledy.fit_transform(y)

In [8]:
# Collapse the one-column scaled target into a 1-D vector.
y = np.ravel(y)

In [9]:
# Split into training and test sets: test_size = 0.3 holds out 30% of the
# rows (a 70/30 split); random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)

In [10]:
# Random forest regressor (also imported in the first cell; repeated here).
from sklearn.ensemble import RandomForestRegressor

# Forest of 100 trees with a fixed seed.
# NOTE(review): the rice, wheat and potato sections use 1000 trees - confirm
# the smaller forest for maize is intentional.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=5,
)
# Fit on the training split only.
rf.fit(X_train, y_train)

In [11]:
# Predict the target for the held-out test rows. The model was trained on the
# standardized target, so predictions are in those standardized units.
y_pred = rf.predict(X_test)

In [12]:
from sklearn.metrics import mean_absolute_error as mae

# Mean absolute error between held-out targets and predictions. y was
# standardized earlier, so the value is in standardized units, not raw yield.
error = mae(y_true=y_test, y_pred=y_pred)
print("Mean absolute error : ", error, sep="")

Mean absolute error : 0.47634843544785926


In [13]:
# Root mean squared error on the held-out rows, rounded to three decimals
# by formatting and parsing back to float.
rmse = float(f"{np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")
print("\nRMSE: ", rmse)


RMSE:  0.581


In [14]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-Squared :", r2)

R-Squared : 0.765944466173723


In [15]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out rows (standardized units, unrounded).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Square Error :", mse)

Mean Square Error : 0.3380737312784421


## RANDOM FOREST REGRESSION FOR CROP RICE YIELD

In [16]:
# Rice subset: the yield target plus the four predictor columns
# (solar_rad, co2, temp, precip).
dfr = df[
    ['rice_yield', 'solar_rad', 'co2', 'temp', 'precip']
]
dfr.head(5)

Unnamed: 0,rice_yield,solar_rad,co2,temp,precip
0,49461,894031,2713.57,4030.5,592.2
1,54015,894031,2723.25,4784.5,629.3
2,52791,851000,2734.78,4563.0,681.0
3,51532,957440,2748.52,4722.5,941.0
4,55523,800268,2761.06,4546.8,982.0


In [17]:
# Split the rice frame into a float feature matrix and a one-column float
# target array (the double brackets keep the target 2-D).
X = dfr.drop(columns='rice_yield').to_numpy().astype(float)  # Features
y = dfr[['rice_yield']].to_numpy().astype(float)  # Target
X

array([[8.940310e+05, 2.713570e+03, 4.030500e+03, 5.922000e+02],
       [8.940310e+05, 2.723250e+03, 4.784500e+03, 6.293000e+02],
       [8.510000e+05, 2.734780e+03, 4.563000e+03, 6.810000e+02],
       [9.574400e+05, 2.748520e+03, 4.722500e+03, 9.410000e+02],
       [8.002680e+05, 2.761060e+03, 4.546800e+03, 9.820000e+02],
       [9.674170e+05, 2.773180e+03, 4.747400e+03, 6.544000e+02],
       [1.014520e+06, 2.783230e+03, 4.689800e+03, 4.707000e+02],
       [1.277547e+06, 2.797010e+03, 4.622600e+03, 5.948000e+02],
       [1.063344e+06, 2.815860e+03, 4.525900e+03, 5.231000e+02],
       [1.260174e+06, 2.827570e+03, 4.573100e+03, 1.060100e+03],
       [1.279369e+06, 2.836930e+03, 4.663600e+03, 7.700000e+02],
       [1.147922e+06, 2.848460e+03, 4.877400e+03, 5.254000e+02],
       [1.106325e+06, 2.854980e+03, 4.273000e+03, 6.694000e+02],
       [1.252046e+06, 2.859600e+03, 4.590500e+03, 9.044000e+02],
       [1.273503e+06, 2.873380e+03, 4.634500e+03, 8.351000e+02],
       [1.217411e+06, 2.8

In [18]:
### Standardization of data ###
from sklearn.preprocessing import StandardScaler

# Fresh scaler objects for this crop: one for the features, one for the target.
scaledX, scaledy = StandardScaler(), StandardScaler()

In [19]:
# Standardize features and target with their own scalers, then flatten the
# one-column target into the 1-D vector used for training.
# NOTE(review): the scalers are fitted on all rows before the train/test split.
X = scaledX.fit_transform(X)
y = scaledy.fit_transform(y).ravel()

In [20]:
# Split into training and test sets: test_size = 0.3 holds out 30% of the
# rows (a 70/30 split); random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)

In [21]:
# Random forest regressor (also imported in the first cell; repeated here).
from sklearn.ensemble import RandomForestRegressor

# Forest of 1000 trees with a fixed seed.
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=5,
)
# Fit on the training split only.
rf.fit(X_train, y_train)

In [22]:
# Predict the target for the held-out test rows. The model was trained on the
# standardized target, so predictions are in those standardized units.
y_pred = rf.predict(X_test)

In [23]:
from sklearn.metrics import mean_absolute_error as mae

# Mean absolute error between held-out targets and predictions, in the
# standardized units of the scaled target.
error = mae(y_true=y_test, y_pred=y_pred)
print("Mean absolute error : ", error, sep="")

Mean absolute error : 0.30029644736613725


In [24]:
# Root mean squared error on the held-out rows, rounded to three decimals
# by formatting and parsing back to float.
rmse = float(f"{np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")
print("\nRMSE: ", rmse)


RMSE:  0.383


In [25]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-Squared :", r2)

R-Squared : 0.8904419266634679


In [26]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out rows (standardized units, unrounded).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Square Error :", mse)

Mean Square Error : 0.14632398665717855


## RANDOM FOREST REGRESSION FOR CROP WHEAT YIELD

In [27]:
# Wheat subset: the yield target plus the four predictor columns
# (solar_rad, co2, temp, precip).
dfw = df[
    ['wheat_yield', 'solar_rad', 'co2', 'temp', 'precip']
]
dfw.head(5)

Unnamed: 0,wheat_yield,solar_rad,co2,temp,precip
0,22513,894031,2713.57,4030.5,592.2
1,23229,894031,2723.25,4784.5,629.3
2,23859,851000,2734.78,4563.0,681.0
3,26509,957440,2748.52,4722.5,941.0
4,26073,800268,2761.06,4546.8,982.0


In [28]:
# Split the wheat frame into a float feature matrix and a one-column float
# target array (the double brackets keep the target 2-D).
X = dfw.drop(columns='wheat_yield').to_numpy().astype(float)  # Features
y = dfw[['wheat_yield']].to_numpy().astype(float)  # Target
X

array([[8.940310e+05, 2.713570e+03, 4.030500e+03, 5.922000e+02],
       [8.940310e+05, 2.723250e+03, 4.784500e+03, 6.293000e+02],
       [8.510000e+05, 2.734780e+03, 4.563000e+03, 6.810000e+02],
       [9.574400e+05, 2.748520e+03, 4.722500e+03, 9.410000e+02],
       [8.002680e+05, 2.761060e+03, 4.546800e+03, 9.820000e+02],
       [9.674170e+05, 2.773180e+03, 4.747400e+03, 6.544000e+02],
       [1.014520e+06, 2.783230e+03, 4.689800e+03, 4.707000e+02],
       [1.277547e+06, 2.797010e+03, 4.622600e+03, 5.948000e+02],
       [1.063344e+06, 2.815860e+03, 4.525900e+03, 5.231000e+02],
       [1.260174e+06, 2.827570e+03, 4.573100e+03, 1.060100e+03],
       [1.279369e+06, 2.836930e+03, 4.663600e+03, 7.700000e+02],
       [1.147922e+06, 2.848460e+03, 4.877400e+03, 5.254000e+02],
       [1.106325e+06, 2.854980e+03, 4.273000e+03, 6.694000e+02],
       [1.252046e+06, 2.859600e+03, 4.590500e+03, 9.044000e+02],
       [1.273503e+06, 2.873380e+03, 4.634500e+03, 8.351000e+02],
       [1.217411e+06, 2.8

In [29]:
### Standardization of data ###
from sklearn.preprocessing import StandardScaler

# Fresh scaler objects for this crop: one for the features, one for the target.
scaledX, scaledy = StandardScaler(), StandardScaler()

In [30]:
# Standardize features and target with their own scalers, then flatten the
# one-column target into the 1-D vector used for training.
# NOTE(review): the scalers are fitted on all rows before the train/test split.
X = scaledX.fit_transform(X)
y = scaledy.fit_transform(y).ravel()

In [31]:
# Split into training and test sets: test_size = 0.3 holds out 30% of the
# rows (a 70/30 split); random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)

In [32]:
# Random forest regressor (also imported in the first cell; repeated here).
from sklearn.ensemble import RandomForestRegressor

# Forest of 1000 trees with a fixed seed.
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=5,
)
# Fit on the training split only.
rf.fit(X_train, y_train)

In [33]:
# Predict the target for the held-out test rows. The model was trained on the
# standardized target, so predictions are in those standardized units.
y_pred = rf.predict(X_test)

In [34]:
from sklearn.metrics import mean_absolute_error as mae

# Mean absolute error between held-out targets and predictions, in the
# standardized units of the scaled target.
error = mae(y_true=y_test, y_pred=y_pred)
print("Mean absolute error : ", error, sep="")

Mean absolute error : 0.6080248133560212


In [35]:
# Root mean squared error on the held-out rows, rounded to three decimals
# by formatting and parsing back to float.
rmse = float(f"{np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")
print("\nRMSE: ", rmse)


RMSE:  0.704


In [36]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R Squared :", r2)

R Squared : 0.5207873861050403


In [37]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out rows (standardized units, unrounded).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Square Error :", mse)

Mean Square Error : 0.4960997141457818


## RANDOM FOREST REGRESSION FOR CROP POTATO YIELD

In [38]:
# Potato subset: the yield target plus the four predictor columns
# (solar_rad, co2, temp, precip).
dfp = df[
    ['pot_yield', 'solar_rad', 'co2', 'temp', 'precip']
]
dfp.head(5)

Unnamed: 0,pot_yield,solar_rad,co2,temp,precip
0,295245,894031,2713.57,4030.5,592.2
1,308199,894031,2723.25,4784.5,629.3
2,312544,851000,2734.78,4563.0,681.0
3,299754,957440,2748.52,4722.5,941.0
4,311082,800268,2761.06,4546.8,982.0


In [39]:
# Split the potato frame into a float feature matrix and a one-column float
# target array (the double brackets keep the target 2-D).
X = dfp.drop(columns='pot_yield').to_numpy().astype(float)  # Features
y = dfp[['pot_yield']].to_numpy().astype(float)  # Target
X

array([[8.940310e+05, 2.713570e+03, 4.030500e+03, 5.922000e+02],
       [8.940310e+05, 2.723250e+03, 4.784500e+03, 6.293000e+02],
       [8.510000e+05, 2.734780e+03, 4.563000e+03, 6.810000e+02],
       [9.574400e+05, 2.748520e+03, 4.722500e+03, 9.410000e+02],
       [8.002680e+05, 2.761060e+03, 4.546800e+03, 9.820000e+02],
       [9.674170e+05, 2.773180e+03, 4.747400e+03, 6.544000e+02],
       [1.014520e+06, 2.783230e+03, 4.689800e+03, 4.707000e+02],
       [1.277547e+06, 2.797010e+03, 4.622600e+03, 5.948000e+02],
       [1.063344e+06, 2.815860e+03, 4.525900e+03, 5.231000e+02],
       [1.260174e+06, 2.827570e+03, 4.573100e+03, 1.060100e+03],
       [1.279369e+06, 2.836930e+03, 4.663600e+03, 7.700000e+02],
       [1.147922e+06, 2.848460e+03, 4.877400e+03, 5.254000e+02],
       [1.106325e+06, 2.854980e+03, 4.273000e+03, 6.694000e+02],
       [1.252046e+06, 2.859600e+03, 4.590500e+03, 9.044000e+02],
       [1.273503e+06, 2.873380e+03, 4.634500e+03, 8.351000e+02],
       [1.217411e+06, 2.8

In [40]:
### Standardization of data ###
from sklearn.preprocessing import StandardScaler

# Fresh scaler objects for this crop: one for the features, one for the target.
scaledX, scaledy = StandardScaler(), StandardScaler()

In [41]:
# Standardize features and target with their own scalers, then flatten the
# one-column target into the 1-D vector used for training.
# NOTE(review): the scalers are fitted on all rows before the train/test split.
X = scaledX.fit_transform(X)
y = scaledy.fit_transform(y).ravel()

In [42]:
# Split into training and test sets: test_size = 0.3 holds out 30% of the
# rows (a 70/30 split); random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5
)

In [43]:
# Random forest regressor (also imported in the first cell; repeated here).
from sklearn.ensemble import RandomForestRegressor

# Forest of 1000 trees with a fixed seed.
rf = RandomForestRegressor(
    n_estimators=1000,
    random_state=5,
)
# Fit on the training split only; the trailing semicolon hides the estimator repr.
rf.fit(X_train, y_train);

In [44]:
# Predict the target for the held-out test rows. The model was trained on the
# standardized target, so predictions are in those standardized units.
y_pred = rf.predict(X_test)

In [45]:
from sklearn.metrics import mean_absolute_error as mae

# Mean absolute error between held-out targets and predictions, in the
# standardized units of the scaled target.
error = mae(y_true=y_test, y_pred=y_pred)
print("Mean absolute error : ", error, sep="")

Mean absolute error : 0.16091194796397812


In [46]:
# Root mean squared error on the held-out rows, rounded to three decimals
# by formatting and parsing back to float.
rmse = float(f"{np.sqrt(mean_squared_error(y_test, y_pred)):.3f}")
print("\nRMSE: ", rmse)


RMSE:  0.195


In [47]:
from sklearn.metrics import r2_score

# Coefficient of determination of the predictions on the held-out rows.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R-Squared :", r2)

R-Squared : 0.9697945016779979


In [48]:
from sklearn.metrics import mean_squared_error

# Mean squared error on the held-out rows (standardized units, unrounded).
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print("Mean Square Error :", mse)

Mean Square Error : 0.038120850906141215
