In [1]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# Model and evaluation
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import explained_variance_score, mean_squared_error

# Model persistence
import joblib


In [2]:
# Load the dataset into a DataFrame.
# pandas is already imported as `pd` in the notebook's import cell, so it is
# not re-imported here.
file_path = '/content/drive/MyDrive/Prediction/crop_yield_dataset.csv'
df = pd.read_csv(file_path)

# Display the first few rows of the dataset as a quick sanity check
print(df.head())


  soil_type crop_type     rainfall  temperature   latitude   longitude  \
0     Silty      Rice   671.070231    27.159876  26.686252   79.296453   
1      Clay     Maize   644.561838    34.162970 -58.970455   67.421882   
2     Loamy   Cassava  1831.258245    32.804828  67.031021 -145.528489   
3     Silty     Maize   649.183160    14.616997  20.360923  152.126066   
4     Silty     Maize   689.509507    14.477484 -61.703301   24.649993   

    nitrogen  phosphorus  potassium  fertilizer  predicted_yield  
0  37.364082   65.430632   7.317510    1.320483         2.892431  
1  33.291210    8.003257   8.947838    0.099048         2.227836  
2  17.615391   24.232981  65.197371  141.679038         4.979721  
3  60.726667   77.367935  48.694071    8.788112         3.174908  
4  47.662416   52.868585  79.041525  292.359766         3.830798  


In [None]:
# Count the null entries in every column of the loaded frame
missing_values = df.isna().sum()

# Report the per-column counts
print("Missing values per column:")
print(missing_values)

# Summarise: is there at least one null anywhere in the dataset?
has_missing = missing_values.sum() > 0
print(
    "\nThere are missing values in the dataset."
    if has_missing
    else "\nNo missing values found in the dataset."
)


Missing values per column:
soil_type          0
crop_type          0
rainfall           0
temperature        0
latitude           0
longitude          0
nitrogen           0
phosphorus         0
potassium          0
fertilizer         0
predicted_yield    0
dtype: int64

No missing values found in the dataset.


In [None]:
# Integer code assigned to each soil type label
soil_type_mapping = {
    "Loamy": 1,
    "Sandy": 2,
    "Clay": 3,
    "Silty": 4
}

# Integer code assigned to each crop type label
crop_type_mapping = {
    "Yam": 1,
    "Cassava": 2,
    "Rice": 3,
    "Maize": 4
}


def _encode_labels(values, mapping, column_name):
    """Return `values` with each label replaced by its code from `mapping`.

    Args:
        values: pandas Series holding the labels (or the codes, if this cell
            has already been run once).
        mapping: dict from label to integer code.
        column_name: name of the column, used only in the error message.

    Returns:
        A Series of codes. If every entry of `values` is already one of the
        codes in `mapping`, `values` is returned unchanged, so re-running the
        cell does not turn the column into NaN.

    Raises:
        ValueError: if any entry has no code in `mapping`.
    """
    # Already encoded on an earlier run of this cell: nothing to do
    if values.isin(list(mapping.values())).all():
        return values

    encoded = values.map(mapping)

    # Series.map yields NaN for keys missing from the dict; fail loudly
    # instead of letting NaN flow silently into the model
    unmapped = values[encoded.isna()].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Unmapped {column_name} values: {sorted(map(str, unmapped))}"
        )
    return encoded


# Map the values using the dictionaries
df["soil_type"] = _encode_labels(df["soil_type"], soil_type_mapping, "soil_type")
df["crop_type"] = _encode_labels(df["crop_type"], crop_type_mapping, "crop_type")

# Display the updated dataset
print(df.head())


   soil_type  crop_type     rainfall  temperature   latitude   longitude  \
0          4          3   671.070231    27.159876  26.686252   79.296453   
1          3          4   644.561838    34.162970 -58.970455   67.421882   
2          1          2  1831.258245    32.804828  67.031021 -145.528489   
3          4          4   649.183160    14.616997  20.360923  152.126066   
4          4          4   689.509507    14.477484 -61.703301   24.649993   

    nitrogen  phosphorus  potassium  fertilizer  predicted_yield  
0  37.364082   65.430632   7.317510    1.320483         2.892431  
1  33.291210    8.003257   8.947838    0.099048         2.227836  
2  17.615391   24.232981  65.197371  141.679038         4.979721  
3  60.726667   77.367935  48.694071    8.788112         3.174908  
4  47.662416   52.868585  79.041525  292.359766         3.830798  


In [None]:
# Show the DataFrame as this cell's output (the bare last expression is displayed)
df

Unnamed: 0,soil_type,crop_type,rainfall,temperature,latitude,longitude,nitrogen,phosphorus,potassium,fertilizer,predicted_yield
0,4,3,671.070231,27.159876,26.686252,79.296453,37.364082,65.430632,7.317510,1.320483,2.892431
1,3,4,644.561838,34.162970,-58.970455,67.421882,33.291210,8.003257,8.947838,0.099048,2.227836
2,1,2,1831.258245,32.804828,67.031021,-145.528489,17.615391,24.232981,65.197371,141.679038,4.979721
3,4,4,649.183160,14.616997,20.360923,152.126066,60.726667,77.367935,48.694071,8.788112,3.174908
4,4,4,689.509507,14.477484,-61.703301,24.649993,47.662416,52.868585,79.041525,292.359766,3.830798
...,...,...,...,...,...,...,...,...,...,...,...
1995,1,4,995.792662,30.533061,-40.831268,128.756156,0.464448,42.433814,0.759142,186.556826,4.019999
1996,2,4,801.922122,25.096612,-52.286540,143.103181,87.366866,91.569135,57.267058,228.159661,2.485405
1997,1,2,910.230168,32.954465,-8.040825,160.814849,30.732518,49.793805,71.718689,217.707555,2.980787
1998,4,2,1153.893056,24.558719,73.518759,-36.904323,93.858886,39.883661,74.508017,207.609208,3.912395


In [None]:
# Separate the target column from the predictors
y = df["predicted_yield"]              # Target
X = df.drop("predicted_yield", axis=1)  # Every remaining column is a feature

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows/columns ended up in each partition
print(f"Training set size: {X_train.shape}")
print(f"Testing set size: {X_test.shape}")


Training set size: (1600, 10)
Testing set size: (400, 10)


In [None]:
# Split the feature columns into the two groups the preprocessor handles
categorical_columns = ["soil_type", "crop_type"]
numeric_columns = X.columns.difference(categorical_columns)

# Numeric columns pass through untouched; categorical columns are one-hot encoded
preprocessor = ColumnTransformer(
    transformers=[
        ("num", "passthrough", numeric_columns),
        ("cat", OneHotEncoder(), categorical_columns),
    ]
)

# Random Forest regressor with a fixed seed
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Chain preprocessing and the regressor so both are fitted and applied together
pipeline = Pipeline(steps=[("preprocessor", preprocessor), ("model", model)])

# Fit the whole pipeline on the training partition
pipeline.fit(X_train, y_train)

# Confirm that fitting finished
print("Model training completed.")


Model training completed.


In [None]:
# Predict the target for the held-out test rows with the fitted pipeline
y_pred = pipeline.predict(X_test)

# Compare predictions against the true test targets
mae = mean_absolute_error(y_test, y_pred)  # Mean Absolute Error (MAE)
r2 = r2_score(y_test, y_pred)              # R² Score

# Report both metrics
print(f"Mean Absolute Error: {mae}")
print(f"R² Score: {r2}")


Mean Absolute Error: 0.531987891759175
R² Score: 0.7534891712490033


In [None]:
# Extended evaluation of the fitted pipeline on the test set.
# All metric functions come from the notebook's import cell, so this cell no
# longer imports anything itself.

# Make predictions on the test set
y_pred = pipeline.predict(X_test)

# Calculate Mean Absolute Error (MAE)
mae = mean_absolute_error(y_test, y_pred)
print(f"Mean Absolute Error: {mae}")

# Calculate R² Score
r2 = r2_score(y_test, y_pred)
print(f"R² Score: {r2}")

# Calculate Mean Squared Error (MSE) with the library metric, consistent with
# the other scores in this cell
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error: {mse}")

# Calculate Root Mean Squared Error (RMSE) as the square root of the MSE
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")

# Calculate Explained Variance Score
evs = explained_variance_score(y_test, y_pred)
print(f"Explained Variance Score: {evs}")


Mean Absolute Error: 0.531987891759175
R² Score: 0.7534891712490033
Mean Squared Error: 0.3839535595848623
Root Mean Squared Error: 0.6196398628113449
Explained Variance Score: 0.7540257944552557


In [None]:
# Save the entire fitted pipeline (preprocessor + model) to a single file so
# it can be reloaded and used for prediction without re-fitting.
# `joblib` is imported in the notebook's import cell.
joblib.dump(pipeline, 'crop_yield_model.pkl')

print("Model saved successfully!")


Model saved successfully!
