In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
data = pd.read_csv('crop_yield.csv')

In [3]:
len(data)

1000000

In [None]:
# the dataset has 1 Million tuples. These are too many tuples. It can create some issues such as --->
# Memory Overload, OneHotEncoder explosion, Computational bottlenecks, Resource Intensive.
# Hence this data set has been shortened to 10,000 tuples only. This is also more than sufficient for our purpose

In [7]:
data[:10000].to_csv(path_or_buf='shortened_crop_yield.csv')

In [6]:
# Load data
df = pd.read_csv('shortened_crop_yield.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Region,Soil_Type,Crop,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest,Yield_tons_per_hectare
0,0,West,Sandy,Cotton,897.077239,27.676966,False,True,Cloudy,122,6.555816
1,1,South,Clay,Rice,992.673282,18.026142,True,True,Rainy,140,8.527341
2,2,North,Loam,Barley,147.998025,29.794042,False,False,Sunny,106,1.127443
3,3,North,Sandy,Soybean,986.866331,16.64419,False,True,Rainy,146,6.517573
4,4,South,Silt,Wheat,730.379174,31.620687,True,True,Cloudy,110,7.248251


In [7]:
df.drop(labels = 'Unnamed: 0',axis = 1,inplace=True)
df.head()

Unnamed: 0,Region,Soil_Type,Crop,Rainfall_mm,Temperature_Celsius,Fertilizer_Used,Irrigation_Used,Weather_Condition,Days_to_Harvest,Yield_tons_per_hectare
0,West,Sandy,Cotton,897.077239,27.676966,False,True,Cloudy,122,6.555816
1,South,Clay,Rice,992.673282,18.026142,True,True,Rainy,140,8.527341
2,North,Loam,Barley,147.998025,29.794042,False,False,Sunny,106,1.127443
3,North,Sandy,Soybean,986.866331,16.64419,False,True,Rainy,146,6.517573
4,South,Silt,Wheat,730.379174,31.620687,True,True,Cloudy,110,7.248251


In [8]:
# Separate features and target
X = df.drop('Yield_tons_per_hectare', axis=1)
y = df['Yield_tons_per_hectare']

In [9]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [10]:
# Preprocessing pipeline
categorical_features = ['Region', 'Soil_Type', 'Crop', 'Weather_Condition']
numerical_features = ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest']
bool_features = ['Fertilizer_Used', 'Irrigation_Used']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))]), categorical_features),
        ('num', SimpleImputer(strategy='median'), numerical_features),
        ('bool', 'passthrough', bool_features)
    ])

In [11]:
# Model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [12]:
# Train model
model.fit(X_train, y_train)

In [14]:
# Evaluate
y_pred = model.predict(X_test)
print(f"MSE: {mean_squared_error(y_test, y_pred)}")
print(f"MAE: {mean_absolute_error(y_test, y_pred)}")
print(f"R²: {r2_score(y_test, y_pred)}")

MSE: 0.25535303624926164
MAE: 0.4031354184065505
R²: 0.9148254772404703


In [15]:
# Prediction function
def predict_yield():
    features = {
        'Region': input("Enter Region (West/South/North/East): ").strip(),
        'Soil_Type': input("Enter Soil Type (Sandy/Clay/Loam/etc.): ").strip(),
        'Crop': input("Enter Crop: ").strip(),
        'Rainfall_mm': input("Enter Rainfall (mm): ").strip(),
        'Temperature_Celsius': input("Enter Temperature (°C): ").strip(),
        'Fertilizer_Used': input("Fertilizer Used (True/False): ").strip(),
        'Irrigation_Used': input("Irrigation Used (True/False): ").strip(),
        'Weather_Condition': input("Weather Condition (Sunny/Rainy/Cloudy): ").strip(),
        'Days_to_Harvest': input("Days to Harvest: ").strip()
    }
    
    # Handle missing/wrong values
    for key in ['Rainfall_mm', 'Temperature_Celsius', 'Days_to_Harvest']:
        if not features[key]:
            features[key] = np.nan
        else:
            try:
                features[key] = float(features[key])
            except:
                print(f"Invalid value for {key}, using NaN")
                features[key] = np.nan
                
    for key in ['Fertilizer_Used', 'Irrigation_Used']:
        if features[key].lower() in ['true', 't', '1']:
            features[key] = True
        elif features[key].lower() in ['false', 'f', '0']:
            features[key] = False
        else:
            print(f"Invalid boolean value for {key}, using False")
            features[key] = False

    # Create DataFrame and predict
    input_df = pd.DataFrame([features])
    try:
        prediction = model.predict(input_df)
        print(f"\nPredicted Yield: {prediction[0]:.2f} tons/hectare")
    except Exception as e:
        print(f"Prediction failed: {str(e)}")

In [None]:
# In the below run of the above function, all values have been provided. And each feature has been provided
# correct values.

In [21]:
# Example usage
predict_yield()

Enter Region (West/South/North/East):  North
Enter Soil Type (Sandy/Clay/Loam/etc.):  Sandy
Enter Crop:  Cotton
Enter Rainfall (mm):  150
Enter Temperature (°C):  27
Fertilizer Used (True/False):  true
Irrigation Used (True/False):  false
Weather Condition (Sunny/Rainy/Cloudy):  Rainy
Days to Harvest:  78



Predicted Yield: 2.75 tons/hectare


In [None]:
# In the below run of the above function, no value has been provided to feature 'Region'. It becomes a NULL value. 
# Every other feature has been provided correct values.

In [22]:
# Example usage
predict_yield()

Enter Region (West/South/North/East):  
Enter Soil Type (Sandy/Clay/Loam/etc.):  Loam
Enter Crop:  Corn
Enter Rainfall (mm):  135.632
Enter Temperature (°C):  32
Fertilizer Used (True/False):  false
Irrigation Used (True/False):  false
Weather Condition (Sunny/Rainy/Cloudy):  Cloudy
Days to Harvest:  23



Predicted Yield: 1.34 tons/hectare


In [None]:
# In the below run of the above functiion, feature 'rainfall_mm' has been provided an incorrect value. So the 
# function treats it like a NaN and solves

In [18]:
# Example usage
predict_yield()

Enter Region (West/South/North/East):  West
Enter Soil Type (Sandy/Clay/Loam/etc.):  sandy
Enter Crop:  Wheat
Enter Rainfall (mm):  xyz
Enter Temperature (°C):  25
Fertilizer Used (True/False):  t
Irrigation Used (True/False):  t
Weather Condition (Sunny/Rainy/Cloudy):  sunny
Days to Harvest:  50


Invalid value for Rainfall_mm, using NaN

Predicted Yield: 5.96 tons/hectare


In [None]:
# In the below run of the above functiion, feature 'crop', 'temperature_celsius' have been provided an 
# no values,i.e., NULL values, and 'fertilizer_used' has an incorrect value. Despite such inconsistencies, our 
# model performs. 

In [23]:
# Example usage
predict_yield()

Enter Region (West/South/North/East):  NOrth
Enter Soil Type (Sandy/Clay/Loam/etc.):  Sandy
Enter Crop:  
Enter Rainfall (mm):  123
Enter Temperature (°C):  
Fertilizer Used (True/False):  x
Irrigation Used (True/False):  t
Weather Condition (Sunny/Rainy/Cloudy):  sunny
Days to Harvest:  56


Invalid boolean value for Fertilizer_Used, using False

Predicted Yield: 2.34 tons/hectare


In [None]:
# Safe to say that we have stress-tested the model.