In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_squared_error

# Load dataset
# The CSV location can be overridden through the CROP_DATA_PATH environment
# variable so the notebook also runs on machines other than the author's;
# when the variable is unset, the original absolute path is used unchanged.
file_path = os.environ.get(
    'CROP_DATA_PATH',
    '/Users/mthabisimunyariri/Desktop/DS-250/Machine_learning_project/crop_data.csv',
)

# Fail early with an actionable message that names the override variable.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f'Dataset not found at {file_path!r}; '
        'set the CROP_DATA_PATH environment variable to the location of crop_data.csv'
    )

data = pd.read_csv(file_path)

# Display the first few rows
print(data.head())


   Year Country         Region  Crop_Type  Average_Temperature_C  \
0  2001   India    West Bengal       Corn                   1.55   
1  2024   China          North       Corn                   3.23   
2  2001  France  Ile-de-France      Wheat                  21.11   
3  2001  Canada       Prairies     Coffee                  27.85   
4  1998   India     Tamil Nadu  Sugarcane                   2.19   

   Total_Precipitation_mm  CO2_Emissions_MT  Crop_Yield_MT_per_HA  \
0                  447.06             15.22                 1.737   
1                 2913.57             29.82                 1.737   
2                 1301.74             25.75                 1.719   
3                 1154.36             13.91                 3.890   
4                 1627.48             11.81                 1.080   

   Extreme_Weather_Events  Irrigation_Access_%  Pesticide_Use_KG_per_HA  \
0                       8                14.54                    10.08   
1                       8 

In [3]:
# Target: the crop-yield column. Features: every other column of `data`.
y = data['Crop_Yield_MT_per_HA']
X = data.drop(columns=['Crop_Yield_MT_per_HA'])

In [4]:
# Columns treated as categorical are listed by hand; every other feature
# column in X is treated as numerical.
# NOTE(review): 'Year' and 'Extreme_Weather_Events' hold numeric values in the
# preview above but are listed as categorical here -- confirm this is intended.
categorical_cols = [
    'Year',
    'Country',
    'Region',
    'Crop_Type',
    'Extreme_Weather_Events',
    'Adaptation_Strategies',
]
numerical_cols = list(filter(lambda name: name not in categorical_cols, X.columns))

In [5]:
# Categorical features: one-hot encode; handle_unknown='ignore' keeps
# categories that were not seen during fit from raising at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')

# Numerical features: standardize with StandardScaler's default settings.
numerical_transformer = StandardScaler()

In [6]:
# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# First-level learners whose predictions are fed to the final estimator.
base_models = [
    ('lr', LinearRegression()),
    ('dt', DecisionTreeRegressor(random_state=42)),
    ('rf', RandomForestRegressor(n_estimators=100, random_state=42)),
]

# Stacked ensemble: a linear regression combines the base-model predictions.
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression(),
)

# End-to-end pipeline: preprocessing first, then the stacked ensemble.
stacking_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('stacking', stacking_regressor),
])

In [7]:

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the preprocessing steps and the stacked ensemble on the training portion.
stacking_pipeline.fit(X_train, y_train)


In [8]:
# Predict the target for the held-out rows.
y_pred = stacking_pipeline.predict(X_test)

# Score the predictions against the true values with mean squared error.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
print(f'Mean Squared Error: {mse}')


Mean Squared Error: 0.3954810451302688
