In [None]:
# ---------------------------------------
# 1. Import Libraries
# ---------------------------------------
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Optional: If using XGBoost
# from xgboost import XGBRegressor

# ---------------------------------------
# 2. Load Dataset
# ---------------------------------------
# Point this at the real location (local path or URL) of the crop-yield CSV.
# Sample columns: ['Crop', 'Region', 'Year', 'Rainfall', 'Temperature',
#                  'Soil_Type', 'Fertilizer_Used', 'Yield']
df = pd.read_csv(filepath_or_buffer='crop_yield_data.csv')

# ---------------------------------------
# 3. Exploratory Data Analysis (EDA)
# ---------------------------------------
# Quick look at the data: first rows, column dtypes / non-null counts,
# and summary statistics.
print(df.head())
# DataFrame.info() writes its report directly and returns None, so wrapping
# it in print() would emit a stray "None" line after the report.
df.info()
print(df.describe())

# Visualize correlation
# Correlation is only defined for numeric data. On pandas >= 2.0, calling
# df.corr() on a frame that still holds string columns raises ValueError
# (presumably 'Crop', 'Region' and 'Soil_Type' are strings here, since they
# are one-hot encoded later), so restrict the frame to numeric/bool columns.
numeric_df = df.select_dtypes(include=["number", "bool"])
sns.heatmap(numeric_df.corr(), annot=True)
plt.title("Feature Correlation")
plt.show()

# ---------------------------------------
# 4. Preprocessing
# ---------------------------------------

# Discard incomplete rows; df itself is modified in place.
df.dropna(inplace=True)

# Expand the categorical columns into indicator (dummy) columns.
# drop_first=True leaves out the first level of each encoded column.
categorical_columns = ['Crop', 'Region', 'Soil_Type']
df_encoded = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

# The target is the 'Yield' column; every other column is a feature.
y = df_encoded['Yield']
X = df_encoded.drop(columns='Yield')

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# ---------------------------------------
# 5. Train Model
# ---------------------------------------

# Build a 100-tree random forest with a fixed seed and train it on the
# training split. fit() returns the estimator itself, so construction and
# training are chained into one expression.
rf_model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Generate predictions for the held-out rows.
y_pred = rf_model.predict(X_test)

# ---------------------------------------
# 6. Evaluate Model
# ---------------------------------------
# Error metrics on the held-out split; RMSE is the square root of the MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.2f}")

# Scatter of actual vs. predicted values, with a dashed red y = x reference
# line spanning the range of the full target column.
diag_min, diag_max = y.min(), y.max()
plt.scatter(y_test, y_pred, alpha=0.7)
plt.plot([diag_min, diag_max], [diag_min, diag_max], 'r--')
plt.title("Actual vs Predicted Crop Yields")
plt.xlabel("Actual Yield")
plt.ylabel("Predicted Yield")
plt.grid()
plt.show()

# ---------------------------------------
# 7. Feature Importance
# ---------------------------------------
# Pair each importance score from the fitted forest with its feature name,
# then order the scores from most to least important.
importance_by_feature = pd.Series(rf_model.feature_importances_, index=X.columns)
feature_importances = importance_by_feature.sort_values(ascending=False)

# Bar chart of the ranked importances.
feature_importances.plot.bar(figsize=(12, 6))
plt.ylabel("Importance Score")
plt.title("Feature Importance")
plt.show()


# Crop Yield Prediction with Machine Learning (SDG 2: Zero Hunger)

This project uses **machine learning** to predict crop yields based on environmental and agricultural inputs, directly supporting the **United Nations Sustainable Development Goal 2 – Zero Hunger**. By offering accurate predictions, the tool helps farmers, NGOs, and governments make better decisions about food production and distribution.

---

## Project Highlights

- **SDG Focus**: SDG 2 – Zero Hunger
- **ML Technique**: Supervised Learning (Random Forest Regressor)
- **Goal**: Predict crop yield (in tons per hectare) using data like rainfall, temperature, fertilizer use, crop type, soil type, and region
- **Interface**: Streamlit web app for real-time user interaction

---

## Quick Start

### 1. Clone the Repository

```bash
git clone https://github.com/karaboJJ/week-2-ai.git
cd week-2-ai
