**Import Libraries and Setup**

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import joblib


**Load Dataset and Handle Missing Values**

In [None]:
# Load the raw CSV once and keep it untouched, so the cleaning step below can be
# re-run without re-reading the file.
raw_dataset = pd.read_csv('Harvest_Prediction_Factors_Merged.csv')

# Handle missing values: drop every row that has at least one missing value.
# The cleaned frame is bound to `dataset` (the name the rest of the notebook reads),
# so the cleaning actually takes effect instead of living only in an unused copy.
dataset = raw_dataset.dropna()

# `df` is kept as an alias of the cleaned frame.
df = dataset



**Explore Dataset**

In [None]:
# Summarise column dtypes and non-null counts, then preview the first five rows.
dataset.info()
dataset.head(5)


**Rows and Columns**

In [None]:
# Report the dimensions of the dataset: number of rows and number of columns.
rows = dataset.shape[0]
columns = dataset.shape[1]
print(f"The dataset contains {rows} rows and {columns} columns.")


**Define Features and Target**

In [None]:
# Target column name exactly as it is used throughout this notebook
# (note that the name starts with a space).
target_column = ' Yield (kg)'

# y is the target; X is every other column.
y = dataset[target_column]
X = dataset.drop(columns=[target_column])


**Split Dataset into Training and Testing Sets**

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the (rows, columns) shape of each feature partition.
print(f"Training data size: {X_train.shape}")
print(f"Testing data size: {X_test.shape}")



**Preprocessing and Feature Transformation**



In [None]:
# Columns that hold category labels; these are the only columns that get encoded.
categorical_features = [
    'Soil Type',
    'Irrigation Type',
    'Water Source',
    'Paddy Variety',
    'Pest Severity',
    'Season',
    'District',
]

# Start from every float64/int64 column of X ...
numerical_features = list(X.select_dtypes(include=['float64', 'int64']).columns)
print("numerical_features", numerical_features)

# ... and leave out any column already declared categorical above.
# This list is informational only: it is printed but not passed to the transformer.
numerical_features = [
    name for name in numerical_features if name not in categorical_features
]
print("numerical_features =", numerical_features)

# One-hot encode the categorical columns; categories not seen while fitting are
# ignored at prediction time instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

preprocessor = ColumnTransformer(
    transformers=[('cat', one_hot_encoder, categorical_features)],
    # Every column not listed above is passed through unchanged.
    remainder='passthrough',
    # NOTE(review): per the scikit-learn docs this controls how the remainder
    # columns are recorded in `transformers_` - confirm against the installed version.
    force_int_remainder_cols=False,
)

print("\nPreprocessor Details:")
print(preprocessor)


**Create Pipeline for Preprocessing and Model**


In [None]:
# Random forest regressor with explicit size/complexity limits.
regressor = RandomForestRegressor(
    n_estimators=150,       # number of trees in the forest
    max_depth=18,           # upper bound on the depth of each tree
    min_samples_split=10,   # a node needs at least 10 samples before it may be split
    min_samples_leaf=4,     # every leaf keeps at least 4 samples
    random_state=42,        # fixed seed so repeated fits give the same forest
)

# Chain the preprocessing step and the regressor so both are fitted and applied together.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', regressor),
])


**Train the Model**

In [None]:
# Fit the preprocessing step and the regressor on the training partition.
pipeline.fit(X=X_train, y=y_train)



**Test Performance**

In [None]:
# Score the fitted pipeline on the held-out test partition.
y_test_pred = pipeline.predict(X_test)

# Mean absolute error, mean squared error and R² of the test predictions.
test_mae, test_mse, test_r2 = (
    mean_absolute_error(y_test, y_test_pred),
    mean_squared_error(y_test, y_test_pred),
    r2_score(y_test, y_test_pred),
)

print(f"Test MAE: {test_mae}")
print(f"Test MSE: {test_mse}")
print(f"Test R²: {test_r2}")


**Accuracy of the model**

In [None]:
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Predict on the held-out features with the fitted pipeline.
y_pred = pipeline.predict(X_test)

# Standard regression metrics; RMSE is the square root of the MSE.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mse)

print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R-squared: {r2}")

# MAPE divides each error by the actual value, so a single actual yield of 0
# would turn the whole mean into inf/NaN. Score only rows with a non-zero actual.
actual_values = np.asarray(y_test, dtype=float)
predicted_values = np.asarray(y_pred, dtype=float)
has_nonzero_actual = actual_values != 0

if has_nonzero_actual.any():
    relative_errors = np.abs(
        (actual_values[has_nonzero_actual] - predicted_values[has_nonzero_actual])
        / actual_values[has_nonzero_actual]
    )
    mape = np.mean(relative_errors) * 100
else:
    # MAPE is undefined when every actual value is zero.
    mape = float("nan")

# "Regression accuracy" here is simply 100 - MAPE.
accuracy = 100 - mape
print(f"Mean Absolute Percentage Error (MAPE): {mape:.2f}%")
print(f"Regression Accuracy: {accuracy:.2f}%")

skipped_rows = int((~has_nonzero_actual).sum())
if skipped_rows:
    print(f"Note: {skipped_rows} row(s) with an actual yield of 0 were excluded from MAPE.")


**Save the test data**

In [None]:
from sklearn.model_selection import train_test_split
import pandas as pd

# Rebuild full records (features + target) from the split the model was actually
# trained and evaluated on, instead of re-reading and re-splitting the CSV.
# A second, independent split only reproduces the first one while both see
# exactly the same rows in the same order with the same seed.
# reindex() restores the column order of `dataset`.
train_data = pd.concat([X_train, y_train], axis=1).reindex(columns=dataset.columns)
test_data = pd.concat([X_test, y_test], axis=1).reindex(columns=dataset.columns)

# Save the held-out test records to a CSV file (the row index is not written).
test_file_path = 'Harvest_Prediction_Test_Data.csv'  # Update with desired save location
test_data.to_csv(test_file_path, index=False)

print(f"Test data saved to: {test_file_path}")


**Save the model**

In [None]:

# Persist the whole fitted pipeline (preprocessing + regressor) to disk with joblib.
model_path = "rice_yield_random_forest_model.pkl"
joblib.dump(value=pipeline, filename=model_path)
print(f"Model saved to {model_path}")


**Test data Performance**

In [None]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error

import joblib
import matplotlib.pyplot as plt

# Read the saved test records back from disk.
test_file_path = 'Harvest_Prediction_Test_Data.csv'  # Update with your test data path
test_data = pd.read_csv(test_file_path)

# Split the records into the target column and the remaining feature columns.
# Note: this rebinds the notebook-level X_test / y_test names.
y_test = test_data[' Yield (kg)']
X_test = test_data.drop(columns=[' Yield (kg)'])

# Load the persisted pipeline and predict with it.
model = joblib.load('rice_yield_random_forest_model.pkl')  # Update with your model path
y_pred = model.predict(X_test)

# Error metrics of the reloaded model on the reloaded test records.
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("Mean Squared Error (MSE):", mse)
print("Mean Absolute Error (MAE):", mae)

# Scatter of actual against predicted yield.
plt.scatter(y_test, y_pred)
plt.title("Actual vs Predicted Yield")
plt.xlabel("Actual Yield (kg)")
plt.ylabel("Predicted Yield (kg)")
plt.show()


**Comparison of Predicted and Test data**

In [None]:
import pandas as pd

# Feature columns handed to the model, in the order the prediction frame is built.
_PREDICTION_FEATURES = [
    'Rainfall (mm)',
    'Temperature (°C)',
    'Relative Humidity (%)',
    'Sunshine Hours (hrs)',
    'Wind Speed (km/h)',
    'Soil Type',
    'Irrigation Type',
    'Water Source',
    'Paddy Variety',
    'Fertilizer Usage (kg)',
    'Area (hectare)',
    'Soil Nitrogen (mg/kg)',
    'Soil Phosphorus (mg/kg)',
    'Soil Potassium (mg/kg)',
    'Pest Severity',
    'Season',
    'District',
]


def predict_test_data(model, test_data):
    """Predict the yield for every row of ``test_data`` and pair it with the actual yield.

    All rows are sent to the model in a single ``predict`` call rather than one
    call per row, which yields the same rows in the same order at a fraction of
    the cost.

    Args:
        model: Fitted estimator exposing ``predict``; it receives a DataFrame
            restricted to the feature columns listed in ``_PREDICTION_FEATURES``.
        test_data: DataFrame containing those feature columns and the
            ``' Yield (kg)'`` target column.

    Returns:
        DataFrame with the columns "Actual Yield" and "Predicted Yield", one row
        per row of ``test_data`` (in the same order, with a fresh 0..n-1 index).
        An empty ``test_data`` gives an empty frame with those two columns.

    Raises:
        KeyError: If a required column is missing from a non-empty ``test_data``.
    """
    result_columns = ["Actual Yield", "Predicted Yield"]

    # Nothing to predict: return an empty, correctly-labelled frame instead of
    # asking the model to predict on zero samples.
    if len(test_data) == 0:
        return pd.DataFrame(columns=result_columns)

    features = test_data[_PREDICTION_FEATURES]
    predictions = model.predict(features)

    # Plain arrays/lists are used so the result does not inherit test_data's index.
    return pd.DataFrame({
        "Actual Yield": test_data[' Yield (kg)'].to_numpy(),
        "Predicted Yield": list(predictions),
    })

import joblib

# Load the persisted pipeline (this rebinds the notebook-level `pipeline` name).
pipeline = joblib.load('rice_yield_random_forest_model.pkl')  # Update with your model path

# Read the saved test records.
test_file_path = 'Harvest_Prediction_Test_Data.csv'  # Update with your file path
test_data = pd.read_csv(test_file_path)

# Actual vs predicted yield for every test record.
results_df = predict_test_data(pipeline, test_data)

# Write the comparison table to disk for later evaluation (row index not written).
results_df.to_csv('Prediction_Results.csv', index=False)

print("Prediction results saved to 'Prediction_Results.csv'")


**Predict the harvest from farmer-entered data and land size**


In [None]:
import pandas as pd

# Interactive harvest prediction with generic advice based on the previous yield.
def predict_total_harvest():
    """Prompt for farm data, predict the harvest and print yield-based recommendations.

    Every value is read from the console with ``input()``. The answers are placed
    in a one-row DataFrame and passed to the notebook-level fitted ``pipeline``;
    its prediction is treated as the total yield in kg for the entered area, and
    the per-hectare yield is that total divided by the area.

    Returns:
        tuple: ``(total predicted yield in kg, predicted yield per hectare in kg)``.

    Raises:
        ValueError: If a numeric answer cannot be parsed as a number, or if the
            area is not greater than zero (the per-hectare yield would be undefined).
    """
    # Step 1: collect the farm description from the console.
    print("=== Enter Farm Data ===")
    area = float(input("Enter the total area of land in hectares: "))
    if not area > 0:
        # Guards the per-hectare division below; `not area > 0` also rejects NaN.
        raise ValueError("The total area of land must be greater than 0 hectares.")
    rainfall = float(input("Enter the total rainfall in mm: "))
    temperature = float(input("Enter the temperature in °C: "))
    humidity = float(input("Enter the relative humidity percentage: "))
    sunshine_hours = float(input("Enter the total sunshine hours per day: "))
    wind_speed = float(input("Enter the average wind speed in km/h: "))
    soil_type = input("Enter the soil type (e.g., Loam, Sandy, Clay): ")
    irrigation_type = input("Enter the irrigation type (e.g., Rainfed, Canal, Tube Well): ")
    water_source = input("Enter the water source (e.g., Well, River, Rainwater): ")
    paddy_variety = input("Enter the paddy variety (e.g., BG 450, BG 250 , BG 350 , BG 360 ): ")
    fertilizer_usage = float(input("Enter the fertilizer usage in kg: "))
    soil_nitrogen = float(input("Enter the soil nitrogen level in mg/kg: "))
    soil_phosphorus = float(input("Enter the soil phosphorus level in mg/kg: "))
    soil_potassium = float(input("Enter the soil potassium level in mg/kg: "))
    pest_severity = input("Enter the pest severity (e.g., Low, Medium, High): ")
    season = input("Enter the season (e.g., Maha, Yala): ")
    district = input("Enter the district: ")

    # Step 2: previous yield per hectare, used only for the comparison in step 6.
    previous_yield_per_hectare = float(input("Enter the previous yield per hectare in kg: "))

    # Step 3: one-row frame keyed by the feature column names.
    new_data = pd.DataFrame([{
        'Rainfall (mm)': rainfall,
        'Temperature (°C)': temperature,
        'Relative Humidity (%)': humidity,
        'Sunshine Hours (hrs)': sunshine_hours,
        'Wind Speed (km/h)': wind_speed,
        'Soil Type': soil_type,
        'Irrigation Type': irrigation_type,
        'Water Source': water_source,
        'Paddy Variety': paddy_variety,
        'Fertilizer Usage (kg)': fertilizer_usage,
        'Area (hectare)': area,
        'Soil Nitrogen (mg/kg)': soil_nitrogen,
        'Soil Phosphorus (mg/kg)': soil_phosphorus,
        'Soil Potassium (mg/kg)': soil_potassium,
        'Pest Severity': pest_severity,
        'Season': season,
        'District': district
    }])

    # Step 4: predict the total yield and derive the per-hectare figure.
    total_predicted_yield = pipeline.predict(new_data)[0]
    predicted_yield_per_hectare = total_predicted_yield / area

    # Step 5: display the predicted results.
    print(f"\n=== Predicted Results ===")
    print(f"Total Harvest for {area} hectare: {total_predicted_yield:.2f} kg")
    print(f"Predicted Yield per hectare: {predicted_yield_per_hectare:.2f} kg")
    print("=========================")

    # Step 6: compare previous and predicted per-hectare yield and print advice.
    print("\n=== Recommendations Based on Yield Comparison ===")

    if previous_yield_per_hectare > predicted_yield_per_hectare:
        advice = (
            "Condition: Previous Yield > Predicted Yield",
            "\nPredicted Yield is LOWER than the Previous Yield.",
            "Suggested Actions to Improve Yield:",
            "- Optimize fertilizer usage based on soil tests.",
            "- Improve irrigation practices to ensure consistent water supply.",
            "- Consider pest control measures to minimize yield loss.",
            "- Use high-yield paddy varieties suitable for the district.",
            "- Apply balanced soil nutrients and organic matter.",
        )
    elif predicted_yield_per_hectare > previous_yield_per_hectare:
        advice = (
            "Condition: Predicted Yield > Previous Yield",
            "\nPredicted Yield is HIGHER than the Previous Yield.",
            "Recommendations for Maintaining or Further Improving Yield:",
            "- Continue current agricultural practices.",
            "- Monitor soil health regularly and apply nutrients accordingly.",
            "- Use precision farming tools to track crop progress.",
            "- Plan for seasonal crop rotation to maintain soil fertility.",
            "- Use modern harvesting techniques to minimize post-harvest losses.",
        )
    else:
        advice = (
            "Condition: Previous Yield == Predicted Yield",
            "\nPredicted Yield matches the Previous Yield. Keep up the current practices!",
        )
    for line in advice:
        print(line)

    print("===========================")
    return total_predicted_yield, predicted_yield_per_hectare

# Run the interactive prediction; the call waits for console input at each prompt.
predict_total_harvest()


**Predict the harvest from farmer-entered data and land size**




In [None]:
import pandas as pd

# Interactive harvest prediction with generic advice based on the previous yield.
def predict_total_harvest():
    """Prompt for farm data, predict the harvest and print yield-based recommendations.

    Every value is read from the console with ``input()``. The answers are placed
    in a one-row DataFrame and passed to the notebook-level fitted ``pipeline``;
    its prediction is treated as the total yield in kg for the entered area, and
    the per-hectare yield is that total divided by the area.

    Returns:
        tuple: ``(total predicted yield in kg, predicted yield per hectare in kg)``.

    Raises:
        ValueError: If a numeric answer cannot be parsed as a number, or if the
            area is not greater than zero (the per-hectare yield would be undefined).
    """
    # Step 1: collect the farm description from the console.
    print("=== Enter Farm Data ===")
    area = float(input("Enter the total area of land in hectares: "))
    if not area > 0:
        # Guards the per-hectare division below; `not area > 0` also rejects NaN.
        raise ValueError("The total area of land must be greater than 0 hectares.")
    rainfall = float(input("Enter the total rainfall in mm: "))
    temperature = float(input("Enter the temperature in °C: "))
    humidity = float(input("Enter the relative humidity percentage: "))
    sunshine_hours = float(input("Enter the total sunshine hours per day: "))
    wind_speed = float(input("Enter the average wind speed in km/h: "))
    soil_type = input("Enter the soil type (e.g., Loam, Sandy, Clay): ")
    irrigation_type = input("Enter the irrigation type (e.g., Rainfed, Canal, Tube Well): ")
    water_source = input("Enter the water source (e.g., Well, River, Rainwater): ")
    paddy_variety = input("Enter the paddy variety (e.g., BG 450, BG 250 , BG 350 , BG 360 ): ")
    fertilizer_usage = float(input("Enter the fertilizer usage in kg: "))
    soil_nitrogen = float(input("Enter the soil nitrogen level in mg/kg: "))
    soil_phosphorus = float(input("Enter the soil phosphorus level in mg/kg: "))
    soil_potassium = float(input("Enter the soil potassium level in mg/kg: "))
    pest_severity = input("Enter the pest severity (e.g., Low, Medium, High): ")
    season = input("Enter the season (e.g., Maha, Yala): ")
    district = input("Enter the district: ")

    # Step 2: previous yield per hectare, used only for the comparison in step 6.
    previous_yield_per_hectare = float(input("Enter the previous yield per hectare in kg: "))

    # Step 3: one-row frame keyed by the feature column names.
    new_data = pd.DataFrame([{
        'Rainfall (mm)': rainfall,
        'Temperature (°C)': temperature,
        'Relative Humidity (%)': humidity,
        'Sunshine Hours (hrs)': sunshine_hours,
        'Wind Speed (km/h)': wind_speed,
        'Soil Type': soil_type,
        'Irrigation Type': irrigation_type,
        'Water Source': water_source,
        'Paddy Variety': paddy_variety,
        'Fertilizer Usage (kg)': fertilizer_usage,
        'Area (hectare)': area,
        'Soil Nitrogen (mg/kg)': soil_nitrogen,
        'Soil Phosphorus (mg/kg)': soil_phosphorus,
        'Soil Potassium (mg/kg)': soil_potassium,
        'Pest Severity': pest_severity,
        'Season': season,
        'District': district
    }])

    # Step 4: predict the total yield and derive the per-hectare figure.
    total_predicted_yield = pipeline.predict(new_data)[0]
    predicted_yield_per_hectare = total_predicted_yield / area

    # Step 5: display the predicted results.
    print(f"\n=== Predicted Results ===")
    print(f"Total Harvest for {area} hectare: {total_predicted_yield:.2f} kg")
    print(f"Predicted Yield per hectare: {predicted_yield_per_hectare:.2f} kg")
    print("=========================")

    # Step 6: compare previous and predicted per-hectare yield and print advice.
    print("\n=== Recommendations Based on Yield Comparison ===")

    if previous_yield_per_hectare > predicted_yield_per_hectare:
        advice = (
            "Condition: Previous Yield > Predicted Yield",
            "\nPredicted Yield is LOWER than the Previous Yield.",
            "Suggested Actions to Improve Yield:",
            "- Optimize fertilizer usage based on soil tests.",
            "- Improve irrigation practices to ensure consistent water supply.",
            "- Consider pest control measures to minimize yield loss.",
            "- Use high-yield paddy varieties suitable for the district.",
            "- Apply balanced soil nutrients and organic matter.",
        )
    elif predicted_yield_per_hectare > previous_yield_per_hectare:
        advice = (
            "Condition: Predicted Yield > Previous Yield",
            "\nPredicted Yield is HIGHER than the Previous Yield.",
            "Recommendations for Maintaining or Further Improving Yield:",
            "- Continue current agricultural practices.",
            "- Monitor soil health regularly and apply nutrients accordingly.",
            "- Use precision farming tools to track crop progress.",
            "- Plan for seasonal crop rotation to maintain soil fertility.",
            "- Use modern harvesting techniques to minimize post-harvest losses.",
        )
    else:
        advice = (
            "Condition: Previous Yield == Predicted Yield",
            "\nPredicted Yield matches the Previous Yield. Keep up the current practices!",
        )
    for line in advice:
        print(line)

    print("===========================")
    return total_predicted_yield, predicted_yield_per_hectare

# Run the interactive prediction; the call waits for console input at each prompt.
predict_total_harvest()


**Predict and Compare Yields with Suitable Recommendations for Each District**

In [None]:
import pandas as pd

# District-specific data for selected Sri Lankan districts.
# Each entry maps a district name to short notes on soil, climate, irrigation and pests.
district_recommendations = {
    'Polonnaruwa': {
        'soil': 'Clay soil, retains moisture well.',
        'climate': 'Hot, dry climate, with irrigation systems in place.',
        'irrigation': 'Extensive canal irrigation, manage water usage.',
        'pests': 'Common pests: termites and rice bugs, use organic pesticides.'
    },
    'Jaffna': {
        'soil': 'Sandy soil with lower fertility.',
        'climate': 'Low rainfall, hot temperatures, and dry conditions.',
        'irrigation': 'Rainfed and limited irrigation, optimize water usage.',
        'pests': 'Low pest threat, but watch out for soil-borne diseases.'
    },
    'Hambantota': {
        'soil': 'Sandy soil with moderate fertility.',
        'climate': 'Very hot, low rainfall, high evaporation rates.',
        'irrigation': 'Rainfed and some canal irrigation systems.',
        'pests': 'Pest threat is moderate, particularly grasshoppers and locusts.'
    },
    'Mannar': {
        'soil': 'Sandy soil, needs fertilization.',
        'climate': 'Hot and dry, with a semi-arid environment.',
        'irrigation': 'Irrigation through wells and canals, optimize water usage.',
        'pests': 'Low pest risk, but be mindful of root rot and fungal diseases.'
    },
    'Trincomalee': {
        'soil': 'Sandy soil with low water retention.',
        'climate': 'Hot and dry, seasonal rainfall patterns.',
        'irrigation': 'Limited irrigation, dependence on seasonal rainfall.',
        'pests': 'Low pest risk, monitor for root rot diseases.'
    },
    'Batticaloa': {
        'soil': 'Loamy soil, suitable for paddy cultivation.',
        'climate': 'Coastal climate with moderate rainfall.',
        'irrigation': 'Rainfed with occasional canal irrigation.',
        'pests': 'Moderate pest risk, particularly rice weevils and aphids.'
    },
    'Vavuniya': {
        'soil': 'Clayey soil with good fertility.',
        'climate': 'Hot and dry, moderate rainfall.',
        'irrigation': 'Tank irrigation and some canal irrigation.',
        'pests': 'Regular monitoring for pests like leafhoppers and caterpillars.'
    },
    'Kurunegala': {
        'soil': 'Clay and loamy soils.',
        'climate': 'Moderate rainfall, cool temperature, ideal for a variety of crops.',
        'irrigation': 'Rainfed with canal systems.',
        'pests': 'Regular pest management needed for caterpillars and leafhoppers.'
    },
    'Ampara': {
        'soil': 'Sandy and clay soils.',
        'climate': 'Low rainfall, high temperatures, and drought risk.',
        'irrigation': 'Rainfed, supplemented with well water during dry periods.',
        'pests': 'Moderate pest risk, particularly stem borers and grasshoppers.'
    }
}


# Interactive harvest prediction with yield-based and district-specific advice.
def predict_total_harvest():
    """Prompt for farm data, predict the harvest and print recommendations.

    Every value is read from the console with ``input()``. The answers are placed
    in a one-row DataFrame and passed to the notebook-level fitted ``pipeline``;
    its prediction is treated as the total yield in kg for the entered area, and
    the per-hectare yield is that total divided by the area. After the
    yield-comparison advice, the notes stored in ``district_recommendations``
    for the entered district are printed; the district is matched ignoring
    letter case and surrounding whitespace.

    Returns:
        tuple: ``(total predicted yield in kg, predicted yield per hectare in kg)``.

    Raises:
        ValueError: If a numeric answer cannot be parsed as a number, or if the
            area is not greater than zero (the per-hectare yield would be undefined).
    """
    # Step 1: collect the farm description from the console.
    print("=== Enter Farm Data ===")
    area = float(input("Enter the total area of land in hectares: "))
    if not area > 0:
        # Guards the per-hectare division below; `not area > 0` also rejects NaN.
        raise ValueError("The total area of land must be greater than 0 hectares.")
    rainfall = float(input("Enter the total rainfall in mm: "))
    temperature = float(input("Enter the temperature in °C: "))
    humidity = float(input("Enter the relative humidity percentage: "))
    sunshine_hours = float(input("Enter the total sunshine hours per day: "))
    wind_speed = float(input("Enter the average wind speed in km/h: "))
    soil_type = input("Enter the soil type (e.g., Loam, Sandy, Clay): ")
    irrigation_type = input("Enter the irrigation type (e.g., Rainfed, Canal, Tube Well): ")
    water_source = input("Enter the water source (e.g., Well, River, Rainwater): ")
    paddy_variety = input("Enter the paddy variety (e.g., BG 450, BG 250 , BG 350 , BG 360 ): ")
    fertilizer_usage = float(input("Enter the fertilizer usage in kg: "))
    soil_nitrogen = float(input("Enter the soil nitrogen level in mg/kg: "))
    soil_phosphorus = float(input("Enter the soil phosphorus level in mg/kg: "))
    soil_potassium = float(input("Enter the soil potassium level in mg/kg: "))
    pest_severity = input("Enter the pest severity (e.g., Low, Medium, High): ")
    season = input("Enter the season (e.g., Maha, Yala): ")
    district = input("Enter the district: ")

    # Step 2: previous yield per hectare, used only for the comparison in step 6.
    previous_yield_per_hectare = float(input("Enter the previous yield per hectare in kg: "))

    # Step 3: one-row frame keyed by the feature column names.
    # The district is passed to the model exactly as typed.
    new_data = pd.DataFrame([{
        'Rainfall (mm)': rainfall,
        'Temperature (°C)': temperature,
        'Relative Humidity (%)': humidity,
        'Sunshine Hours (hrs)': sunshine_hours,
        'Wind Speed (km/h)': wind_speed,
        'Soil Type': soil_type,
        'Irrigation Type': irrigation_type,
        'Water Source': water_source,
        'Paddy Variety': paddy_variety,
        'Fertilizer Usage (kg)': fertilizer_usage,
        'Area (hectare)': area,
        'Soil Nitrogen (mg/kg)': soil_nitrogen,
        'Soil Phosphorus (mg/kg)': soil_phosphorus,
        'Soil Potassium (mg/kg)': soil_potassium,
        'Pest Severity': pest_severity,
        'Season': season,
        'District': district
    }])

    # Step 4: predict the total yield and derive the per-hectare figure.
    total_predicted_yield = pipeline.predict(new_data)[0]
    predicted_yield_per_hectare = total_predicted_yield / area

    # Step 5: display the predicted results.
    print(f"\n=== Predicted Results ===")
    print(f"Total Harvest for {area} hectare: {total_predicted_yield:.2f} kg")
    print(f"Predicted Yield per hectare: {predicted_yield_per_hectare:.2f} kg")
    print("=========================")

    # Step 6: compare previous and predicted per-hectare yield and print advice.
    print("\n=== Recommendations Based on Yield Comparison ===")

    if previous_yield_per_hectare > predicted_yield_per_hectare:
        advice = (
            "Condition: Previous Yield > Predicted Yield",
            "\nPredicted Yield is LOWER than the Previous Yield.",
            "Suggested Actions to Improve Yield:",
            "- Optimize fertilizer usage based on soil tests.",
            "- Improve irrigation practices to ensure consistent water supply.",
            "- Consider pest control measures to minimize yield loss.",
            "- Use high-yield paddy varieties suitable for the district.",
            "- Apply balanced soil nutrients and organic matter.",
        )
    elif predicted_yield_per_hectare > previous_yield_per_hectare:
        advice = (
            "Condition: Predicted Yield > Previous Yield",
            "\nPredicted Yield is HIGHER than the Previous Yield.",
            "Recommendations for Maintaining or Further Improving Yield:",
            "- Continue current agricultural practices.",
            "- Monitor soil health regularly and apply nutrients accordingly.",
            "- Use precision farming tools to track crop progress.",
            "- Plan for seasonal crop rotation to maintain soil fertility.",
            "- Use modern harvesting techniques to minimize post-harvest losses.",
        )
    else:
        advice = (
            "Condition: Previous Yield == Predicted Yield",
            "\nPredicted Yield matches the Previous Yield. Keep up the current practices!",
        )
    for line in advice:
        print(line)

    # Step 7: district-specific notes. Match ignoring case and surrounding
    # whitespace so that e.g. " jaffna " still finds the 'Jaffna' entry.
    print("\n=== District-Specific Recommendations ===")
    wanted = district.strip().casefold()
    matched_district = next(
        (name for name in district_recommendations if name.casefold() == wanted),
        None,
    )
    if matched_district is not None:
        notes = district_recommendations[matched_district]
        print(f"District: {matched_district}")
        print(f"Soil: {notes['soil']}")
        print(f"Climate: {notes['climate']}")
        print(f"Irrigation: {notes['irrigation']}")
        print(f"Pests: {notes['pests']}")
    else:
        print(f"No specific recommendations available for district: {district}. Please consult local agriculture experts.")
    print("===========================")

    # Return the figures, as the earlier definitions of this function do.
    return total_predicted_yield, predicted_yield_per_hectare

# Run the interactive prediction; the call waits for console input at each prompt.
predict_total_harvest()


**Visualize Model Performance**

In [None]:
import matplotlib.pyplot as plt

# Residual = actual - predicted, one value per test record.
residuals = y_test - y_pred

# Range of the actual values, used to draw the ideal y = x reference line.
lowest_actual, highest_actual = min(y_test), max(y_test)

# Scatter plot of actual vs predicted values.
# The plotted target is the total yield in kg, so the axes are labelled in kg.
plt.figure(figsize=(10, 6))
plt.scatter(y_test, y_pred, alpha=0.7, edgecolor='k')
plt.plot(
    [lowest_actual, highest_actual],
    [lowest_actual, highest_actual],
    color='red',
    linestyle='--',
    label="Ideal Fit",
)
plt.title("Actual vs Predicted Yield")
plt.xlabel("Actual Yield (kg)")
plt.ylabel("Predicted Yield (kg)")
plt.legend()
plt.grid(True)
plt.show()

# Histogram of residuals.
plt.figure(figsize=(10, 6))
plt.hist(residuals, bins=30, color='blue', alpha=0.7, edgecolor='k')
plt.title("Distribution of Residuals")
plt.xlabel("Residuals (Actual - Predicted)")
plt.ylabel("Frequency")
plt.grid(True)
plt.show()

# Residuals vs predicted values, with a reference line at zero residual.
plt.figure(figsize=(10, 6))
plt.scatter(y_pred, residuals, alpha=0.7, edgecolor='k')
plt.axhline(0, color='red', linestyle='--', label="Ideal Residual Line")
plt.title("Residuals vs Predicted Values")
plt.xlabel("Predicted Yield (kg)")
plt.ylabel("Residuals (Actual - Predicted)")
plt.legend()
plt.grid(True)
plt.show()


In [None]:
import pickle


# This file is written with joblib.dump earlier in the notebook, so read it back
# with joblib.load: the plain pickle module is not guaranteed to understand
# joblib's on-disk format for objects that contain NumPy arrays.
# NOTE: like pickle, joblib.load can execute arbitrary code - only load trusted files.
data = joblib.load('rice_yield_random_forest_model.pkl')