In [137]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
import joblib
import json

In [138]:
# crop_data = pd.read_excel('/content/drive/MyDrive/model dvpt data/crop_recommendation.xlsx', sheet_name=0)

In [139]:
# Crop dataset used below for model training (feature columns plus the 'Crop' label).
# The path is relative to the notebook's working directory.
crop_data = pd.read_csv('dataset/filtered_crop_suitability_dataset.csv')
# Region/sector dataset used below for prediction.
# sheet_name=2 is a zero-indexed sheet position, i.e. the third worksheet of the workbook.
region_data = pd.read_excel('dataset/crop_recommendations.xlsx', sheet_name=2)

# crop_data = pd.read_excel('crop_recommendations.xlsx', sheet_name=0)
# crop_data = pd.read_excel('/content/drive/MyDrive/model dvpt data/crop_recommendation2.xlsx', sheet_name=0)

In [140]:
# # Function to sample a value from a range or threshold
# def sample_from_range(value):
#     # Remove commas from numbers
#     if isinstance(value, str):
#         value = value.replace(',', '')

#     # Check if the value is a range (e.g., '45-60')
#     if isinstance(value, str) and '-' in value:
#         parts = value.split('-')
#         if len(parts) == 2 and all(part.strip().replace('.', '', 1).isdigit() for part in parts):
#             low, high = map(float, parts)
#             return np.random.uniform(low, high)

#     # Check for '>' or '<' in the value, e.g., '>60' or '<45'
#     if isinstance(value, str) and value:
#         if value[0] == '>':
#             threshold = float(value[1:].strip())
#             return np.random.uniform(threshold, threshold + 20)  # Sample within a range above the threshold
#         elif value[0] == '<':
#             threshold = float(value[1:].strip())
#             return np.random.uniform(threshold - 20, threshold)  # Sample within a range below the threshold

#     # Check for symbol at the end (e.g., '50<')
#     if isinstance(value, str) and value[-1] == '<':
#         threshold = float(value[:-1].strip())  # Remove the '<' symbol and convert to float
#         return np.random.uniform(threshold - 20, threshold)  # Sample within a range below the threshold

#     # Check if the value is a valid number after removing commas
#     if isinstance(value, str) and value.replace('.', '', 1).isdigit():
#         return float(value)  # Return as float after removing commas

#     return value

# # Function to generate samples for each crop
# def generate_samples(data, num_samples=100):
#     sampled_data = []
#     for _, row in data.iterrows():
#         for _ in range(num_samples):
#             sampled_row = row.copy()
#             sampled_row['Altitude (masl)'] = sample_from_range(row['Altitude (masl)'])
#             sampled_row['temperature (C) '] = sample_from_range(row['temperature (C) '])
#             sampled_row['pH'] = sample_from_range(row['pH'])
#             sampled_row['Crop water need (mm/total growing period)'] = sample_from_range(row['Crop water need (mm/total growing period)'])
#             sampled_row['Humidity(%)'] = sample_from_range(row['Humidity(%)'])
#             sampled_row['N'] = sample_from_range(row['N'])
#             sampled_row['P'] = sample_from_range(row['P'])
#             sampled_row['K'] = sample_from_range(row['K'])
#             sampled_data.append(sampled_row)
#     return pd.DataFrame(sampled_data)

# # Generate the new sampled dataset
# sampled_dataset = generate_samples(crop_data, num_samples=100)
# The range-sampling step above is commented out, so the working dataset is crop_data as loaded.
# NOTE: this binds a second name to the same DataFrame object; it does not create a copy.
sampled_dataset = crop_data

In [141]:
# Columns excluded from the working dataset before training.
crop_columns_to_remove = [
    'Id',
    'Crop suitability',
    'Growing period (days)',
    'Irrigation required(%)',
]

# errors='ignore' skips any listed column that is absent instead of raising,
# which also means a misspelled column name goes unnoticed.
sampled_dataset = sampled_dataset.drop(crop_columns_to_remove, axis=1, errors='ignore')

In [142]:
# Candidate classifiers, keyed by the display name used in logs and result tables.
models = dict(
    XGBoost=XGBClassifier(
        # Number of boosting rounds and the step size applied to each round
        n_estimators=200,
        learning_rate=0.05,
        # Tree complexity limits: depth, minimum child weight, minimum split gain
        max_depth=4,
        min_child_weight=3,
        gamma=0.1,
        # Fraction of rows / columns sampled for each tree
        subsample=0.8,
        colsample_bytree=0.8,
        # Weight regularisation: L2 (lambda) and L1 (alpha)
        reg_lambda=1.5,
        reg_alpha=0.5,
        # Multiclass log loss as evaluation metric; fixed seed for repeatable runs
        eval_metric='mlogloss',
        random_state=42,
    ),
)

In [143]:
# Model inputs: the eight feature columns, in the order the model is trained on.
# Note that 'temperature (C) ' carries a trailing space in the dataset header.
train_features = sampled_dataset.loc[:, [
    'Altitude (masl)',
    'temperature (C) ',
    'pH',
    'N',
    'P',
    'K',
    'Crop water need (mm/total growing period)',
    'Humidity(%)',
]]

# Label to predict: the crop name.
target = sampled_dataset.loc[:, 'Crop']

In [144]:
# Turn crop names into integer class ids; the fitted encoder is kept so that
# predictions can be mapped back to names later on.
label_encoder = LabelEncoder()
target_encoded = label_encoder.fit_transform(target)

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    target_encoded,
    test_size=0.2,
    random_state=42,
)

In [145]:
# Accuracy history per model name; every model starts with its own empty list.
model_accuracies = dict((name, []) for name in models)

In [146]:
def train_and_evaluate(models, X_train, X_test, y_train, y_test, accuracy_log=None):
    """Fit every model, score it on the test split and collect the results.

    Args:
        models: Mapping of display name -> unfitted estimator exposing
            ``fit`` and ``predict``.
        X_train: Training features.
        X_test: Held-out features.
        y_train: Training labels.
        y_test: Held-out labels.
        accuracy_log: Optional mapping of model name -> list of accuracies.
            Each successful run appends its accuracy here. When omitted, the
            notebook-level ``model_accuracies`` dict is used, which is what
            the function always did before this parameter existed.

    Returns:
        dict: name -> {'model', 'accuracy', 'classification_report'}. When
        fitting or scoring raises, those three values are ``None`` and an
        additional 'error' key holds the exception message.
    """
    if accuracy_log is None:
        # Backward-compatible default: record into the notebook-level tracker.
        accuracy_log = model_accuracies

    results = {}

    for name, model in models.items():
        print(f"Training {name}...")
        try:
            # Train the model and predict on the held-out split
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)

            # Calculate accuracy
            accuracy = accuracy_score(y_test, y_pred)
            print(f"\n{name} Accuracy: {accuracy:.4f}")

            # Per-class precision/recall/F1 as a nested dict
            report = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
        except Exception as e:
            # Deliberate best-effort: one failing model must not stop the others.
            print(f"Error training {name}: {str(e)}")
            results[name] = {
                'model': None,
                'accuracy': None,
                'classification_report': None,
                'error': str(e)
            }
        else:
            results[name] = {
                'model': model,
                'accuracy': accuracy,
                'classification_report': report
            }
            # Bookkeeping lives outside the try block and tolerates unseen names,
            # so a missing key can no longer turn a successful run into an
            # "error" entry.
            accuracy_log.setdefault(name, []).append(accuracy)

    return results

# Fit and score every configured model
results = train_and_evaluate(models, X_train, X_test, y_train, y_test)

# Average the recorded accuracies per model; a model without any recorded run scores 0
mean_accuracies = {
    model_name: (sum(scores) / len(scores) if scores else 0)
    for model_name, scores in model_accuracies.items()
}

# The model with the highest mean accuracy is used for all later predictions
best_model_name = max(mean_accuracies, key=lambda candidate: mean_accuracies[candidate])
best_model = models[best_model_name]

print(f"\nBest Model: {best_model_name} with mean accuracy: {mean_accuracies[best_model_name]:.4f}")

Training XGBoost...

XGBoost Accuracy: 0.9895

Best Model: XGBoost with mean accuracy: 0.9895


In [147]:
# sampled_dataset.to_csv('sampled_dataset.csv', index=False)


In [148]:
# Step 1: Drop rows whose 'pH' cell holds one of these text labels instead of a pH reading
unwanted_text = ['Protected Land', 'Water Body']
# .copy() makes the filtered frame independent of the original, so adding the
# 'pH_avg' column to it afterwards does not raise pandas' SettingWithCopyWarning.
region_data = region_data[~region_data['pH'].isin(unwanted_text)].copy()

# Step 2: Function to calculate average pH
def calculate_average_ph(ph_value):
    """Convert one raw 'pH' cell into a single float.

    Accepted inputs:
      * numbers (int/float, including NaN) -> returned as ``float``
      * ``None``                           -> NaN
      * ``'<5.0'`` / ``'>8.5'``            -> the bound itself (5.0 / 8.5)
      * ``'6.0-7.0'``                      -> midpoint of the range (6.5);
        an en dash or em dash is accepted as the range separator too
      * ``'6.2'``                          -> 6.2

    Args:
        ph_value: Raw cell value, either text or already numeric.

    Returns:
        float: The parsed pH value.

    Raises:
        ValueError: If a text value matches none of the forms above.
    """
    # Spreadsheet cells holding a plain number arrive as int/float, not str;
    # substring checks such as `'<' in ph_value` would raise TypeError on them.
    if ph_value is None:
        return float('nan')
    if not isinstance(ph_value, str):
        return float(ph_value)

    # Normalise typographic dashes so ranges typed in a spreadsheet still split.
    text = ph_value.strip().replace('\u2013', '-').replace('\u2014', '-')

    # Open-ended bounds: '<5.0' is treated as 5.0 and '>8.5' as 8.5
    if '<' in text:
        return float(text.replace('<', ''))
    elif '>' in text:
        return float(text.replace('>', ''))

    # Range values like '6.0-7.0' collapse to their midpoint
    if '-' in text:
        lower, upper = map(float, text.split('-'))
        return (lower + upper) / 2

    # Single numeric value
    return float(text)

# Step 3: Derive the numeric 'pH_avg' column by parsing every raw 'pH' entry
region_data['pH_avg'] = region_data['pH'].map(calculate_average_ph)

In [149]:
def fetch_crop_data(crop_names, dataset):
    """Look up season and soil details for the requested crops.

    Args:
        crop_names: Collection of crop names to look up in ``dataset['Crop']``.
        dataset: DataFrame with a 'Crop' column plus the season and soil
            columns listed in the mapping below.

    Returns:
        dict: crop name -> {'season_a_start', 'season_a_end',
        'season_b_start', 'season_b_end', 'soil_type'}, taken from the first
        row found for each crop. Crops absent from ``dataset`` get no entry.
    """
    # Output key -> source column in ``dataset``
    column_for = {
        "season_a_start": 'Season A start(month)',
        "season_a_end": 'Season A end',
        "season_b_start": 'Season B start(month)',
        "season_b_end": 'Season B end(month)',
        "soil_type": 'Soil type',
    }

    # Keep only the requested crops, one (the first) row per crop
    is_requested = dataset['Crop'].isin(crop_names)
    first_rows = dataset[is_requested].drop_duplicates(subset=['Crop'])

    return {
        record['Crop']: {key: record[column] for key, column in column_for.items()}
        for _, record in first_rows.iterrows()
    }

In [150]:
# Example request: the location to recommend crops for and the intended planting date
user_input = dict(
    district="Gasabo",
    sector="Jali",
    start_date_to_plant="2024-02-01",
)

In [151]:
# Function to predict best crops based on input features
def predict_best_crops(features, best_model, label_encoder, top_n=5):
    """Rank the most likely crops for one set of site conditions.

    Args:
        features: dict mapping each training column name to a scalar value.
        best_model: Fitted classifier. ``predict_proba`` is used when the
            model exposes it, otherwise ``predict``.
        label_encoder: Object whose ``inverse_transform`` maps encoded class
            labels back to crop names.
        top_n: Maximum number of crops to return; values <= 0 give an empty
            list.

    Returns:
        list of ``(crop_name, probability)`` tuples, most likely first. For a
        model without ``predict_proba`` the list holds the single predicted
        crop paired with NaN, so callers can always unpack pairs.

    Raises:
        KeyError: If ``features`` lacks one of the expected columns.
    """
    # Convert features dictionary into a DataFrame for prediction
    features_df = pd.DataFrame([features])

    # Ensure the feature order matches what the model was trained on
    feature_columns = [
        'Altitude (masl)', 'temperature (C) ', 'pH', 'N', 'P', 'K',
        'Crop water need (mm/total growing period)', 'Humidity(%)'
    ]
    features_df = features_df[feature_columns]

    # Look the method up first instead of wrapping the call in
    # `except AttributeError`, which would also hide AttributeErrors raised
    # inside predict_proba itself.
    predict_proba = getattr(best_model, 'predict_proba', None)
    if predict_proba is None:
        predicted_class = best_model.predict(features_df)[0]
        crop_name = label_encoder.inverse_transform([predicted_class])[0]
        # No probability is available for a hard prediction.
        return [(crop_name, float('nan'))]

    predictions_proba = np.asarray(predict_proba(features_df)[0])

    # Reverse first, then slice: `[-top_n:]` with top_n == 0 would select
    # every class instead of none.
    keep = max(int(top_n), 0)
    top_positions = np.argsort(predictions_proba)[::-1][:keep]

    # Probability columns follow the model's `classes_`; translate column
    # positions to encoded labels when the model exposes that attribute.
    class_ids = getattr(best_model, 'classes_', None)
    if class_ids is None:
        top_labels = top_positions
    else:
        top_labels = np.asarray(class_ids)[top_positions]

    # Get the crop names and their corresponding probabilities
    top_crops = label_encoder.inverse_transform(top_labels)
    top_probabilities = predictions_proba[top_positions]

    return list(zip(top_crops, top_probabilities))

# Example: look up the region row matching the requested district and sector
filtered_region = region_data[
    region_data['District'].eq(user_input['district'])
    & region_data['Sector'].eq(user_input['sector'])
]

if not filtered_region.empty:
    # Read the measurements of the first matching row
    humidity = filtered_region['Humidity(%)'].iloc[0]
    temperature = filtered_region['Temperature (°C)'].iloc[0]
    pH = filtered_region['pH_avg'].iloc[0]
    potassium_min = filtered_region['Potassium(ppm)'].iloc[0]
    phosphorus_min = filtered_region['Phosphorus(ppm)'].iloc[0]
    nitrogen = filtered_region['Nitrogen(%)'].iloc[0]
    crop_water_need = filtered_region['Average Rainfall (mm)'].iloc[0]
    altitude = filtered_region['Elevation'].iloc[0]

    # Map the region measurements onto the training column names
    # (average rainfall is what gets passed as the crop-water-need feature)
    features = dict(zip(
        [
            'Altitude (masl)',
            'temperature (C) ',
            'pH',
            'N',
            'P',
            'K',
            'Crop water need (mm/total growing period)',
            'Humidity(%)',
        ],
        [
            altitude,
            temperature,
            pH,
            nitrogen,
            phosphorus_min,
            potassium_min,
            crop_water_need,
            humidity,
        ],
    ))

    # Ranked (crop, probability) pairs for this location
    top_crops = predict_best_crops(features, best_model, label_encoder)

    # Season and soil details for just the predicted crops
    crop_names = [crop for crop, _ in top_crops]
    crop_details = fetch_crop_data(crop_names, sampled_dataset)

    # Assemble the JSON payload: one entry per crop, enriched with its details when known
    output = {"predicted_crops": []}
    output["predicted_crops"].extend(
        {
            "crop_name": crop,
            "probability": float(probability),
            **crop_details.get(crop, {}),
        }
        for crop, probability in top_crops
    )

    print(json.dumps(output, indent=4))
else:
    print("No data found for the specified district and sector.")

{
    "predicted_crops": [
        {
            "crop_name": "Spinach",
            "probability": 0.3290771543979645,
            "season_a_start": 9,
            "season_a_end": "12-Nov",
            "season_b_start": 2,
            "season_b_end": "5-Apr",
            "soil_type": "well-draining loamy soil"
        },
        {
            "crop_name": "Tomato",
            "probability": 0.2253754585981369,
            "season_a_start": 9,
            "season_a_end": "3-Jan",
            "season_b_start": 2,
            "season_b_end": "8-Jun",
            "soil_type": "Well drained sandy, loam, and clay loam soils"
        },
        {
            "crop_name": "Mango",
            "probability": 0.17884708940982819,
            "season_a_start": 9,
            "season_a_end": "2-Dec",
            "season_b_start": 2,
            "season_b_end": "7-May",
            "soil_type": " well-draining, sandy loam, loamy soil"
        },
        {
            "crop_name": "Tea",
         

In [152]:
def predict_best_crops(features, best_model, label_encoder, top_n=5):
    """Return the names of the most likely crops for one set of site conditions.

    NOTE: this definition replaces the ``predict_best_crops`` defined in an
    earlier cell. That version returns ``(crop, probability)`` pairs; this one
    returns crop names only, so cells written for the earlier version must not
    be re-run after this cell without re-running the earlier definition.

    Args:
        features: dict mapping each training column name to a scalar value.
        best_model: Fitted classifier. ``predict_proba`` is used when the
            model exposes it, otherwise ``predict``.
        label_encoder: Object whose ``inverse_transform`` maps encoded class
            labels back to crop names.
        top_n: Maximum number of crops to return; values <= 0 give an empty
            result.

    Returns:
        Sequence of crop names, most likely first: whatever
        ``label_encoder.inverse_transform`` returns for the selected labels,
        or a one-element list when the model has no ``predict_proba``.

    Raises:
        KeyError: If ``features`` lacks one of the expected columns.
    """
    # Convert features dictionary into a DataFrame for prediction
    features_df = pd.DataFrame([features])

    # Ensure the feature order matches what the model was trained on
    feature_columns = [
        'Altitude (masl)', 'temperature (C) ', 'pH', 'N', 'P', 'K',
        'Crop water need (mm/total growing period)', 'Humidity(%)'
    ]
    features_df = features_df[feature_columns]

    # Look the method up first instead of wrapping the call in
    # `except AttributeError`, which would also hide AttributeErrors raised
    # inside predict_proba itself.
    predict_proba = getattr(best_model, 'predict_proba', None)
    if predict_proba is None:
        # If the model doesn't support predict_proba, fall back to predict
        predicted_class = best_model.predict(features_df)[0]
        return [label_encoder.inverse_transform([predicted_class])[0]]

    predictions_proba = np.asarray(predict_proba(features_df)[0])

    # Reverse first, then slice: `[-top_n:]` with top_n == 0 would select
    # every class instead of none.
    keep = max(int(top_n), 0)
    top_positions = np.argsort(predictions_proba)[::-1][:keep]

    # Probability columns follow the model's `classes_`; translate column
    # positions to encoded labels when the model exposes that attribute.
    class_ids = getattr(best_model, 'classes_', None)
    if class_ids is None:
        top_labels = top_positions
    else:
        top_labels = np.asarray(class_ids)[top_positions]

    # Get the crop names using the label encoder
    top_crops = label_encoder.inverse_transform(top_labels)

    return top_crops

# One record per sector, collected here and turned into a table afterwards
results = []

# Walk through every row of the region data
for index, row in region_data.iterrows():
    # Site measurements for this row
    humidity, temperature, pH = row['Humidity(%)'], row['Temperature (°C)'], row['pH_avg']
    potassium_min, phosphorus_min = row['Potassium(ppm)'], row['Phosphorus(ppm)']
    nitrogen = row['Nitrogen(%)']
    crop_water_need = row['Average Rainfall (mm)']
    altitude = row['Elevation']

    # Same measurements, keyed by the column names the model was trained on
    features = {
        'Altitude (masl)': altitude,
        'temperature (C) ': temperature,
        'pH': pH,
        'N': nitrogen,
        'P': phosphorus_min,
        'K': potassium_min,
        'Crop water need (mm/total growing period)': crop_water_need,
        'Humidity(%)': humidity,
    }

    # Ranked crop names for this sector (default top_n of 5)
    top_crops = predict_best_crops(features, best_model, label_encoder)

    # District, sector and five ranked columns; ranks beyond the prediction list stay None
    record = {'District': row['District'], 'Sector': row['Sector']}
    for rank in range(5):
        record[f'Top {rank + 1} Crop'] = top_crops[rank] if rank < len(top_crops) else None
    results.append(record)

# Build the table once from the collected records
results_df = pd.DataFrame(results)

# Write the table to disk without the index column
csv_filename = 'best_crops_per_sector_DECISION_crops.csv'
results_df.to_csv(csv_filename, index=False)

In [153]:
# Select the region rows matching the requested district and sector
filtered_region = region_data[
    (region_data['District'] == user_input['district']) &
    (region_data['Sector'] == user_input['sector'])
]

# Nothing to predict when the district/sector pair is not in the region data
if filtered_region.empty:
    print("No data found for the specified district and sector.")
else:
    # Read the measurements of the first matching row
    humidity = filtered_region['Humidity(%)'].values[0]
    temperature = filtered_region['Temperature (°C)'].values[0]
    pH = filtered_region['pH_avg'].values[0]
    potassium_min = filtered_region['Potassium(ppm)'].values[0]
    phosphorus_min = filtered_region['Phosphorus(ppm)'].values[0]
    nitrogen = filtered_region['Nitrogen(%)'].values[0]
    crop_water_need = filtered_region['Average Rainfall (mm)'].values[0]
    altitude = filtered_region['Elevation'].values[0]

    # Key the region measurements by the training column names.
    # The dict is written in the same column order as the training features;
    # average rainfall is what gets passed as the crop-water-need feature.
    features = {
        'Altitude (masl)': altitude,
        'temperature (C) ': temperature,
        'pH': pH,
        'N': nitrogen,
        'P': phosphorus_min,
        'K': potassium_min,
        'Crop water need (mm/total growing period)': crop_water_need,
        'Humidity(%)': humidity
    }

    print("Extracted Features:", features)

    # Single-row DataFrame: the dict is wrapped in a list so it becomes one row
    input_data = pd.DataFrame([features])  # Wrap in a list to create a DataFrame

    # Hard class prediction (encoded label) for the single input row
    predictions = best_model.predict(input_data)

    # Turn the encoded label back into the crop name
    predictions_decoded = label_encoder.inverse_transform(predictions)

    # Class probabilities for the input row, one column per model class
    predicted_probabilities = best_model.predict_proba(input_data)

    # Crop names for the model's class ids, used as column labels below
    decoded_classes = label_encoder.inverse_transform(best_model.classes_)

    # Probability table: one column per crop name, plus a 'Crop' column holding the hard prediction
    crop_probabilities = pd.DataFrame(predicted_probabilities, columns=decoded_classes)
    crop_probabilities['Crop'] = predictions_decoded

    # De-duplicate identical rows; input_data holds a single row, so this is
    # expected to leave the table unchanged
    crop_probabilities = crop_probabilities.drop_duplicates(subset=['Crop'] + list(decoded_classes), keep='first')

    # Take the probabilities from the row labelled 0 as a Series indexed by crop name
    first_row_probabilities = crop_probabilities.loc[0, decoded_classes]

    # Coerce to a numeric dtype so nlargest can be applied (non-numeric entries become NaN)
    first_row_probabilities = pd.to_numeric(first_row_probabilities, errors='coerce')

    # Five highest probabilities as a two-column frame: crop name and probability
    top_crops = first_row_probabilities.nlargest(5).reset_index()
    top_crops.columns = ['Crop', 'Probability']

    # Make both merge keys plain strings.
    # NOTE: the second line modifies crop_data in place.
    top_crops['Crop'] = top_crops['Crop'].astype(str)
    crop_data['Crop'] = crop_data['Crop'].astype(str)

    # Inner merge: keeps the crop_data rows whose 'Crop' is among the top crops
    # and attaches the probability to each of them
    top_crops_info = crop_data.merge(top_crops, on='Crop', how='inner')

    # An empty result means none of the predicted names matched crop_data['Crop']
    if top_crops_info.empty:
        print("No matching crops found after merging. Please check the crop names.")
    else:
        # Keep the first row per crop, then order from most to least likely
        top_crops_info = top_crops_info.drop_duplicates(subset='Crop', keep='first')
        top_crops_info = top_crops_info.sort_values(by='Probability', ascending=False)

        # Show crop name, crop type and probability
        print(top_crops_info[['Crop', 'Crop type', 'Probability']])

Extracted Features: {'Altitude (masl)': 2049, 'temperature (C) ': 14.89676273, 'pH': 5.755, 'N': 100.5885138, 'P': 12.46373804, 'K': 47.92388791, 'Crop water need (mm/total growing period)': 1169.885808, 'Humidity(%)': 73.8558148}
        Crop      Crop type  Probability
300  Spinach    Leafy Green     0.329077
100   Tomato      Vegetable     0.225375
0      Mango          Fruit     0.178847
400      Tea  Beverage Crop     0.057037
200  Cabbage      Vegetable     0.036194


In [154]:
# # Step 2: Define conditions to filter crops based on region behavior
# def filter_crops_by_region(crops_df, regions_df):
#     # Define the columns to match
#     common_columns = [
#         'Altitude (masl)', 'temperature (C) ', 'pH', 'Humidity(%)', 'N', 'P', 'K'
#     ]
    
#     # List to store matching crops
#     matching_crops = []
    
#     for _, region_row in regions_df.iterrows():
#         # Apply filters for each region's conditions
#         filtered_crops = crops_df[
#             (crops_df['Altitude (masl)'] >= region_row['Elevation'] - 100) &
#             (crops_df['Altitude (masl)'] <= region_row['Elevation'] + 100) &
#             (crops_df['temperature (C) '] >= region_row['Temperature (°C)'] - 2) &
#             (crops_df['temperature (C) '] <= region_row['Temperature (°C)'] + 2) &
#             (crops_df['pH'] >= region_row['pH_avg'] - 0.5) &
#             (crops_df['pH'] <= region_row['pH_avg'] + 0.5) &
#             (crops_df['Humidity(%)'] >= region_row['Humidity(%)'] - 5) &
#             (crops_df['Humidity(%)'] <= region_row['Humidity(%)'] + 5) &
#             (crops_df['N'] >= region_row['Nitrogen(%)'] - 10) &
#             (crops_df['P'] >= region_row['Phosphorus(ppm)'] - 5) &
#             (crops_df['K'] >= region_row['Potassium(ppm)'] - 5)
#         ]
        
#         # Append matching crops to the list
#         matching_crops.extend(filtered_crops['Crop'].unique())
    
#     return list(set(matching_crops))

# # Step 3: Run the filtering function
# matching_crops = filter_crops_by_region(sampled_dataset, region_data)

In [155]:
# # Step 4: Filter the sampled_dataset for the matching crops
# filtered_dataset = sampled_dataset[sampled_dataset['Crop'].isin(matching_crops)]

In [156]:
# # Step 5: Visualize the matching crops based on attributes
# def visualize_matching_crops(filtered_df):
#     plt.figure(figsize=(12, 8))
    
#     # Visualize Altitude vs Temperature colored by Crop Type
#     sns.scatterplot(
#         data=filtered_df,
#         x='Altitude (masl)',
#         y='temperature (C) ',
#         hue='Crop type',
#         palette='viridis'
#     )
#     plt.title("Matching Crops: Altitude vs Temperature")
#     plt.xlabel("Altitude (masl)")
#     plt.ylabel("Temperature (C)")
#     plt.legend(title='Crop Type', bbox_to_anchor=(1, 1))
#     plt.show()
    
#     # Box plot for pH levels across different crops
#     plt.figure(figsize=(12, 6))
#     sns.boxplot(data=filtered_df, x='Crop', y='pH')
#     plt.xticks(rotation=45)
#     plt.title("pH Levels of Matching Crops")
#     plt.ylabel("pH")
#     plt.xlabel("Crop")
#     plt.show()

In [157]:
# # Step 6: Visualize the filtered dataset
# visualize_matching_crops(filtered_dataset)

In [158]:
# filtered_dataset.to_csv('filtered_sampled_dataset.csv', index=False)