In [1241]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
import joblib

In [1242]:
crop_data = pd.read_excel('datasets/crop_recommendation_new.xlsx', sheet_name=0)
region_data = pd.read_excel('datasets/crop_recommendation_new.xlsx', sheet_name=4)

In [1243]:
# remove the unnecessary space
crop_data.columns = crop_data.columns.str.strip()

In [1244]:
total_sectors_per_district = region_data['District'].value_counts().reset_index()

In [1245]:
def clean_potassium_range(potassium_range):
    if isinstance(potassium_range, str):  # Ensure it's a string
        potassium_range = potassium_range.strip()  # Clean up leading/trailing spaces
        
        if '<' in potassium_range:
            upper_bound = float(potassium_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in potassium_range:
            lower_bound = float(potassium_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in potassium_range:
            return tuple(map(float, potassium_range.split('-')))
        else:
            return (float(potassium_range), float(potassium_range))
    else:
        # Handle case where input is already a float
        return (float(potassium_range), float(potassium_range))

In [1246]:
def clean_pH_range(ph_range):
    if isinstance(ph_range, str):  # Ensure it's a string
        ph_range = ph_range.strip()  # Clean up leading/trailing spaces
        
        if '<' in ph_range:
            upper_bound = float(ph_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in ph_range:
            lower_bound = float(ph_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in ph_range:
            return tuple(map(float, ph_range.split('-')))
        else:
            return (float(ph_range), float(ph_range))
    else:
        # Handle case where input is already a float
        return (float(ph_range), float(ph_range))

In [1247]:
def clean_phosphorus_range(p_range):
    if isinstance(p_range, str):
        if '<' in p_range:
            upper_bound = float(p_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in p_range:
            lower_bound = float(p_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in p_range:
            return tuple(map(float, p_range.split('-')))
        else:
            return (float(p_range), float(p_range))
    else:
        return (float(p_range), float(p_range))

In [1248]:
def clean_humidity_range(humidity_range):
    if isinstance(humidity_range, str):
        if '<' in humidity_range:
            upper_bound = float(humidity_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in humidity_range:
            lower_bound = float(humidity_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in humidity_range:
            return tuple(map(float, humidity_range.split('-')))
        else:
            return (float(humidity_range), float(humidity_range))
    else:
        return (float(humidity_range), float(humidity_range))

In [1249]:
def clean_temperature_range(temp_range):
    if isinstance(temp_range, str):
        if '<' in temp_range:
            upper_bound = float(temp_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in temp_range:
            lower_bound = float(temp_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in temp_range:
            return tuple(map(float, temp_range.split('-')))
        else:
            return (float(temp_range), float(temp_range))
    else:
        return (float(temp_range), float(temp_range))

In [1250]:
def clean_soil_moisture_range(moisture_range):
    if isinstance(moisture_range, str):
        if '<' in moisture_range:
            upper_bound = float(moisture_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in moisture_range:
            lower_bound = float(moisture_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in moisture_range:
            return tuple(map(float, moisture_range.split('-')))
        else:
            return (float(moisture_range), float(moisture_range))
    else:
        return (float(moisture_range), float(moisture_range))

In [1251]:
# # Apply cleaning function to split potassium range into min and max
# region_data[['potassium_min', 'potassium_max']] = region_data['Potassium(ppm)'].apply(clean_potassium_range).apply(pd.Series)

# # Ensure the min and max columns are float type
# region_data['potassium_min'] = region_data['potassium_min'].astype(float)
# region_data['potassium_max'] = region_data['potassium_max'].astype(float)

In [1252]:
# # Apply cleaning function to split pH range into min and max
# region_data[['phosphorus_min', 'phosphorus_max']] = region_data['Phosphorous(ppm)'].apply(clean_phosphorus_range).apply(pd.Series)

# # Ensure the min and max columns are float type
# region_data['phosphorus_min'] = region_data['phosphorus_min'].astype(float)
# region_data['phosphorus_max'] = region_data['phosphorus_max'].astype(float)

In [1253]:
# Apply cleaning function to split Phosphorous range into min and max
region_data[['pH_min', 'pH_max']] = region_data['pH'].apply(clean_pH_range).apply(pd.Series)

# Ensure the min and max columns are float type
region_data['pH_min'] = region_data['pH_min'].astype(float)
region_data['pH_max'] = region_data['pH_max'].astype(float)

In [1254]:
# Clean and convert the pH column in crop_data
def convert_crop_ph(ph_value):
    if isinstance(ph_value, str) and '-' in ph_value:
        # If pH is in range format "6.0-7.0", convert it to the midpoint
        ph_min, ph_max = map(float, ph_value.split('-'))
        return (ph_min + ph_max) / 2  # Using the average of the range
    else:
        return float(ph_value)  # Ensure single values are floats

crop_data['pH'] = crop_data['pH'].apply(convert_crop_ph)

In [1255]:
# Define a function to clean the range values or '>' values
def clean_range_or_greater_than(value):
    if isinstance(value, str):
        if '-' in value:
            # If the value is a range (e.g., '680-1400'), calculate the midpoint
            value_min, value_max = map(float, value.split('-'))
            return (value_min + value_max) / 2
        elif '>' in value:
            # If the value starts with '>' (e.g., '>4500'), remove the '>' and return the numeric part
            return float(value.replace('>', '').strip())
    # If the value is already a number (or can be converted), just return it
    try:
        return pd.to_numeric(value, errors='coerce')
    except:
        return None

In [1256]:
# Apply the cleaning function to the relevant columns
crop_data['Altitude (masl)'] = crop_data['Altitude (masl)'].apply(clean_range_or_greater_than)
crop_data['Annual rainfall (mm)'] = crop_data['Annual rainfall (mm)'].apply(clean_range_or_greater_than)

In [1257]:
def match_ph_range(row, region_data):
    ph_min, ph_max = row['pH_cleaned']
    
    # Match if any overlap exists between the pH range and the potassium range in region_data
    return region_data[
        (region_data['pH_min'] <= ph_max) &
        (region_data['pH_max'] >= ph_min)
    ]

In [1258]:
def match_potassium_range(row, region_data):
    k_min, k_max = row['K_cleaned']
    
    # Match if any overlap exists between the K range and the potassium range in region_data
    return region_data[
        (region_data['potassium_min'] <= k_max) &  # Potassium min must be <= K max
        (region_data['potassium_max'] >= k_min)    # Potassium max must be >= K min
    ]

In [1259]:
# Merge phosphorus data with region_data
def match_phosphorus_range(row, region_data):
    p_min, p_max = row['P_cleaned']
    
    return region_data[
        (region_data['phosphorus_min'] <= p_max) &
        (region_data['phosphorus_max'] >= p_min)
    ]

In [1260]:
# Function to parse the humidity range from strings like '60-70' to numerical min and max
def parse_humidity_range(humidity_range):
    # Convert to string if it's not already, and handle cases where it might be NaN or a float
    if isinstance(humidity_range, float) or pd.isna(humidity_range):
        return None, None
    try:
        min_value, max_value = map(float, str(humidity_range).split('-'))
        return min_value, max_value
    except ValueError:
        # If the split fails, return None for both values
        return None, None

# Apply the parsing function to create min and max columns in crop_data
crop_data[['Humidity_min', 'Humidity_max']] = crop_data['Humidity(%)'].apply(
    lambda x: pd.Series(parse_humidity_range(x))
)

In [1261]:
# Function to parse the humidity range from strings like '60-70' to numerical min and max
def parse_temperature_range(temperature_range):
    # Convert to string if it's not already, and handle cases where it might be NaN or a float
    if isinstance(temperature_range, float) or pd.isna(temperature_range):
        return None, None
    try:
        min_value, max_value = map(float, str(temperature_range).split('-'))
        return min_value, max_value
    except ValueError:
        # If the split fails, return None for both values
        return None, None

# Apply the parsing function to create min and max columns in crop_data
crop_data[['Temperature_min', 'Temperature_max']] = crop_data['temperature (C)'].apply(
    lambda x: pd.Series(parse_temperature_range(x))
)

In [1262]:
# Function to parse the humidity range from strings like '60-70' to numerical min and max
def parse_moisture_range(moisture_range):
    # Convert to string if it's not already, and handle cases where it might be NaN or a float
    if isinstance(moisture_range, float) or pd.isna(moisture_range):
        return None, None
    try:
        min_value, max_value = map(float, str(moisture_range).split('-'))
        return min_value, max_value
    except ValueError:
        # If the split fails, return None for both values
        return None, None

# Apply the parsing function to create min and max columns in crop_data
crop_data[['Moisture_min', 'Moisture_max']] = crop_data['Optimum soil moisture'].apply(
    lambda x: pd.Series(parse_moisture_range(x))
)

In [1263]:
# Function to convert ranges into mean values
def convert_to_mean(value):
    if isinstance(value, float) or pd.isna(value):
        # If the value is already a float (or NaN), return it as is
        return value
    elif '-' in value:
        # If the value contains a range like "500-700", compute the mean
        low, high = value.split('-')
        return (float(low) + float(high)) / 2
    elif '>' in value:
        # If the value contains ">", return the minimum possible value plus a small buffer (e.g., 50)
        return float(value.replace('>', '').strip()) + 50
    else:
        # For single values, return as float
        return float(value.strip())

# Apply conversion to 'Annual rainfall (mm)' column
crop_data['Annual rainfall (mm)'] = crop_data['Annual rainfall (mm)'].apply(convert_to_mean)

# Apply conversion to 'Irrigation required(%)' column
crop_data['Irrigation required(%)'] = crop_data['Irrigation required(%)'].apply(convert_to_mean)

# Apply conversion to 'Crop water need (mm/total growing period)' column
crop_data['Crop water need (mm/total growing period)'] = crop_data['Crop water need (mm/total growing period)'].apply(convert_to_mean)

# Apply conversion to 'Growing period (days)' column
crop_data['Growing period (days)'] = crop_data['Growing period (days)'].apply(convert_to_mean)

In [1264]:
# Mapping similar soil types to common categories
soil_mapping = {
    r'.*well[-\s]*drain.*': 'Well-drained',
    r'.*sandy.*loam.*': 'Sandy Loam',
    r'.*loam.*clay.*': 'Clay Loam',
    r'.*loamy.*': 'Loamy',
    r'.*sandy.*': 'Sandy',
    r'.*volcanic.*': 'Volcanic',
    r'.*alluvial.*': 'Alluvial'
}

# Clean the 'Soil type' column using regex mapping
crop_data['Soil type'] = crop_data['Soil type'].str.lower()
for pattern, replacement in soil_mapping.items():
    crop_data['Soil type'] = crop_data['Soil type'].str.replace(pattern, replacement, regex=True)

In [1265]:
# Function to split rows with multiple start/end months into separate rows
def expand_crop_calendar(crop_data):
    rows = []
    for _, row in crop_data.iterrows():
        # Convert to string if it's not already, and replace NaN with an empty string
        start_months = str(row['Crop calendar start (month)']).replace(' ', '').split(',')
        end_months = str(row['Crop calendar end (month)']).replace(' ', '').split(',')

        # Ensure we match start and end seasons properly
        for start, end in zip(start_months, end_months):
            new_row = row.copy()
            new_row['Crop calendar start (month)'] = start.strip()
            new_row['Crop calendar end (month)'] = end.strip()
            rows.append(new_row)
    
    return pd.DataFrame(rows)

# Apply the function to expand the dataset
crop_data = expand_crop_calendar(crop_data)

In [1266]:
columns_to_remove = [
    'Id','Crop suitability', 'Acid Saturation(%)', 'AcidSat', 'Boron (ppm)', 'Calcium(%)', 
    'Calcium(ppm)', 'Copper (ppm)', 'Magnessium(%)', 'Magnessium(ppm)', 
    'Manganese(ppm)', 'Manganese', 'Organic Matter(%)', 
    'Potassium (%)', 'Sulphur (ppm)', 'Zinc (ppm)', 'temperature (C)', 'pH', 'Optimum soil moisture', 'Humidity(%)'
]

crop_columns_to_remove = [
    'Id','Crop suitability','Optimum soil moisture','temperature (C)','Humidity(%)'
]

In [1267]:
region_data = region_data.drop(columns=columns_to_remove, errors='ignore')

crop_data = crop_data.drop(columns=crop_columns_to_remove, errors='ignore')

Remove NaN from region data
-----

In [1268]:
numerical_columns = ['Humidity', 'Rainfall', 'Temperature', 'Elevation', 'Soil_Moisture', 'Potassium(ppm)', 'Phosphorus(ppm)', 'pH_min', 'pH_max']
for col in numerical_columns:
    region_data[col].fillna(region_data[col].mean(), inplace=True)

# For categorical columns, fill with the mode
categorical_columns = ['Province', 'District', 'Sector']
for col in categorical_columns:
    region_data[col].fillna(region_data[col].mode()[0], inplace=True)

region_data = region_data.drop(columns='Sect_ID', errors='ignore')

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  region_data[col].fillna(region_data[col].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  region_data[col].fillna(region_data[col].mode()[0], inplace=True)


Remove NaN from crop data
-----

In [1269]:
crop_data['pH'].fillna(7, inplace=True)
crop_data['Irrigation required(%)'].fillna(0, inplace=True)

# Sample function to convert range strings into their averages
def convert_range_to_average(value):
    if isinstance(value, str) and '-' in value:
        # Split the range and convert to float
        low, high = value.split('-')
        return (float(low) + float(high)) / 2
    else:
        # Return the value as is (converting to float if it's numeric)
        try:
            return float(value)
        except ValueError:
            return np.nan

# Apply this function to the N, P, and K columns
crop_data['N'] = crop_data['N'].apply(convert_range_to_average)
crop_data['P'] = crop_data['P'].apply(convert_range_to_average)
crop_data['K'] = crop_data['K'].apply(convert_range_to_average)


crop_data["N"].fillna(crop_data["N"].min(), inplace=True)
crop_data["P"].fillna(crop_data["P"].min(), inplace=True)
crop_data["K"].fillna(crop_data["K"].min(), inplace=True)

# Fill NaN values for min columns with their respective minimum values
crop_data["Humidity_min"].fillna(crop_data["Humidity_min"].min(), inplace=True)
crop_data["Temperature_min"].fillna(crop_data["Temperature_min"].min(), inplace=True)
crop_data["Moisture_min"].fillna(crop_data["Moisture_min"].min(), inplace=True)

# Fill NaN values for max columns with their respective maximum values
crop_data["Humidity_max"].fillna(crop_data["Humidity_max"].max(), inplace=True)
crop_data["Temperature_max"].fillna(crop_data["Temperature_max"].max(), inplace=True)
crop_data["Moisture_max"].fillna(crop_data["Moisture_max"].max(), inplace=True)

# Fill NaN values for Altitude and Annual Rainfall with their mean values
crop_data["Altitude (masl)"].fillna(crop_data["Altitude (masl)"].mean(), inplace=True)
crop_data["Annual rainfall (mm)"].fillna(crop_data["Annual rainfall (mm)"].mean(), inplace=True)

crop_data["Crop water need (mm/total growing period)"].fillna(0, inplace=True)
crop_data["Growing period (days)"].fillna(0, inplace=True)

crop_data["Soil type"].fillna('Loam', inplace=True)
crop_data["Crop type"].fillna('Unknown', inplace=True)
crop_data["Crop"].fillna('Unknown', inplace=True)   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  crop_data['pH'].fillna(7, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  crop_data['Irrigation required(%)'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

In [1270]:
# Calculate averages for min and max columns to get single representative values
crop_data['Humidity_avg'] = (crop_data['Humidity_min'] + crop_data['Humidity_max']) / 2
crop_data['Temperature_avg'] = (crop_data['Temperature_min'] + crop_data['Temperature_max']) / 2
crop_data['Moisture_avg'] = (crop_data['Moisture_min'] + crop_data['Moisture_max']) / 2

In [1271]:
models = {
    'Random Forest': RandomForestClassifier(),
    'Decision Tree': DecisionTreeClassifier(),
    'SVM': SVC(probability=True),
    'Naive Bayes': GaussianNB()
    # 'XGBoost': XGBClassifier()
}

In [1272]:

# # Prepare the dataset for training
# train_features = crop_data[['Humidity_avg', 'Temperature_avg', 'pH', 'K', 'P', 'N']]
# target = crop_data['Crop']

# # Initialize a dictionary to hold model accuracies
# model_accuracies = {model_name: [] for model_name in models.keys()}

# # Train and evaluate each model using the entire dataset
# for model_name, model in models.items():
#     # Fit the model on the entire dataset
#     model.fit(train_features, target)
    
#     # Predict on the same dataset (since you're using the entire dataset)
#     y_pred = model.predict(train_features)
    
#     # Calculate accuracy using the entire dataset
#     accuracy = accuracy_score(target, y_pred)
#     model_accuracies[model_name].append(accuracy)

# # Calculate the mean accuracy for each model
# mean_accuracies = {model_name: sum(accuracies) / len(accuracies) for model_name, accuracies in model_accuracies.items()}

# # Determine the best model based on mean accuracy
# best_model_name = max(mean_accuracies, key=mean_accuracies.get)
# best_model = models[best_model_name]

# # Save the best model using joblib
# joblib.dump(best_model, 'models/best_model4.pkl')

# # Print model accuracies
# for model_name, accuracies in model_accuracies.items():
#     print(f"{model_name} Mean Accuracy: {sum(accuracies) / len(accuracies):.2f}")

# print(f"\nBest Model: {best_model_name} with Mean Accuracy: {mean_accuracies[best_model_name]:.2f}")

In [1273]:
# Prepare the dataset for training
train_features = crop_data[['Humidity_avg', 'Temperature_avg', 
                             'pH', 'K', 'P', 'N']]
target = crop_data['Crop']

In [1274]:
# Initialize StratifiedKFold
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [1275]:
# Initialize a dictionary to hold model accuracies
model_accuracies = {model_name: [] for model_name in models.keys()}

In [1276]:
# Train and evaluate each model using StratifiedKFold
for model_name, model in models.items():
    for train_index, test_index in skf.split(train_features, target):
        X_train, X_test = train_features.iloc[train_index], train_features.iloc[test_index]
        y_train, y_test = target.iloc[train_index], target.iloc[test_index]
        
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        model_accuracies[model_name].append(accuracy)



In [1277]:
# Calculate the mean accuracy for each model
mean_accuracies = {model_name: sum(accuracies)/len(accuracies) for model_name, accuracies in model_accuracies.items()}

In [1278]:
# Determine the best model
best_model_name = max(mean_accuracies, key=mean_accuracies.get)
best_model = models[best_model_name]
joblib.dump(best_model, 'models/best_model4.pkl')

['models/best_model4.pkl']

In [1279]:
# Print model accuracies
for model_name, accuracies in model_accuracies.items():
    print(f"{model_name} Mean Accuracy: {sum(accuracies)/len(accuracies):.2f}")

print(f"\nBest Model: {best_model_name} with Mean Accuracy: {mean_accuracies[best_model_name]:.2f}")

Random Forest Mean Accuracy: 0.80
Decision Tree Mean Accuracy: 0.80
SVM Mean Accuracy: 0.39
Naive Bayes Mean Accuracy: 0.80

Best Model: Random Forest with Mean Accuracy: 0.80


In [1280]:
# User input
user_input = {
    "district": "Kicukiro",
    "sector": "Nyarugunga",
    "start_date_to_plant": "2024-02-01"
}

In [1281]:
# Filter region data based on user input
filtered_region = region_data[
    (region_data['District'] == user_input['district']) & 
    (region_data['Sector'] == user_input['sector'])
]

# Check if filtered_region is empty
if filtered_region.empty:
    print("No data found for the specified district and sector.")
else:
    # Extract relevant features from filtered_region
    humidity = filtered_region['Humidity'].values[0]
    temperature = filtered_region['Temperature'].values[0]
    pH = filtered_region['pH_min'].values[0]
    potassium_min = filtered_region['Potassium(ppm)'].values[0]
    phosphorus_min = filtered_region['Phosphorus(ppm)'].values[0]
    nitrogen = filtered_region['Nitrogen(%)'].values[0]

    # Prepare the features for the model using user_input
    features = {
        'Humidity_avg': humidity,
        'Temperature_avg': temperature,
        'pH': pH,
        'K': potassium_min,
        'P': phosphorus_min,
        'N': nitrogen
    }

    print(features)

    # Convert features to DataFrame
    input_data = pd.DataFrame([features])  # Wrap in a list to create a DataFrame

    # Predict suitability for the input conditions using the best model
    predictions = best_model.predict(input_data)

    # Get predicted probabilities for all classes (crops)
    predicted_probabilities = best_model.predict_proba(input_data)

    # Create a DataFrame with crops and their predicted probabilities
    crop_probabilities = pd.DataFrame(predicted_probabilities, columns=best_model.classes_)
    crop_probabilities['Crop'] = predictions

    # Ensure probabilities are in a numeric format
    crop_probabilities[best_model.classes_] = crop_probabilities[best_model.classes_].apply(pd.to_numeric, errors='coerce')

    # Remove duplicates based on Crop and the predicted probabilities
    crop_probabilities = crop_probabilities.drop_duplicates(subset=['Crop'] + list(best_model.classes_), keep='first')

    # Select the probabilities for the first row only
    first_row_probabilities = crop_probabilities.loc[0, best_model.classes_]

    # Ensure the series is numeric and call nlargest
    if first_row_probabilities.dtype == 'object':
        first_row_probabilities = pd.to_numeric(first_row_probabilities, errors='coerce')

    # Get the top crops based on predicted probabilities
    top_crops = first_row_probabilities.nlargest(5).reset_index()
    top_crops.columns = ['Crop', 'Probability']

    # Merge to get additional crop information if needed
    top_crops_info = crop_data.merge(top_crops, on='Crop', how='inner')

    # Remove duplicates in the final output to ensure each crop is displayed only once
    top_crops_info = top_crops_info.drop_duplicates(subset='Crop', keep='first')

    # Sort the top crops by Probability in descending order
    top_crops_info = top_crops_info.sort_values(by='Probability', ascending=False)

    # Output the result
    print(top_crops_info[['Crop', 'Crop type', 'Probability']])

{'Humidity_avg': 66.92674249, 'Temperature_avg': 20.90751468, 'pH': 6.01, 'K': 111.958804, 'P': 10.02317638, 'N': 39.94947467}
            Crop   Crop type  Probability
3        Cassava  Tuber/Root         0.30
0         Citrus       Fruit         0.12
7  Chilli pepper       Spice         0.12
1          Apple       Fruit         0.05
2      Onion dry     Unknown         0.05


In [1282]:
# Filter and display the row where the Crop is 'Tea'
tea_row = crop_data[crop_data['Crop'] == 'Cassava']
tea_row


Unnamed: 0,Crop,Crop type,Altitude (masl),Annual rainfall (mm),pH,Soil type,N,P,K,Crop water need (mm/total growing period),...,Crop calendar end (month),Humidity_min,Humidity_max,Temperature_min,Temperature_max,Moisture_min,Moisture_max,Humidity_avg,Temperature_avg,Moisture_avg
24,Cassava,Tuber/Root,1500.0,1250.0,6.0,Sandy Loam,120.0,12.0,84.0,1250.0,...,January,30.0,100.0,25.0,32.0,15.0,100.0,65.0,28.5,57.5
24,Cassava,Tuber/Root,1500.0,1250.0,6.0,Sandy Loam,120.0,12.0,84.0,1250.0,...,April,30.0,100.0,25.0,32.0,15.0,100.0,65.0,28.5,57.5
24,Cassava,Tuber/Root,1500.0,1250.0,6.0,Sandy Loam,120.0,12.0,84.0,1250.0,...,July,30.0,100.0,25.0,32.0,15.0,100.0,65.0,28.5,57.5
24,Cassava,Tuber/Root,1500.0,1250.0,6.0,Sandy Loam,120.0,12.0,84.0,1250.0,...,October,30.0,100.0,25.0,32.0,15.0,100.0,65.0,28.5,57.5


In [1283]:
# # Check if filtered_region is empty
# if filtered_region.empty:
#     print("No data found for the specified district and sector.")
# else:
#     # Extract relevant features for filtering crops
#     humidity = filtered_region['Humidity'].values[0]
#     temperature = filtered_region['Temperature'].values[0]
#     pH = filtered_region['pH_min'].values[0]
#     soil_moisture = filtered_region['Soil_Moisture'].values[0]

#     # Filtering crops based on the features
#     filtered_crops = crop_data[
#         (crop_data['Humidity_max'] >= humidity) & 
#         (crop_data['Humidity_min'] <= humidity) &
#         (crop_data['Temperature_max'] >= temperature) & 
#         (crop_data['Temperature_min'] <= temperature) &
#         (crop_data['pH'] >= pH) &
#         (crop_data['Soil type'].str.contains("Loamy|Sandy|Clay", case=False))  # Adjust soil type as necessary
#     ]

#      # Remove duplicates based on the 'Crop' column
#     filtered_crops = filtered_crops.drop_duplicates(subset=['Crop'])

#     # Sort the crops based on the Annual rainfall (or any other criterion)
#     top_crops = filtered_crops.sort_values(by='Annual rainfall (mm)', ascending=False).head(5)

#     # Output the result
#     print(top_crops[['Crop', 'Crop type', 'Annual rainfall (mm)']])
