In [393]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.preprocessing import MinMaxScaler
from scipy import stats
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedShuffleSplit, StratifiedKFold, cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, hamming_loss, classification_report
from sklearn.impute import SimpleImputer
from sklearn.naive_bayes import GaussianNB
import joblib

In [394]:
crop_data = pd.read_excel('datasets/crop_recommendation_new.xlsx', sheet_name=0)
region_data = pd.read_excel('datasets/202408_KI_Database (soil, crop, location, etc).xlsx', sheet_name=0)
test_data = pd.read_excel('datasets/crop_recommendation_new.xlsx', sheet_name=2)

Cleaning crop data
-----

In [395]:
# remove the unnecessary space
crop_data.columns = crop_data.columns.str.strip()

In [396]:
total_sectors_per_district = region_data['District'].value_counts().reset_index()

In [397]:
def clean_potassium_range(potassium_range):
    if isinstance(potassium_range, str):  # Ensure it's a string
        potassium_range = potassium_range.strip()  # Clean up leading/trailing spaces
        
        if '<' in potassium_range:
            upper_bound = float(potassium_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in potassium_range:
            lower_bound = float(potassium_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in potassium_range:
            return tuple(map(float, potassium_range.split('-')))
        else:
            return (float(potassium_range), float(potassium_range))
    else:
        # Handle case where input is already a float
        return (float(potassium_range), float(potassium_range))

In [398]:
def clean_pH_range(ph_range):
    if isinstance(ph_range, str):  # Ensure it's a string
        ph_range = ph_range.strip()  # Clean up leading/trailing spaces
        
        if '<' in ph_range:
            upper_bound = float(ph_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in ph_range:
            lower_bound = float(ph_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in ph_range:
            return tuple(map(float, ph_range.split('-')))
        else:
            return (float(ph_range), float(ph_range))
    else:
        # Handle case where input is already a float
        return (float(ph_range), float(ph_range))

In [399]:
def clean_phosphorus_range(p_range):
    if isinstance(p_range, str):
        if '<' in p_range:
            upper_bound = float(p_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in p_range:
            lower_bound = float(p_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in p_range:
            return tuple(map(float, p_range.split('-')))
        else:
            return (float(p_range), float(p_range))
    else:
        return (float(p_range), float(p_range))

In [400]:
def clean_humidity_range(humidity_range):
    if isinstance(humidity_range, str):
        if '<' in humidity_range:
            upper_bound = float(humidity_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in humidity_range:
            lower_bound = float(humidity_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in humidity_range:
            return tuple(map(float, humidity_range.split('-')))
        else:
            return (float(humidity_range), float(humidity_range))
    else:
        return (float(humidity_range), float(humidity_range))

In [401]:
def clean_temperature_range(temp_range):
    if isinstance(temp_range, str):
        if '<' in temp_range:
            upper_bound = float(temp_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in temp_range:
            lower_bound = float(temp_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in temp_range:
            return tuple(map(float, temp_range.split('-')))
        else:
            return (float(temp_range), float(temp_range))
    else:
        return (float(temp_range), float(temp_range))

In [402]:
def clean_soil_moisture_range(moisture_range):
    if isinstance(moisture_range, str):
        if '<' in moisture_range:
            upper_bound = float(moisture_range.split('<')[-1].strip())
            # Calculate the average of 0 and the upper bound
            return (0, (0 + upper_bound) / 2)
        elif '>' in moisture_range:
            lower_bound = float(moisture_range.split('>')[-1].strip())
            # Use an arbitrary high value to compute average, like double the lower_bound
            return (lower_bound, (lower_bound + lower_bound * 2) / 2)
        elif '-' in moisture_range:
            return tuple(map(float, moisture_range.split('-')))
        else:
            return (float(moisture_range), float(moisture_range))
    else:
        return (float(moisture_range), float(moisture_range))

In [403]:
# Clean and convert the pH column in crop_data
def convert_crop_ph(ph_value):
    if isinstance(ph_value, str) and '-' in ph_value:
        # If pH is in range format "6.0-7.0", convert it to the midpoint
        ph_min, ph_max = map(float, ph_value.split('-'))
        return (ph_min + ph_max) / 2  # Using the average of the range
    else:
        return float(ph_value)  # Ensure single values are floats

crop_data['pH'] = crop_data['pH'].apply(convert_crop_ph)

In [404]:
# Define a function to clean the range values or '>' values
def clean_range_or_greater_than(value):
    if isinstance(value, str):
        if '-' in value:
            # If the value is a range (e.g., '680-1400'), calculate the midpoint
            value_min, value_max = map(float, value.split('-'))
            return (value_min + value_max) / 2
        elif '>' in value:
            # If the value starts with '>' (e.g., '>4500'), remove the '>' and return the numeric part
            return float(value.replace('>', '').strip())
    # If the value is already a number (or can be converted), just return it
    try:
        return pd.to_numeric(value, errors='coerce')
    except:
        return None

In [405]:
# Apply the cleaning function to the relevant columns
crop_data['Altitude (masl)'] = crop_data['Altitude (masl)'].apply(clean_range_or_greater_than)
crop_data['Annual rainfall (mm)'] = crop_data['Annual rainfall (mm)'].apply(clean_range_or_greater_than)

In [406]:
# Function to parse the humidity range from strings like '60-70' to numerical min and max
def parse_humidity_range(humidity_range):
    # Convert to string if it's not already, and handle cases where it might be NaN or a float
    if isinstance(humidity_range, float) or pd.isna(humidity_range):
        return None, None
    try:
        min_value, max_value = map(float, str(humidity_range).split('-'))
        return min_value, max_value
    except ValueError:
        # If the split fails, return None for both values
        return None, None

# Apply the parsing function to create min and max columns in crop_data
crop_data[['Humidity_min', 'Humidity_max']] = crop_data['Humidity(%)'].apply(
    lambda x: pd.Series(parse_humidity_range(x))
)

In [407]:
# Function to parse the humidity range from strings like '60-70' to numerical min and max
def parse_temperature_range(temperature_range):
    # Convert to string if it's not already, and handle cases where it might be NaN or a float
    if isinstance(temperature_range, float) or pd.isna(temperature_range):
        return None, None
    try:
        min_value, max_value = map(float, str(temperature_range).split('-'))
        return min_value, max_value
    except ValueError:
        # If the split fails, return None for both values
        return None, None

# Apply the parsing function to create min and max columns in crop_data
crop_data[['Temperature_min', 'Temperature_max']] = crop_data['temperature (C)'].apply(
    lambda x: pd.Series(parse_temperature_range(x))
)

In [408]:
# Function to parse the humidity range from strings like '60-70' to numerical min and max
def parse_moisture_range(moisture_range):
    # Convert to string if it's not already, and handle cases where it might be NaN or a float
    if isinstance(moisture_range, float) or pd.isna(moisture_range):
        return None, None
    try:
        min_value, max_value = map(float, str(moisture_range).split('-'))
        return min_value, max_value
    except ValueError:
        # If the split fails, return None for both values
        return None, None

# Apply the parsing function to create min and max columns in crop_data
crop_data[['Moisture_min', 'Moisture_max']] = crop_data['Optimum soil moisture'].apply(
    lambda x: pd.Series(parse_moisture_range(x))
)

In [409]:
# Function to convert ranges into mean values
def convert_to_mean(value):
    if isinstance(value, float) or pd.isna(value):
        # If the value is already a float (or NaN), return it as is
        return value
    elif '-' in value:
        # If the value contains a range like "500-700", compute the mean
        low, high = value.split('-')
        return (float(low) + float(high)) / 2
    elif '>' in value:
        # If the value contains ">", return the minimum possible value plus a small buffer (e.g., 50)
        return float(value.replace('>', '').strip()) + 50
    else:
        # For single values, return as float
        return float(value.strip())

# Apply conversion to 'Annual rainfall (mm)' column
crop_data['Annual rainfall (mm)'] = crop_data['Annual rainfall (mm)'].apply(convert_to_mean)

# Apply conversion to 'Irrigation required(%)' column
crop_data['Irrigation required(%)'] = crop_data['Irrigation required(%)'].apply(convert_to_mean)

# Apply conversion to 'Crop water need (mm/total growing period)' column
crop_data['Crop water need (mm/total growing period)'] = crop_data['Crop water need (mm/total growing period)'].apply(convert_to_mean)

# Apply conversion to 'Growing period (days)' column
crop_data['Growing period (days)'] = crop_data['Growing period (days)'].apply(convert_to_mean)

In [410]:
# Mapping similar soil types to common categories
soil_mapping = {
    r'.*well[-\s]*drain.*': 'Well-drained',
    r'.*sandy.*loam.*': 'Sandy Loam',
    r'.*loam.*clay.*': 'Clay Loam',
    r'.*loamy.*': 'Loamy',
    r'.*sandy.*': 'Sandy',
    r'.*volcanic.*': 'Volcanic',
    r'.*alluvial.*': 'Alluvial'
}

# Clean the 'Soil type' column using regex mapping
crop_data['Soil type'] = crop_data['Soil type'].str.lower()
for pattern, replacement in soil_mapping.items():
    crop_data['Soil type'] = crop_data['Soil type'].str.replace(pattern, replacement, regex=True)

In [411]:
# Function to split rows with multiple start/end months into separate rows
def expand_crop_calendar(crop_data):
    rows = []
    for _, row in crop_data.iterrows():
        # Convert to string if it's not already, and replace NaN with an empty string
        start_months = str(row['Crop calendar start (month)']).replace(' ', '').split(',')
        end_months = str(row['Crop calendar end (month)']).replace(' ', '').split(',')

        # Ensure we match start and end seasons properly
        for start, end in zip(start_months, end_months):
            new_row = row.copy()
            new_row['Crop calendar start (month)'] = start.strip()
            new_row['Crop calendar end (month)'] = end.strip()
            rows.append(new_row)
    
    return pd.DataFrame(rows)

# Apply the function to expand the dataset
crop_data = expand_crop_calendar(crop_data)

In [412]:
crop_columns_to_remove = [
    'Id','Crop suitability','Optimum soil moisture','temperature (C)','Humidity(%)'
]

In [413]:
crop_data = crop_data.drop(columns=crop_columns_to_remove, errors='ignore')

In [414]:
crop_data['pH'].fillna(7, inplace=True)
crop_data['Irrigation required(%)'].fillna(0, inplace=True)

# Sample function to convert range strings into their averages
def convert_range_to_average(value):
    if isinstance(value, str) and '-' in value:
        # Split the range and convert to float
        low, high = value.split('-')
        return (float(low) + float(high)) / 2
    else:
        # Return the value as is (converting to float if it's numeric)
        try:
            return float(value)
        except ValueError:
            return np.nan

# Apply this function to the N, P, and K columns
crop_data['N'] = crop_data['N'].apply(convert_range_to_average)
crop_data['P'] = crop_data['P'].apply(convert_range_to_average)
crop_data['K'] = crop_data['K'].apply(convert_range_to_average)


crop_data["N"].fillna(crop_data["N"].min(), inplace=True)
crop_data["P"].fillna(crop_data["P"].min(), inplace=True)
crop_data["K"].fillna(crop_data["K"].min(), inplace=True)

# Fill NaN values for min columns with their respective minimum values
crop_data["Humidity_min"].fillna(crop_data["Humidity_min"].min(), inplace=True)
crop_data["Temperature_min"].fillna(crop_data["Temperature_min"].min(), inplace=True)
crop_data["Moisture_min"].fillna(crop_data["Moisture_min"].min(), inplace=True)

# Fill NaN values for max columns with their respective maximum values
crop_data["Humidity_max"].fillna(crop_data["Humidity_max"].max(), inplace=True)
crop_data["Temperature_max"].fillna(crop_data["Temperature_max"].max(), inplace=True)
crop_data["Moisture_max"].fillna(crop_data["Moisture_max"].max(), inplace=True)

# Fill NaN values for Altitude and Annual Rainfall with their mean values
crop_data["Altitude (masl)"].fillna(crop_data["Altitude (masl)"].mean(), inplace=True)
crop_data["Annual rainfall (mm)"].fillna(crop_data["Annual rainfall (mm)"].mean(), inplace=True)

crop_data["Crop water need (mm/total growing period)"].fillna(0, inplace=True)
crop_data["Growing period (days)"].fillna(0, inplace=True)

crop_data["Soil type"].fillna('Loam', inplace=True)
crop_data["Crop type"].fillna('Unknown', inplace=True)
crop_data["Crop"].fillna('Unknown', inplace=True)   

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  crop_data['pH'].fillna(7, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  crop_data['Irrigation required(%)'].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting 

Cleaning test_data
----

In [415]:
total_sectors_per_district = test_data['District'].value_counts().reset_index()

In [416]:
# Apply cleaning function to split potassium range into min and max
test_data[['potassium_min', 'potassium_max']] = test_data['Potassium(ppm)'].apply(clean_potassium_range).apply(pd.Series)

# Ensure the min and max columns are float type
test_data['potassium_min'] = test_data['potassium_min'].astype(float)
test_data['potassium_max'] = test_data['potassium_max'].astype(float)


# Apply cleaning function to split pH range into min and max
test_data[['phosphorus_min', 'phosphorus_max']] = test_data['Phosphorous(ppm)'].apply(clean_phosphorus_range).apply(pd.Series)

# Ensure the min and max columns are float type
test_data['phosphorus_min'] = test_data['phosphorus_min'].astype(float)
test_data['phosphorus_max'] = test_data['phosphorus_max'].astype(float)


# Apply cleaning function to split Phosphorous range into min and max
test_data[['pH_min', 'pH_max']] = test_data['pH'].apply(clean_pH_range).apply(pd.Series)

# Ensure the min and max columns are float type
test_data['pH_min'] = test_data['pH_min'].astype(float)
test_data['pH_max'] = test_data['pH_max'].astype(float)

In [417]:
columns_to_remove = [
    'Id','Crop suitability', 'Acid Saturation(%)', 'AcidSat', 'Boron (ppm)', 'Calcium(%)', 
    'Calcium(ppm)', 'Copper (ppm)', 'Magnessium(%)', 'Magnessium(ppm)', 
    'Manganese(ppm)', 'Manganese', 'Organic Matter(%)', 'Phosphorous(ppm)', 
    'Potassium (%)', 'Potassium(ppm)', 'Sulphur (ppm)', 'Zinc (ppm)', 'temperature (C)', 'pH', 'Optimum soil moisture', 'Humidity(%)'
]

test_data = test_data.drop(columns=columns_to_remove, errors='ignore')

Cleaning region data
-----

In [418]:
# Apply cleaning function to split Phosphorous range into min and max
region_data[['pH_min', 'pH_max']] = region_data['PH'].apply(clean_pH_range).apply(pd.Series)

# Ensure the min and max columns are float type
region_data['pH_min'] = region_data['pH_min'].astype(float)
region_data['pH_max'] = region_data['pH_max'].astype(float)

region_data.drop(columns=['PH'], inplace=True)

In [419]:
# Generalized function to clean any suitability column
def clean_suitability(value):
    # Standardize the suitability level
    if "High" in value:
        suitability = "High"
    elif "Moderate" in value:
        suitability = "Moderate"
    elif "Marginal" in value:
        suitability = "Marginal"
    elif "Unsuitable" in value:
        suitability = "Unsuitable"
    else:
        suitability = value

    # Extract limitations
    limitation = ""
    if "Limitation:" in value:
        parts = value.split("Limitation:")
        limitation = parts[1].strip() if len(parts) > 1 else ""
    
    return pd.Series([suitability, limitation])

In [420]:
# Column list
suitability_columns = [
    'Banana Suitability', 'Beans Suitability', 'Cassava Suitability', 
    'Maize Suitability', 'Groundnut Suitability', 'Potatoes Suitability', 
    'Sorghum Suitability', 'Peas Suitability', 'Soyabeans Suitability', 
    'Tea Suitability'
]

In [421]:
# Loop through each column to apply the cleaning and create new columns
for column in suitability_columns:
    # Generate standardized column names without spaces and avoid trailing underscores
    base_name = column.replace(" ", "_").replace("Suitability", "").rstrip("_")  # Remove trailing underscore if exists
    suitability_col = f"{base_name}_Suitability"
    limitation_col = f"{base_name}_Limitation"
    
    # Apply cleaning function and create new columns
    region_data[[suitability_col, limitation_col]] = region_data[column].apply(clean_suitability)
    
    # Optionally, drop the original column
    region_data.drop(columns=[column], inplace=True)

In [422]:
# List of temperature and precipitation columns by month
temperature_columns = [
    'Average Temperature (°C) - Jan', 'Average Temperature (°C) - Feb', 
    'Average Temperature (°C) - Mar', 'Average Temperature (°C) - Apr',
    'Average Temperature (°C) - May', 'Average Temperature (°C) - Jun', 
    'Average Temperature (°C) - Jul', 'Average Temperature (°C) - Aug', 
    'Average Temperature (°C) - Sep', 'Average Temperature (°C) - Oct', 
    'Average Temperature (°C) - Nov', 'Average Temperature (°C) - Dec'
]

precipitation_columns = [
    'Average Precipitation (mm) - Jan', 'Average Precipitation (mm) - Feb', 
    'Average Precipitation (mm) - Mar', 'Average Precipitation (mm) - Apr', 
    'Average Precipitation (mm) - May', 'Average Precipitation (mm) - Jun', 
    'Average Precipitation (mm) - Jul', 'Average Precipitation (mm) - Aug', 
    'Average Precipitation (mm) - Sep', 'Average Precipitation (mm) - Oct', 
    'Average Precipitation (mm) - Nov', 'Average Precipitation (mm) - Dec'
]

# Option 1: Fill NaNs with the mean for each column
region_data[temperature_columns] = region_data[temperature_columns].fillna(region_data[temperature_columns].mean())
region_data[precipitation_columns] = region_data[precipitation_columns].fillna(region_data[precipitation_columns].mean())

# Option 2: Fill NaNs with a fixed value (e.g., -1)
region_data[temperature_columns] = region_data[temperature_columns].fillna(-1)
region_data[precipitation_columns] = region_data[precipitation_columns].fillna(-1)

# Option 3: Interpolate NaNs in temperature and precipitation columns
region_data[temperature_columns] = region_data[temperature_columns].interpolate(method='linear', limit_direction='forward', axis=0)
region_data[precipitation_columns] = region_data[precipitation_columns].interpolate(method='linear', limit_direction='forward', axis=0)

In [423]:
# List of pH columns
ph_columns = ['pH_min', 'pH_max']

# Option 1: Fill NaNs with the mean for each pH column
region_data[ph_columns] = region_data[ph_columns].fillna(region_data[ph_columns].mean())

# Option 2: Fill NaNs with a specific value (e.g., -1)
region_data[ph_columns] = region_data[ph_columns].fillna(-1)

# Option 3: Interpolate NaNs in pH columns
region_data[ph_columns] = region_data[ph_columns].interpolate(method='linear', limit_direction='forward', axis=0)

In [424]:
data = region_data

In [425]:
# Extract the target columns containing suitability information
target_columns = data[['Potatoes_Suitability', 'Sorghum_Suitability', 'Peas_Suitability', 'Soyabeans_Suitability', 'Tea_Suitability']]

# Reshape the data into long format and remove the '_Suitability' suffix
target_columns = target_columns.melt(var_name='Crop', value_name='Suitability')

# Remove '_Suitability' from crop names to get clean crop names
target_columns['Crop'] = target_columns['Crop'].str.replace('_Suitability', '', regex=False)

# Repeat the data to match the reshaped target
data_repeated = data.loc[data.index.repeat(len(target_columns['Crop'].unique()))].reset_index(drop=True)

# Reset the index for target_columns to ensure they align
target_columns_reset = target_columns.reset_index(drop=True)

# Combine the repeated data with the reshaped target columns to form the final dataset
cleaned_data = pd.concat([data_repeated, target_columns_reset['Crop'], target_columns_reset['Suitability']], axis=1)

In [426]:
# Step 1: Calculate the averages from the 12 months in cleaned_data
cleaned_data['Average Temperature (°C)'] = cleaned_data[
    [f'Average Temperature (°C) - {month}' for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                                                         'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]
].mean(axis=1)

cleaned_data['Average Precipitation (mm)'] = cleaned_data[
    [f'Average Precipitation (mm) - {month}' for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
                                                           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]
].mean(axis=1)

# Step 2: Extract the features and target for training
X_train = cleaned_data[['pH_min', 'pH_max', 'Average Temperature (°C)', 'Average Precipitation (mm)']]
y_train = cleaned_data['Crop']

In [427]:
# Step 3: Standardize the training data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# Step 4: Train the models using the entire training data
random_forest = RandomForestClassifier(random_state=42)
random_forest.fit(X_train_scaled, y_train)

decision_tree = DecisionTreeClassifier(random_state=42)
decision_tree.fit(X_train_scaled, y_train)

# Step 5: Calculate the average temperature and precipitation for test data
if 'Temperature' in test_data.columns and 'Rainfall' in test_data.columns:
    test_data['Average Temperature (°C)'] = test_data['Temperature']
    test_data['Average Precipitation (mm)'] = test_data['Rainfall']

# Step 6: Prepare the test data with matching features
X_test = test_data[['pH_min', 'pH_max', 'Average Temperature (°C)', 'Average Precipitation (mm)']]

# Step 7: Standardize the test data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Step 8: Make predictions using the trained models
rf_predictions = random_forest.predict(X_test_scaled)
dt_predictions = decision_tree.predict(X_test_scaled)

In [444]:
# Step 9: Retrieve actual crops from region_data based on location matching
test_data_with_crop = test_data.merge(cleaned_data[['Latitude', 'Longitude', 'Crop']], 
                                      on=['Latitude', 'Longitude'], how='left')

# Make sure that 'Crop' is not NaN, or handle missing values as needed (e.g., drop rows or fill with a default value)
test_data_with_crop = test_data_with_crop.dropna(subset=['Crop'])

# Now, y_test will have the correct length
y_test = test_data_with_crop['Crop']

# Ensure X_test matches the number of samples in y_test
X_test = test_data_with_crop[['pH_min', 'pH_max', 'Average Temperature (°C)', 'Average Precipitation (mm)']]

# Check the shapes to confirm they match
print(f"Shape of X_test: {X_test.shape}")
print(f"Shape of y_test: {y_test.shape}")

# Standardize the test data using the same scaler
X_test_scaled = scaler.transform(X_test)

# Step 10: Make predictions using the trained models
rf_predictions = random_forest.predict(X_test_scaled)
dt_predictions = decision_tree.predict(X_test_scaled)

# Step 11: Evaluate the models using the actual crop data from region_data
rf_accuracy = accuracy_score(y_test, rf_predictions)
dt_accuracy = accuracy_score(y_test, dt_predictions)

print("Random Forest Accuracy on test data:", rf_accuracy)
print("Decision Tree Accuracy on test data:", dt_accuracy)

print("\nRandom Forest Classification Report on test data:")
print(classification_report(y_test, rf_predictions))

print("\nDecision Tree Classification Report on test data:")
print(classification_report(y_test, dt_predictions))


Shape of X_test: (2080, 4)
Shape of y_test: (2080,)
Random Forest Accuracy on test data: 0.2048076923076923
Decision Tree Accuracy on test data: 0.2

Random Forest Classification Report on test data:
              precision    recall  f1-score   support

        Peas       0.00      0.00      0.00       416
    Potatoes       0.00      0.00      0.00       416
     Sorghum       0.00      0.00      0.00       416
   Soyabeans       0.10      0.02      0.04       416
         Tea       0.21      1.00      0.35       416

    accuracy                           0.20      2080
   macro avg       0.06      0.20      0.08      2080
weighted avg       0.06      0.20      0.08      2080


Decision Tree Classification Report on test data:
              precision    recall  f1-score   support

        Peas       0.00      0.00      0.00       416
    Potatoes       0.00      0.00      0.00       416
     Sorghum       0.00      0.00      0.00       416
   Soyabeans       0.00      0.00      0.00

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [430]:
# # Function to process user input and predict the crop
# def predict_crop(district, sector, month, cleaned_data, model='random_forest'):
#     # Retrieve the necessary rows based on District and Sector
#     region_data = cleaned_data[(cleaned_data['District'] == district) & (cleaned_data['Sector'] == sector)]
    
#     if region_data.empty:
#         raise ValueError(f"No data available for the given District: {district} and Sector: {sector}")
    
#     # Select the relevant features
#     features = ['Longitude', 'Latitude', 'Elevation', 'pH_min', 'pH_max'] + \
#                [f'Average Temperature (°C) - {m}' for m in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
#                                                             'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']] + \
#                [f'Average Precipitation (mm) - {m}' for m in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
#                                                                 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]
    
#     # Define the month names list
#     month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    
#     # Filter the columns based on selected month
#     month_index = month_names.index(month)
    
#     # Extract the relevant temperature and precipitation columns for the month
#     temp_col = f'Average Temperature (°C) - {month_names[month_index]}'
#     precip_col = f'Average Precipitation (mm) - {month_names[month_index]}'

#     # Filter the necessary values for temperature and precipitation of the given month
#     user_input_data = region_data[features + [temp_col, precip_col, 'Crop']]

#     # Make sure that the data for the given month exists
#     if user_input_data.empty:
#         raise ValueError(f"No data available for {month} in the selected region")
    
#     # Prepare the input for prediction (drop 'Crop' for prediction purposes)
#     X_user_input = user_input_data.drop(columns='Crop')  # Drop the target variable
#     X_scaled = scaler.transform(X_user_input)  # Standardize based on the previous scaling

#     # Predict with the selected model
#     if model == 'random_forest':
#         prediction = random_forest.predict(X_scaled)
#     elif model == 'decision_tree':
#         prediction = decision_tree.predict(X_scaled)
#     else:
#         raise ValueError("Invalid model selected. Choose either 'random_forest' or 'decision_tree'.")

#     return prediction[0]  # Return the predicted crop

# # Example usage:
# district_input = 'Karongi'
# sector_input = 'Bwishyura'
# month_input = 'Jan'

# # Predict the crop based on the user input
# predicted_crop = predict_crop(district_input, sector_input, month_input, cleaned_data, model='random_forest')
# print(f"Predicted Crop: {predicted_crop}")

In [431]:





# label_encoders = {}
# for column in ['District', 'Sector']:
#     le = LabelEncoder()
#     X[column] = le.fit_transform(X[column])
#     label_encoders[column] = le

# # Standardize features (especially the temperature and precipitation values)
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [432]:
# # Preprocess the dataset
# data = data.dropna()  # Remove rows with missing values

# # Encode categorical variables
# label_encoders = {}
# for column in ['District', 'Sector']:  # Add any other categorical columns as needed
#     le = LabelEncoder()
#     data[column] = le.fit_transform(data[column])
#     label_encoders[column] = le

# # Calculate mean temperature and precipitation for each month
# data['Avg_Temperature'] = data[[f'Average Temperature (°C) - {month}' for month in 
#                                 ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]].mean(axis=1)
# data['Avg_Precipitation'] = data[[f'Average Precipitation (mm) - {month}' for month in 
#                                   ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]].mean(axis=1)

# # Define input variables (X)
# X = data[['Elevation', 'pH_min', 'pH_max', 'Avg_Temperature', 'Avg_Precipitation']]

# # Reshape the target variable (y) into a long format with 'Crop' and 'Suitability'
# y = data[['Potatoes_Suitability', 'Sorghum_Suitability', 'Peas_Suitability', 'Soyabeans_Suitability', 'Tea_Suitability']]
# y = y.melt(var_name='Crop', value_name='Suitability') 

In [433]:
# Standardize features for better model performance
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# Split the dataset into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# # Initialize models
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# dt_model = DecisionTreeClassifier(random_state=42)

# # Train models
# rf_model.fit(X_train, y_train['Suitability'])
# dt_model.fit(X_train, y_train['Suitability'])

# # Predict using the models
# rf_predictions = rf_model.predict(X_test)
# dt_predictions = dt_model.predict(X_test)

# # Evaluate the models
# # Calculate accuracy
# rf_accuracy = accuracy_score(y_test['Suitability'], rf_predictions)
# dt_accuracy = accuracy_score(y_test['Suitability'], dt_predictions)

# # Detailed classification report for each model
# print("Random Forest Classification Report:")
# print(classification_report(y_test['Suitability'], rf_predictions))

# print("\nDecision Tree Classification Report:")
# print(classification_report(y_test['Suitability'], dt_predictions))

# # Print accuracies
# print(f'\nRandom Forest Accuracy: {rf_accuracy:.2f}')
# print(f'Decision Tree Accuracy: {dt_accuracy:.2f}')

In [434]:
# # Preprocess the dataset
# data = data.dropna()  # Remove rows with missing values

# # Encode categorical variables
# label_encoders = {}
# for column in ['District', 'Sector']:  # Add any other categorical columns as needed
#     le = LabelEncoder()
#     data[column] = le.fit_transform(data[column])
#     label_encoders[column] = le

# # Define input and target variables
# # # Add temperature and precipitation columns for each month to X
# # X = data[['Elevation', 'pH_min', 'pH_max'] +
# #          [f'Average Temperature (°C) - {month}' for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
# #                                                               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']] +
# #          [f'Average Precipitation (mm) - {month}' for month in ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 
# #                                                                 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]]
# # y = data[['Potatoes_Suitability', 'Sorghum_Suitability', 'Peas_Suitability', 'Soyabeans_Suitability', 'Tea_Suitability']]
# # Calculate mean temperature and precipitation for each month
# data['Avg_Temperature'] = data[[f'Average Temperature (°C) - {month}' for month in 
#                                 ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]].mean(axis=1)
# data['Avg_Precipitation'] = data[[f'Average Precipitation (mm) - {month}' for month in 
#                                   ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']]].mean(axis=1)

# # Define input and target variables
# X = data[['Elevation', 'pH_min', 'pH_max', 'Avg_Temperature', 'Avg_Precipitation']]
# y = data[['Potatoes_Suitability', 'Sorghum_Suitability', 'Peas_Suitability', 'Soyabeans_Suitability', 'Tea_Suitability']]

# # Standardize features for better model performance
# scaler = StandardScaler()
# X_scaled = scaler.fit_transform(X)

# # Split the dataset
# X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# # Initialize models
# rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
# dt_model = DecisionTreeClassifier(random_state=42)

# # Train models
# rf_model.fit(X_train, y_train)
# dt_model.fit(X_train, y_train)

# # Predict using the models
# rf_predictions = rf_model.predict(X_test)
# dt_predictions = dt_model.predict(X_test)

# # Evaluate the models
# # Calculate accuracy for each target separately
# rf_accuracies = [accuracy_score(y_test.iloc[:, i], rf_predictions[:, i]) for i in range(y_test.shape[1])]
# dt_accuracies = [accuracy_score(y_test.iloc[:, i], dt_predictions[:, i]) for i in range(y_test.shape[1])]

# # Average accuracy across all targets
# rf_average_accuracy = sum(rf_accuracies) / len(rf_accuracies)
# dt_average_accuracy = sum(dt_accuracies) / len(dt_accuracies)

# # Detailed classification report for each model and target
# print("Random Forest Classification Report:")
# for i, target in enumerate(y.columns):
#     print(f"\nTarget: {target}")
#     print(classification_report(y_test.iloc[:, i], rf_predictions[:, i]))

# print("\nDecision Tree Classification Report:")
# for i, target in enumerate(y.columns):
#     print(f"\nTarget: {target}")
#     print(classification_report(y_test.iloc[:, i], dt_predictions[:, i]))

# # Print average accuracies
# print(f'\nRandom Forest Average Accuracy: {rf_average_accuracy:.2f}')
# print(f'Decision Tree Average Accuracy: {dt_average_accuracy:.2f}')

In [435]:
# # Encode district and sector
# district_encoded = label_encoders['District'].transform(["Karongi"])[0]
# sector_encoded = label_encoders['Sector'].transform(["Bwishyura"])[0]
    
#     # Set default values for other features (average values from dataset)
# elevation_avg = X['Elevation'].mean()
# ph_min_avg = X['pH_min'].mean()
# ph_max_avg = X['pH_max'].mean()
# avg_temp = X['Avg_Temperature'].mean()
# avg_precip = X['Avg_Precipitation'].mean()
    
#     # Combine all values into a single input vector with consistent column names
# input_data = pd.DataFrame([[elevation_avg, ph_min_avg, ph_max_avg, avg_temp, avg_precip]], 
#                               columns=['Elevation', 'pH_min', 'pH_max', 'Avg_Temperature', 'Avg_Precipitation'])
    
#     # Scale the input data using StandardScaler
# input_data = scaler.transform(input_data)
    
#     # Predict suitability for each crop using Random Forest model
# rf_prediction = rf_model.predict(input_data)
# dt_prediction = dt_model.predict(input_data)

In [436]:
# # Prediction function for new user inputs
# def predict_crop_suitability(district, sector):
#     # Encode district and sector
#     district_encoded = label_encoders['District'].transform([district])[0]
#     sector_encoded = label_encoders['Sector'].transform([sector])[0]
    
#     # Set default values for other features (average values from dataset)
#     elevation_avg = X['Elevation'].mean()
#     ph_min_avg = X['pH_min'].mean()
#     ph_max_avg = X['pH_max'].mean()
#     avg_temp = X['Avg_Temperature'].mean()
#     avg_precip = X['Avg_Precipitation'].mean()
    
#     # Combine all values into a single input vector with consistent column names
#     input_data = pd.DataFrame([[elevation_avg, ph_min_avg, ph_max_avg, avg_temp, avg_precip]], 
#                               columns=['Elevation', 'pH_min', 'pH_max', 'Avg_Temperature', 'Avg_Precipitation'])
    
#     # Scale the input data using StandardScaler
#     input_data = scaler.transform(input_data)
    
#     # Predict suitability for each crop using Random Forest model
#     rf_prediction = rf_model.predict(input_data)
#     dt_prediction = dt_model.predict(input_data)
    
#     # Return predictions from both models
#     return {
#         rf_prediction
#     }

In [437]:
# # Example prediction
# district_input = "Kirehe"  # Replace with actual district
# sector_input = "Musaza"      # Replace with actual sector
# predictions = predict_crop_suitability(district_input, sector_input)
# print(predictions)

In [438]:
# unique_districts = region_data['District'].unique()
# print("Unique Districts in DataFrame:", unique_districts)


In [439]:
# label_encoders['District'] = LabelEncoder()
# label_encoders['Sector'] = LabelEncoder()
# label_encoders['District'].fit(region_data['District']) 
# label_encoders['Sector'].fit(region_data['Sector']) 


In [440]:
# def recommend_crops(district, sector):
#     # Normalize the input
#     district = district.strip()
#     sector = sector.strip()

#     # Check if the input district and sector are valid
#     if district not in label_encoders['District'].classes_:
#         raise ValueError(f"District '{district}' not recognized. Please check the input.")
    
#     if sector not in label_encoders['Sector'].classes_:
#         raise ValueError(f"Sector '{sector}' not recognized. Please check the input.")
    
#     # Encode user input
#     district_encoded = label_encoders['District'].transform([district])[0]
#     sector_encoded = label_encoders['Sector'].transform([sector])[0]
    
#     # Prepare input for prediction
#     user_input = pd.DataFrame({
#         'District': [district_encoded],
#         'Sector': [sector_encoded]
#     })

#     # Collect probabilities for each crop target
#     crop_names = ['Potatoes', 'Sorghum', 'Peas', 'Soyabeans', 'Tea']
#     rf_probabilities = []

#     for i in range(len(crop_names)):
#         crop_probs = rf_model.estimators_[i].predict_proba(user_input)[0]  # Get probabilities for each crop
#         rf_probabilities.append(crop_probs)

#     # Convert list of lists into a DataFrame
#     rf_probabilities = pd.DataFrame(rf_probabilities, index=crop_names)

#     # Ensure rf_probabilities has the expected structure
#     if rf_probabilities.shape[1] != 5:
#         raise ValueError("Unexpected structure of rf_probabilities. Verify model output.")

#     # Mean and top crop recommendations
#     top_crops = rf_probabilities.mean(axis=1).nlargest(5)
    
#     return top_crops

# # Example usage
# try:
#     recommended_crops = recommend_crops('Nyarugenge', 'Gitega')
#     print(recommended_crops)
# except ValueError as e:
#     print(e)