In [1]:
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

In [2]:
# Pandas object
org_data = pd.read_csv("Crop_Recommendation.csv")

In [3]:
# Visualizing the structure of data
org_data.head(10)

Unnamed: 0,N,P,K,temperature,humidity,ph,rainfall,label
0,90,42,43,20.879744,82.002744,6.502985,202.935536,rice
1,85,58,41,21.770462,80.319644,7.038096,226.655537,rice
2,60,55,44,23.004459,82.320763,7.840207,263.964248,rice
3,74,35,40,26.491096,80.158363,6.980401,242.864034,rice
4,78,42,42,20.130175,81.604873,7.628473,262.71734,rice
5,69,37,42,23.058049,83.370118,7.073454,251.055,rice
6,69,55,38,22.708838,82.639414,5.700806,271.32486,rice
7,94,53,40,20.277744,82.894086,5.718627,241.974195,rice
8,89,54,38,24.515881,83.535216,6.685346,230.446236,rice
9,68,58,38,23.223974,83.033227,6.336254,221.209196,rice


In [4]:
# Checking for 'null' values
org_data.isnull().sum()

N              0
P              0
K              0
temperature    0
humidity       0
ph             0
rainfall       0
label          0
dtype: int64

In [5]:
data = np.array(org_data)
m, n = data.shape       # (2200, 8)

# Data reproducibility
random.seed(42)
random.shuffle(data)

# Test Data --- 20% of total datapoints
data_test = data[0:m//5]         # shape: (440, 8)
X_test = data_test[:, 0:n-1]     # shape: (440, 7)
Y_test = data_test[:, -1]        # shape: (440,)

# Training Data --- 80% of total datapoints
data_train = data[m//5:]         # shape: (1760, 8)
X_train = data_train[:, 0:n-1]   # shape: (1760, 7)
Y_train = data_train[:, -1]      # shape: (1760,)

In [6]:
# Initializing the model
model = RandomForestClassifier()

# Fitting the model to the training data
model.fit(X_train, Y_train)

In [7]:
# Predicting the output of the trained model on unseen 'test' data
predictions = model.predict(X_test)

# Manual Accuracy Calculation
manual_accuracy = accuracy_score(Y_test, predictions)
print(f"Manual Accuracy: {manual_accuracy * 100}%")

# Auto Accuracy Calculation
accuracy = model.score(X_test, Y_test)
print(f"Auto Accuracy: {accuracy * 100}%")

Manual Accuracy: 100.0%
Auto Accuracy: 100.0%


In [None]:
# ------ WHY 100% ACCURACY? ------

# Debugging Part 1: Check if any test samples are in training
from sklearn.metrics import pairwise_distances_argmin_min
closest_train_indices, _ = pairwise_distances_argmin_min(X_test, X_train)
print("Number of test samples with identical training samples:", (closest_train_indices == 0).sum())


# Debugging Part 2: Try simpler models
from sklearn.linear_model import LogisticRegression     # Logistic Regression Model
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train, Y_train)
print(f"Logistic Regression Accuracy: {lr_model.score(X_test, Y_test) * 100}%")

from sklearn.tree import DecisionTreeClassifier         # Decision Tree Classifier Model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, Y_train)
print(f"DT Accuracy: {dt_model.score(X_test, Y_test) * 100}%")


# Debugging Part 3: Feature Importance Plot
importances = model.feature_importances_
features = org_data.columns[:-1]
plt.bar(features, importances)
plt.xticks(rotation=45)
plt.title("Feature Importance")
plt.show()


# ------ OUTPUT ------
'''
    Number of test samples with identical training samples: 1
    Logistic Regression Accuracy: 100.0%
    DT Accuracy: 100.0%
    Multiple features contribute significantly.
'''


# ------ CONCLUSION ------
'''
    Since all models (Random Forest, Decision Tree, and Logistic Regression) achieve 100% accuracy,
    and the feature importance plot shows that multiple features contribute significantly (no single dominant feature),
    the conclusion is clear:

    "This Dataset is Perfectly Separable"
    1. Every unique combination of N, P, K, temperature, humidity, pH, rainfall maps to exactly one crop label.
    2. No ambiguity exists → Models learn a deterministic mapping (like a lookup table).
'''


# ------ WHY DOES THIS HAPPEN? ------
'''
    1. Dataset is Synthetic or Overly Simplified:
       a. Likely generated using hard-coded rules (e.g., "If rainfall > 100 and pH < 6 → Rice").
       b. Real-world crop data would have noise (e.g., same conditions could grow multiple crops due to unmeasured factors like soil type).

    2. No Overfitting or Data Leakage:
       Confirmed by:
       a. Only 1 duplicate between train/test.
       b. Logistic Regression (a simpler model) also gets 100%.
       c. Feature importance is distributed (no single feature dominates).
'''


# ------ WHAT CAN WE DO? ------
'''
    1. Try a messier dataset to see more realistic results (~90-95% accuracy).

    2. Add Noise to simulate measurement errors:
       If accuracy drops to ~90-95%, the original data was too perfect.

    3. Collect More Features (e.g., soil type, elevation) to introduce realistic variability.
'''

In [None]:
# # Noise Addition to check impact on Accuracy
# X_train_noisy = X_train + np.random.normal(0, 10.0, X_train.shape)  # Small noise (Try 0.1 -> 1 -> 10 -> 100): Accuracy will drop as (100.0 -> 100.0 -> 99.77 -> 19 - 70)
# model.fit(X_train_noisy, Y_train)
# print(f"Noisy Data Accuracy: {model.score(X_test, Y_test) * 100}%")

In [16]:
multiple_samples = [
    [36, 58, 25, 28.66024, 59.31891, 8.399136, 36.9263],
    [40, 60, 30, 30.00, 50.00, 7.00, 240.00]]

predicted_crop = model.predict(np.array(multiple_samples))
print(f"Predicted Crop Array: {predicted_crop}")

Predicted Crop Array: ['mothbeans' 'pigeonpeas']
