In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [7]:
# Upload dataset manually or use this path if uploaded to Colab
from google.colab import files

uploaded = files.upload()  # Manually upload dataset file
file_path = list(uploaded.keys())[0]  # Get the uploaded file name

# Load dataset into a Pandas DataFrame
df = pd.read_csv(file_path)

# Display first few rows
df.head()


Saving Crop and fertilizer dataset.csv to Crop and fertilizer dataset (1).csv


Unnamed: 0,District_Name,Soil_color,Nitrogen,Phosphorus,Potassium,pH,Rainfall,Temperature,Crop,Fertilizer
0,Kolhapur,Black,75,50,100,6.5,1000,20,Sugarcane,Urea
1,Kolhapur,Black,80,50,100,6.5,1000,20,Sugarcane,Urea
2,Kolhapur,Black,85,50,100,6.5,1000,20,Sugarcane,Urea
3,Kolhapur,Black,90,50,100,6.5,1000,20,Sugarcane,Urea
4,Kolhapur,Black,95,50,100,6.5,1000,20,Sugarcane,Urea


In [8]:
# Check dataset info
print("\n📊 Dataset Info:")
df.info()

# Check for missing values
print("\n🔢 Checking Missing Values:")
print(df.isnull().sum())

# Check unique values in categorical columns
print("\n📝 Unique values in categorical columns:")
for col in ['District_Name', 'Soil_color', 'Crop', 'Fertilizer']:
    print(f"{col}: {df[col].unique()}")



📊 Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4513 entries, 0 to 4512
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   District_Name  4513 non-null   object 
 1   Soil_color     4513 non-null   object 
 2   Nitrogen       4513 non-null   int64  
 3   Phosphorus     4513 non-null   int64  
 4   Potassium      4513 non-null   int64  
 5   pH             4513 non-null   float64
 6   Rainfall       4513 non-null   int64  
 7   Temperature    4513 non-null   int64  
 8   Crop           4513 non-null   object 
 9   Fertilizer     4513 non-null   object 
dtypes: float64(1), int64(5), object(4)
memory usage: 352.7+ KB

🔢 Checking Missing Values:
District_Name    0
Soil_color       0
Nitrogen         0
Phosphorus       0
Potassium        0
pH               0
Rainfall         0
Temperature      0
Crop             0
Fertilizer       0
dtype: int64

📝 Unique values in categorical columns:
District_Nam

In [10]:
# Encoding categorical features
label_encoders = {}
categorical_columns = ['District_Name', 'Soil_color', 'Crop', 'Fertilizer']

for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Save encoder for later use
    df.head()  # Check after encoding


In [11]:
# Define features (X) and target variable (y)
X = df.drop(columns=['Fertilizer'])  # Features
y = df['Fertilizer']  # Target variable

print("\n✅ Features and Target Split Done!")
print(f"Feature Shape: {X.shape}, Target Shape: {y.shape}")



✅ Features and Target Split Done!
Feature Shape: (4513, 9), Target Shape: (4513,)


In [12]:
# Handling class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

print("\n✅ SMOTE Applied! New Data Shape:")
print(f"Feature Shape: {X_resampled.shape}, Target Shape: {y_resampled.shape}")



✅ SMOTE Applied! New Data Shape:
Feature Shape: (25916, 9), Target Shape: (25916,)


In [13]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

print("\n✅ Train-Test Split Done!")
print(f"Training Data: {X_train.shape}, Testing Data: {X_test.shape}")



✅ Train-Test Split Done!
Training Data: (20732, 9), Testing Data: (5184, 9)


In [14]:
# Standardizing numerical features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

print("\n✅ Data Normalized (Standard Scaler Applied)!")



✅ Data Normalized (Standard Scaler Applied)!


In [15]:
# Train Decision Tree Model
dt_model = DecisionTreeClassifier(max_depth=10, min_samples_split=5, random_state=42)
dt_model.fit(X_train_scaled, y_train)

print("\n✅ Decision Tree Model Trained!")



✅ Decision Tree Model Trained!


In [16]:
# Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
rf_model.fit(X_train_scaled, y_train)

print("\n✅ Random Forest Model Trained!")



✅ Random Forest Model Trained!


In [17]:
# Predict using both models
y_pred_dt = dt_model.predict(X_test_scaled)
y_pred_rf = rf_model.predict(X_test_scaled)

# Calculate accuracy
dt_accuracy = accuracy_score(y_test, y_pred_dt)
rf_accuracy = accuracy_score(y_test, y_pred_rf)

print("\n📊 Model Accuracy:")
print(f"✅ Decision Tree Accuracy: {dt_accuracy:.4f}")
print(f"✅ Random Forest Accuracy: {rf_accuracy:.4f}")

# Display confusion matrices
print("\n📌 Confusion Matrix (Decision Tree):")
print(confusion_matrix(y_test, y_pred_dt))

print("\n📌 Confusion Matrix (Random Forest):")
print(confusion_matrix(y_test, y_pred_rf))

# Classification Reports
print("\n📌 Classification Report (Decision Tree):")
print(classification_report(y_test, y_pred_dt))

print("\n📌 Classification Report (Random Forest):")
print(classification_report(y_test, y_pred_rf))



📊 Model Accuracy:
✅ Decision Tree Accuracy: 0.8492
✅ Random Forest Accuracy: 0.8818

📌 Confusion Matrix (Decision Tree):
[[239   0   0   0   0   0   0   0   0   0   0  16   5   0   0   0   0   0
    0]
 [  0 283   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0 274   0   0   0   0   0   0   0   2   0   0   0   8   0   0   0
    0]
 [  0   0   0 282   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0 250   0   0   0   0   0   0   0   0   0   0   0  31   0
    0]
 [  0   0   0   0   0 257   0   0   0   0   0   0   0   0   0   0   0  14
    0]
 [  0   0   0   0   0   0 287   0   0   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0 247   0   0   0   0   0   5   0   7   0   0
    0]
 [  0   0   0   0   0   0   0   0 274   0   0   0   0   0   0   0   0   0
    0]
 [  0   0   0   0   0   0   0   0   0 260   0   0   0   0   0   3   0   0
    0]
 [  0   0  13  20   0   0   0  13   0   0  80   0   0  90  15  14   

In [20]:
def predict_fertilizer(district, soil_color, nitrogen, phosphorus, potassium, pH, rainfall, temperature, crop):
      try:
        district_enc = label_encoders['District_Name'].transform([district])[0]
        soil_color_enc = label_encoders['Soil_color'].transform([soil_color])[0]
        crop_enc = label_encoders['Crop'].transform([crop])[0]

        input_data = np.array([[district_enc, soil_color_enc, nitrogen, phosphorus, potassium, pH, rainfall, temperature, crop_enc]])
        input_data = scaler.transform(input_data)

        prediction = rf_model.predict(input_data)
        predicted_fertilizer = label_encoders['Fertilizer'].inverse_transform(prediction)[0]
        return predicted_fertilizer
      except Exception as e:
        return f"Error: {e}"
        print("\n✅ Prediction Function Ready!")


In [21]:
predicted_fertilizer = predict_fertilizer(district="Kolhapur", soil_color="Black", nitrogen=75, phosphorus=50, potassium=100, pH=6.5, rainfall=1000, temperature=20, crop="Sugarcane")
print("\n🔮 Predicted Fertilizer:", predicted_fertilizer)



🔮 Predicted Fertilizer: Urea




In [22]:
from sklearn.metrics import accuracy_score

# Predict on test data
dt_predictions = dt_model.predict(X_test)
rf_predictions = rf_model.predict(X_test)

# Calculate accuracy
dt_accuracy = accuracy_score(y_test, dt_predictions)
rf_accuracy = accuracy_score(y_test, rf_predictions)

print(f"📊 Decision Tree Accuracy: {dt_accuracy * 100:.2f}%")
print(f"🌲 Random Forest Accuracy: {rf_accuracy * 100:.2f}%")


📊 Decision Tree Accuracy: 5.40%
🌲 Random Forest Accuracy: 6.81%




**1.Decision Tree Accuracy: 5.40%**

**2.Random Forest Accuracy: 6.81%**


**Random Forest is preferred because it gives more accurate results and reduces overfitting.**

### **Why Random Forest Over Decision Tree?**  

1️⃣ **Reduces Overfitting** – Decision Trees overfit; Random Forest averages multiple trees for better generalization.  

2️⃣ **Higher Accuracy** – Majority voting in Random Forest improves prediction accuracy.  

3️⃣ **Handles Noise Better** – Less sensitive to small data changes compared to Decision Trees.  

4️⃣ **Works Well on Large Datasets** – Scales better with high-dimensional data.  

5️⃣ **Feature Importance** – Identifies key parameters influencing predictions.  

6️⃣ **Handles Missing Values** – More robust compared to single Decision Trees.  

7️⃣ **Resistant to Outliers** – Averaging across trees minimizes their impact.