<a href="https://colab.research.google.com/github/Karnikasri/Soil_Heath_Status_-_Remediation_System/blob/main/dataset_capstone.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

# Step 1: Read the raw Soil Health Card export into a DataFrame
df = pd.read_csv(filepath_or_buffer="india_soil_health_card_data.csv")

# Step 2: Define classification function using Soil Health Card (SHC, Govt. of India) thresholds
# (low_cutoff, high_cutoff) per nutrient column, as used by this notebook for the
# Soil Health Card style Low / Medium / High grading.
_SHC_CUTOFFS = {
    'nitrogen': (280, 560),
    'phosphorus': (10, 25),
    'potassium': (108, 280),
    'organic_carbon': (0.5, 0.75),
}


def _nutrient_status(value, low_cutoff, high_cutoff):
    """Grade one reading: below ``low_cutoff`` is "Low", above ``high_cutoff`` is "High", else "Medium"."""
    if value < low_cutoff:
        return "Low"
    if value <= high_cutoff:
        return "Medium"
    return "High"


def classify_soil(row):
    """Classify one soil sample as "Poor", "Moderate" or "Good".

    Parameters
    ----------
    row : mapping or pandas.Series
        Must provide the keys 'nitrogen', 'phosphorus', 'potassium' and
        'organic_carbon'.

    Returns
    -------
    str or None
        "Poor" if any available reading grades as Low, "Good" if all four grade
        as High, "Moderate" otherwise. ``None`` when a reading is missing and no
        available reading is Low, because the class cannot be determined.

    Notes
    -----
    A missing value (NaN / None) fails both ``<`` and ``<=``, so without the
    explicit check it would be graded "High" and could yield a spurious "Good".
    """
    statuses = []
    has_missing = False
    for column, (low_cutoff, high_cutoff) in _SHC_CUTOFFS.items():
        value = row[column]
        if pd.isna(value):
            has_missing = True
            continue
        statuses.append(_nutrient_status(value, low_cutoff, high_cutoff))

    # One Low reading is enough for "Poor", whatever the missing values would have been.
    if "Low" in statuses:
        return "Poor"
    if has_missing:
        return None
    if all(status == "High" for status in statuses):
        return "Good"
    return "Moderate"

# Step 3: Apply classification
df["Health_Status"] = df.apply(classify_soil, axis=1)

# Step 4: Generate synthetic "Good" samples (to balance dataset)
n_samples = 500  # You can tune this based on model needs

# Seeded generator so Restart & Run All writes the same CSV every time.
rng = np.random.default_rng(42)

# Take state and district from the SAME donor row; drawing them independently
# would pair districts with states they do not appear with in the source data.
donor_positions = rng.integers(0, len(df), size=n_samples)
donor_rows = df.iloc[donor_positions]


def _uniform_in_observed_range(column):
    """Draw n_samples values uniformly between the column's observed min and max."""
    return rng.uniform(df[column].min(), df[column].max(), n_samples)


synthetic_good = pd.DataFrame({
    "state": donor_rows["state"].to_numpy(),
    "district": donor_rows["district"].to_numpy(),
    "ph": rng.uniform(6.5, 7.5, n_samples),  # Near-neutral band
    # The four ranges below sit strictly above the "High" cutoffs in classify_soil.
    "organic_carbon": rng.uniform(0.8, 1.2, n_samples),
    "nitrogen": rng.uniform(600, 800, n_samples),
    "phosphorus": rng.uniform(30, 50, n_samples),
    "potassium": rng.uniform(300, 500, n_samples),
    "sulphur": _uniform_in_observed_range("sulphur"),
    "zinc": _uniform_in_observed_range("zinc"),
    "boron": _uniform_in_observed_range("boron"),
    "iron": _uniform_in_observed_range("iron"),
    "manganese": _uniform_in_observed_range("manganese"),
    "copper": _uniform_in_observed_range("copper"),
    "soil_type": rng.choice(df["soil_type"].to_numpy(), n_samples),
    "rainfall": _uniform_in_observed_range("rainfall"),
    "temperature": _uniform_in_observed_range("temperature"),
    "Health_Status": "Good"
})

# Sanity check: the hard-coded label must agree with the rule used for real rows.
assert (synthetic_good.apply(classify_soil, axis=1) == "Good").all(), \
    "synthetic rows are not classified as Good by classify_soil"

# Step 5: Merge original + synthetic dataset
df_balanced = pd.concat([df, synthetic_good], ignore_index=True)

# Step 6: Save balanced dataset to CSV
df_balanced.to_csv("soil_health_balanced.csv", index=False)

# Step 7: Print distribution check
print("Original distribution:\n", df["Health_Status"].value_counts())
print("\nBalanced distribution:\n", df_balanced["Health_Status"].value_counts())
print("\nSaved file: soil_health_balanced.csv")



Original distribution:
 Health_Status
Poor        4885
Moderate     115
Name: count, dtype: int64

Balanced distribution:
 Health_Status
Poor        4885
Good         500
Moderate     115
Name: count, dtype: int64

Saved file: soil_health_balanced.csv


In [None]:
# model1_train.py
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import joblib

# 1. Load labelled data (created above)
# Relative path: this is the same location the dataset cell writes to. A hard-coded
# "/content/..." path only resolves when the working directory is Colab's /content.
df = pd.read_csv("soil_health_balanced.csv")

# 2. Features & target (adapt these names to your CSV)
features = ['ph','organic_carbon','nitrogen','phosphorus','potassium',
            'sulphur','zinc','boron','iron','manganese','copper',
            'soil_type','rainfall','temperature']  # include environmental features if desired
target = 'Health_Status'

# 3. Basic cleaning: drop rows with missing target or essential features
# NOTE(review): the remaining features (sulphur ... temperature) are not checked here;
# whether NaNs in them are accepted depends on the installed scikit-learn version.
df = df.dropna(subset=['ph','organic_carbon','nitrogen','phosphorus','potassium', target])
X = df[features]
y = df[target]

# 4. Preprocessing pipeline
# - OneHotEncode soil_type (categorical); every other column is passed through unscaled
preprocessor = ColumnTransformer(transformers=[
    ('soil', OneHotEncoder(handle_unknown='ignore'), ['soil_type'])
], remainder='passthrough')

pipeline = Pipeline([
    ('pre', preprocessor),
    ('clf', RandomForestClassifier(n_estimators=200, random_state=42, class_weight='balanced'))
])

# 5. Train-test split (stratify to keep class balance)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# 6. Train
pipeline.fit(X_train, y_train)

# 7. Evaluate
# NOTE(review): Health_Status was derived by rule from nitrogen / phosphorus / potassium /
# organic_carbon, which are also model inputs, so near-perfect scores mostly show the
# model recovering that rule rather than generalising.
y_pred = pipeline.predict(X_test)
print("Classification report:\n", classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))

# 8. Save model for later integration
joblib.dump(pipeline, "model1_random_forest.joblib")
print("Saved model1_random_forest.joblib")

Classification report:
               precision    recall  f1-score   support

        Good       1.00      1.00      1.00       100
    Moderate       1.00      0.83      0.90        23
        Poor       1.00      1.00      1.00       977

    accuracy                           1.00      1100
   macro avg       1.00      0.94      0.97      1100
weighted avg       1.00      1.00      1.00      1100

Confusion matrix:
 [[100   0   0]
 [  0  19   4]
 [  0   0 977]]
Saved model1_random_forest.joblib


In [None]:
# Existing recommend_bacteria function
def recommend_bacteria(nitrogen, phosphorus, potassium, organic_carbon):
    """Return ``(status, bacteria)`` for a single soil sample.

    * "Poor": at least one reading is below its low cutoff. Only the first
      deficiency found, checked in the order nitrogen, phosphorus, potassium,
      organic carbon, decides the single product returned.
    * "Good": every reading is above its high cutoff; the product is "No bacteria".
    * "Moderate": anything else. Each reading that sits in the lower part of its
      medium band adds one product; the names are joined with ", ". If none
      qualifies, "Trichoderma" is returned.
    """
    # Poor: first deficiency wins.
    if nitrogen < 280:
        return "Poor", "Azospirillum"
    if phosphorus < 10:
        return "Poor", "Pseudomonas fluorescens"
    if potassium < 108:
        return "Poor", "Frateuria aurantia"
    if organic_carbon < 0.5:
        return "Poor", "Rhizobium"

    # Good: all four readings exceed their high cutoff.
    if nitrogen > 560 and phosphorus > 25 and potassium > 280 and organic_carbon > 0.75:
        return "Good", "No bacteria"

    # Moderate: collect a product for every borderline reading (inclusive bounds).
    borderline_bands = (
        (280, nitrogen, 350, "Azospirillum"),
        (10, phosphorus, 15, "Pseudomonas fluorescens"),
        (108, potassium, 150, "Frateuria aurantia"),
        (0.5, organic_carbon, 0.6, "Rhizobium"),
    )
    suggestions = [
        product
        for floor, reading, ceiling, product in borderline_bands
        if floor <= reading <= ceiling
    ]
    return "Moderate", ', '.join(suggestions) or "Trichoderma"

# Dosage and cost mappings (per acre)
dosage_per_acre = {
    'Azospirillum': 2.0,          # liters
    'Pseudomonas fluorescens': 1.5,
    'Frateuria aurantia': 2.0,
    'Rhizobium': 0.2,
    'Trichoderma': 1.5,
    'No bacteria': 0.0
}

cost_per_liter = {
    'Azospirillum': 400,          # INR per liter
    'Pseudomonas fluorescens': 500,
    'Frateuria aurantia': 450,
    'Rhizobium': 250,
    'Trichoderma': 350,
    'No bacteria': 0
}

def calculate_dosage_and_cost(bacteria_str, land_area_acres=1):
    """Compute per-product dosage and the total cost for a recommendation.

    Parameters
    ----------
    bacteria_str : str
        One product name, or several separated by commas (the format produced
        by ``recommend_bacteria``). Blank entries, such as those left by an
        empty string or a trailing comma, are ignored.
    land_area_acres : number, default 1
        Area to treat; every dose scales linearly with it.

    Returns
    -------
    (dict, number)
        ``{product: dose rounded to 2 decimals}`` and the total cost rounded to
        2 decimals. ``({}, 0)`` when no product name is given.

    Notes
    -----
    Names missing from the lookup tables fall back to a dose of 1.0 per acre
    and a price of 400 per liter.
    """
    # Drop blank tokens: otherwise they count towards n (diluting the real doses)
    # and are billed as a phantom '' product at the fallback rate.
    bacteria_list = [name.strip() for name in bacteria_str.split(',')]
    bacteria_list = [name for name in bacteria_list if name]
    if not bacteria_list:
        return {}, 0

    n = len(bacteria_list)
    dosage_dict = {}
    total_cost = 0

    # Distribute dosage equally if multiple bacteria are recommended
    for b in bacteria_list:
        base_dose = dosage_per_acre.get(b, 1.0)
        dose = base_dose * land_area_acres / n
        # Cost uses the unrounded dose; only the reported figures are rounded.
        total_cost += dose * cost_per_liter.get(b, 400)
        dosage_dict[b] = round(dose, 2)

    return dosage_dict, round(total_cost, 2)

# Example to process full DataFrame `df` with columns: nitrogen, phosphorus, potassium, organic_carbon, and land_area (acre)
# NOTE(review): `df` is whatever frame the kernel currently holds under that name
# (the labelled data loaded by the training cell when cells are run in order).
# If land_area is not known, default to 1 acre
df['land_area'] = df.get('land_area', 1)

# Add results columns: one (status, bacteria) pair per row.
# Built from column zips so an empty frame yields empty columns instead of an unpacking error.
recommendations = [
    recommend_bacteria(n_val, p_val, k_val, oc_val)
    for n_val, p_val, k_val, oc_val in zip(
        df['nitrogen'], df['phosphorus'], df['potassium'], df['organic_carbon'])
]
df['Predicted_Health'] = [status for status, _ in recommendations]
df['Recommended_Bacteria'] = [bacteria for _, bacteria in recommendations]

# Dosage dictionary and total cost for each row's recommendation and land area.
dosage_and_cost = [
    calculate_dosage_and_cost(bacteria, land_area_acres=area)
    for bacteria, area in zip(df['Recommended_Bacteria'], df['land_area'])
]
dosages = [dosage_dict for dosage_dict, _ in dosage_and_cost]
costs = [cost for _, cost in dosage_and_cost]

df['Dosage_Recommendation'] = dosages
df['Estimated_Cost_INR'] = costs

# Print example first 5 rows
# head(5) is positional: it works even if index labels 0-4 were dropped by an
# earlier dropna, or if the frame has fewer than five rows.
preview = df.head(5)
for sample_no, (health, bacteria, dosage, cost) in enumerate(zip(
        preview['Predicted_Health'], preview['Recommended_Bacteria'],
        preview['Dosage_Recommendation'], preview['Estimated_Cost_INR']), start=1):
    print(f"Sample {sample_no}:")
    print(f"  Health Status: {health}")
    print(f"  Bacteria: {bacteria}")
    print(f"  Dosage per acre: {dosage}")
    print(f"  Estimated Cost (INR): ₹{cost}\n")


Sample 1:
  Health Status: Poor
  Bacteria: Azospirillum
  Dosage per acre: {'Azospirillum': 2.0}
  Estimated Cost (INR): ₹800.0

Sample 2:
  Health Status: Poor
  Bacteria: Azospirillum
  Dosage per acre: {'Azospirillum': 2.0}
  Estimated Cost (INR): ₹800.0

Sample 3:
  Health Status: Poor
  Bacteria: Azospirillum
  Dosage per acre: {'Azospirillum': 2.0}
  Estimated Cost (INR): ₹800.0

Sample 4:
  Health Status: Poor
  Bacteria: Azospirillum
  Dosage per acre: {'Azospirillum': 2.0}
  Estimated Cost (INR): ₹800.0

Sample 5:
  Health Status: Poor
  Bacteria: Azospirillum
  Dosage per acre: {'Azospirillum': 2.0}
  Estimated Cost (INR): ₹800.0

