In [6]:
from pathlib import Path, PureWindowsPath

import joblib
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder

In [7]:
def _build_model(numeric_features, categorical_features):
    """Return an unfitted pipeline: column preprocessing followed by a random forest.

    Numeric columns are passed through unchanged; categorical columns are
    one-hot encoded, with categories unseen during fitting encoded as all zeros
    (``handle_unknown='ignore'``).
    """
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', 'passthrough', numeric_features),
            ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features),
        ])

    # max_depth=15 caps how deep each tree may grow, which limits model
    # complexity and therefore the risk of overfitting.
    return Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', RandomForestRegressor(n_estimators=100, max_depth=15, random_state=42)),
    ])


def _report_diagnostics(train_score, test_score, mae):
    """Print train/test R², the test MAE, and a verdict based on the R² gap.

    The verdict looks only at ``train_score - test_score``. A small (or
    negative) gap means the two scores are close; it does not by itself mean
    that either score is good, so read it together with the printed values.
    """
    print("\n📊 --- MODEL DIAGNOSTICS ---")
    print(f"   Training Accuracy (R²): {train_score:.4f}")
    print(f"   Testing Accuracy (R²):  {test_score:.4f}")
    print(f"   Mean Absolute Error:    {mae:.4f} tons/ha")

    # Thresholds are absolute differences in R² (0.15 and 0.05).
    gap = train_score - test_score
    if gap > 0.15:
        print("   ⚠️ WARNING: High Overfitting Risk! (Gap > 15%)")
    elif gap > 0.05:
        print("   ℹ️ Note: Slight Overfitting (Normal for Random Forest)")
    else:
        print("   ✅ PERFECT: Model generalizes extremely well.")


def train_and_evaluate(
    data_path="C:\\calcutta_hacks\\Project-Kishan-Kolkata-Hacks\\data\\agricultural_data.csv",
    model_path='models/best_model.pkl',
):
    """Train a random-forest yield model from a CSV file, report diagnostics, and save it.

    Steps: read the CSV, split it 80/20 with a fixed seed, fit the pipeline on
    the training part, print R² for both parts plus the test-set mean absolute
    error, then serialize the fitted pipeline with joblib.

    Parameters
    ----------
    data_path : str or path-like, optional
        CSV file to read. It must contain the columns ``NDVI``, ``GNDVI``,
        ``NDWI``, ``SAVI``, ``soil_moisture``, ``temperature``, ``rainfall``,
        ``crop_type`` and ``yield``. Defaults to the original hard-coded
        location.
    model_path : str or path-like, optional
        Destination of the serialized pipeline. Missing parent directories
        are created.

    Returns
    -------
    None
        Results are printed. If ``data_path`` does not exist, an error message
        is printed and the function returns without training.

    Raises
    ------
    KeyError
        If a required column is missing from the CSV (raised by pandas).
    """
    # PureWindowsPath accepts both '\\' and '/' separators, so the bare file
    # name is extracted correctly regardless of the platform running the code.
    data_name = PureWindowsPath(str(data_path)).name

    print(f"⏳ Loading {data_name}...")
    try:
        df = pd.read_csv(data_path)
    except FileNotFoundError:
        print(f"❌ Error: {data_name} not found.")
        return

    # 1. Setup -- each feature group is defined once and reused for both the
    # column selection and the preprocessor, so the two cannot drift apart.
    numeric_features = ['NDVI', 'GNDVI', 'NDWI', 'SAVI', 'soil_moisture',
                        'temperature', 'rainfall']
    categorical_features = ['crop_type']
    target = 'yield'

    X = df[numeric_features + categorical_features]
    y = df[target]

    # 2. Pipeline Construction
    model = _build_model(numeric_features, categorical_features)

    # 3. Train/Test Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # 4. Training
    print("🚂 Training Model...")
    model.fit(X_train, y_train)

    # 5. Overfitting Check -- predict the test split once and reuse the
    # predictions for both metrics; r2_score on them is what model.score
    # computes for a regressor.
    train_score = model.score(X_train, y_train)
    y_test_pred = model.predict(X_test)
    test_score = r2_score(y_test, y_test_pred)
    mae = mean_absolute_error(y_test, y_test_pred)

    _report_diagnostics(train_score, test_score, mae)

    # 6. Save -- create the target directory first so a missing folder does
    # not throw away the freshly trained model.
    Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    joblib.dump(model, model_path)
    print(f"\n💾 Model Saved to '{model_path}'")

# Entry point: run the complete load / train / evaluate / save workflow when
# this code is executed as the main module rather than imported.
if __name__ == "__main__":
    train_and_evaluate()

⏳ Loading agricultural_data.csv...
❌ Error: agricultural_data.csv not found.
