In [1]:
from io import StringIO

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.exceptions import NotFittedError
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

class CropYieldRecommender:
    """
    A system to predict crop yield and recommend the best crop.

    Part 1: Trains a RandomForestRegressor to predict 'hg/ha_yield'.
    Part 2: Uses the trained model to find the crop with the highest
            predicted yield for a given set of conditions.

    Attributes:
        model_pipeline: Fitted preprocessing + regression Pipeline, or None
            until fit() has completed successfully.
        preprocessor: ColumnTransformer built by _build_preprocessor().
        model: The RandomForestRegressor placed at the end of the pipeline.
        all_crops (list): Unique values of the 'Item' column seen in fit().
        feature_names (list): Feature column names, in training order.
        numerical_features (list): Columns routed to the StandardScaler.
        categorical_features (list): Columns routed to the OneHotEncoder.
    """

    def __init__(self):
        self.model_pipeline = None
        self.preprocessor = None
        self.model = None
        self.all_crops = []
        self.feature_names = []
        # Initialised here so they exist (as empty lists) before fit() runs.
        self.numerical_features = []
        self.categorical_features = []

    def _build_preprocessor(self, X):
        """
        Defines the ColumnTransformer to handle mixed data types.

        Args:
            X (pd.DataFrame): Feature frame whose dtypes decide which
                columns are scaled and which are one-hot encoded.
        """
        # Identify numerical and categorical features
        # Note: 'Year' is treated as numerical here.
        self.numerical_features = X.select_dtypes(include=np.number).columns.tolist()
        self.categorical_features = X.select_dtypes(include='object').columns.tolist()

        # Create transformers
        numeric_transformer = Pipeline(steps=[
            ('scaler', StandardScaler())
        ])

        # handle_unknown='ignore' lets prediction proceed on categories
        # that were not present in the training data.
        categorical_transformer = Pipeline(steps=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])

        # Create the main preprocessor
        self.preprocessor = ColumnTransformer(
            transformers=[
                ('num', numeric_transformer, self.numerical_features),
                ('cat', categorical_transformer, self.categorical_features)
            ])

    def _require_fitted(self):
        """
        Raises NotFittedError unless fit() has completed successfully.
        """
        # Explicit None check: truthiness of a Pipeline goes through its
        # __len__, which is not what "has been fitted" means.
        if self.model_pipeline is None:
            raise NotFittedError("Model is not fitted yet. Call .fit() first.")

    def _to_feature_frame(self, records):
        """
        Builds a DataFrame from dict records with columns in training order.

        Args:
            records (list[dict]): One dict per observation.

        Raises:
            KeyError: If any training feature is absent from the records.
        """
        frame = pd.DataFrame(records)
        missing = [name for name in self.feature_names if name not in frame.columns]
        if missing:
            raise KeyError(f"Missing required feature(s): {missing}")
        return frame[self.feature_names]

    def fit(self, csv_data):
        """
        Loads data, builds the preprocessor, and trains the model pipeline.

        Args:
            csv_data: Anything accepted by pandas.read_csv (path or buffer).
                Must contain the 'Item' and 'hg/ha_yield' columns; an 'id'
                column, if present, is discarded.

        Returns:
            CropYieldRecommender: This instance, to allow call chaining.
        """
        # 1. Load and prepare data
        df = pd.read_csv(csv_data)
        # Drop the 'id' column as it's an index; tolerate files without it.
        df = df.drop(columns='id', errors='ignore')

        # Get list of all unique crops for the recommender
        crops = df['Item'].unique().tolist()

        # Define features (X) and target (y)
        X = df.drop('hg/ha_yield', axis=1)
        y = df['hg/ha_yield']

        # 2. Build the preprocessor
        self._build_preprocessor(X)

        # 3. Create the full model pipeline
        regressor = RandomForestRegressor(n_estimators=100, random_state=42)
        pipeline = Pipeline(steps=[
            ('preprocessor', self.preprocessor),
            ('regressor', regressor)
        ])

        # 4. Train the model
        print("Training model...")
        pipeline.fit(X, y)
        print("Model trained.")

        # Publish state only after training succeeded, so a failed fit()
        # never leaves the object looking fitted.
        self.model = regressor
        self.model_pipeline = pipeline
        self.all_crops = crops
        self.feature_names = X.columns.tolist()
        return self

    def predict_yield(self, input_data):
        """
        (Part 1) Predicts the yield for a single, complete observation.

        Args:
            input_data (dict): A dictionary with all feature columns.

        Returns:
            The predicted 'hg/ha_yield' value for the observation.

        Raises:
            NotFittedError: If fit() has not been called.
            KeyError: If a feature column is missing from input_data.
        """
        self._require_fitted()

        # Convert dict to DataFrame, ensuring correct column order
        input_df = self._to_feature_frame([input_data])

        prediction = self.model_pipeline.predict(input_df)
        return prediction[0]

    def recommend_best_crop(self, input_data):
        """
        (Part 2) Recommends the best crop for given environmental conditions.

        Args:
            input_data (dict): A dictionary with all features *except* 'Item'.
                It is not modified.

        Returns:
            list[tuple]: (crop, predicted_yield) pairs for every known crop,
            sorted by predicted yield, highest first.

        Raises:
            NotFittedError: If fit() has not been called.
            KeyError: If a feature column is missing from input_data.
        """
        self._require_fitted()

        # One candidate row per known crop, sharing the same conditions.
        candidate_rows = [{**input_data, 'Item': crop} for crop in self.all_crops]
        test_df = self._to_feature_frame(candidate_rows)

        # Get yield predictions for all crops
        predictions = self.model_pipeline.predict(test_df)

        # Combine crops with their predicted yields, highest yield first
        return sorted(
            zip(self.all_crops, predictions),
            key=lambda pair: pair[1],
            reverse=True,
        )

# --- Main execution ---
if __name__ == "__main__":

    # Visual divider printed between the sections of the demo output.
    section_break = "\n" + "="*40 + "\n"

    # Step 1: build the recommender and train it on the yield dataset.
    recommender = CropYieldRecommender()
    recommender.fit("/mnt/10EE4B76EE4B5360/College/pccoe/7th Sem/RS/RS-A1_yield.csv")

    print(section_break)

    # Step 2 (Part 1): regression on one fully specified observation.
    print("--- Part 1: Predict Yield (Regression) ---")

    # A row whose true yield is known (Albania, Maize, 1990), used as a
    # quick sanity check of the trained model.
    test_prediction_data = {
        'Area': 'Albania',
        'Item': 'Maize',
        'Year': 1990,
        'average_rain_fall_mm_per_year': 1485.0,
        'pesticides_tonnes': 121.0,
        'avg_temp': 16.37,
    }

    predicted_yield = recommender.predict_yield(test_prediction_data)
    print(f"Prediction for (Albania, Maize, 1990): {predicted_yield:.2f} hg/ha")
    print("(Actual was: 36613 hg/ha)")

    print(section_break)

    # Step 3 (Part 2): rank every known crop for a fixed set of conditions.
    print("--- Part 2: Recommend Best Crop (Recommender) ---")

    # Environmental conditions without a crop (e.g., Algeria in 1991);
    # the recommender fills in 'Item' for each candidate crop.
    conditions = {
        'Area': 'Algeria',
        'Year': 1991,
        'average_rain_fall_mm_per_year': 89.0,
        'pesticides_tonnes': 260.0,
        'avg_temp': 23.78,
    }

    print(f"Conditions: Area={conditions['Area']}, Year={conditions['Year']}, Rain={conditions['average_rain_fall_mm_per_year']}mm, Temp={conditions['avg_temp']}C")

    best_crops = recommender.recommend_best_crop(conditions)

    print("\nTop 3 Recommended Crops (Highest Predicted Yield):")
    top_three = best_crops[:3]
    for rank, (crop, yield_val) in enumerate(top_three, start=1):
        print(f"  {rank}. {crop}: {yield_val:.2f} hg/ha")

    print("\nFull Ranking:")
    ranking_table = pd.DataFrame(best_crops, columns=['Crop', 'Predicted_Yield'])
    print(ranking_table.to_string(index=False))

Training model...
Model trained.


--- Part 1: Predict Yield (Regression) ---
Prediction for (Albania, Maize, 1990): 32525.50 hg/ha
(Actual was: 36613 hg/ha)


--- Part 2: Recommend Best Crop (Recommender) ---
Conditions: Area=Algeria, Year=1991, Rain=89.0mm, Temp=23.78C

Top 3 Recommended Crops (Highest Predicted Yield):
  1. Potatoes: 164580.95 hg/ha
  2. Sweet potatoes: 98713.67 hg/ha
  3. Cassava: 79518.89 hg/ha

Full Ranking:
                Crop  Predicted_Yield
            Potatoes        164580.95
      Sweet potatoes         98713.67
             Cassava         79518.89
Plantains and others         65156.41
                Yams         63553.20
         Rice, paddy         26829.28
               Maize         26778.52
            Soybeans         15709.82
             Sorghum         15181.15
               Wheat         11515.88
