In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

# Purpose of this cell: train a RandomForestRegressor that predicts
# 'Production' from one-hot encoded categorical columns plus the remaining
# numeric column(s), then predict for one hand-entered sample.

# Load dataset
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
df = pd.read_csv("D:/recomendation/crop_production_karnataka.csv")

# Drop Crop_Year column
df = df.drop(['Crop_Year'], axis=1)

# Rows without a target value cannot be used to fit a regressor
df = df.dropna(subset=['Production'])

# Features and target
X = df.drop(['Production'], axis=1)
y = df['Production']

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Categorical columns; every other feature column is treated as numeric
categorical_cols = ['State_Name', 'District_Name', 'Season', 'Crop']
numerical_cols = [col for col in X_train.columns if col not in categorical_cols]

# One-hot encoding (fitted on the training split only)
ohe = OneHotEncoder(handle_unknown='ignore')
ohe.fit(X_train[categorical_cols])

X_train_categorical = ohe.transform(X_train[categorical_cols])
X_test_categorical = ohe.transform(X_test[categorical_cols])

# Final layout: [one-hot block | numeric columns]; the same layout is rebuilt
# for the user sample below.
X_train_final = np.hstack((X_train_categorical.toarray(), X_train[numerical_cols].to_numpy()))
X_test_final = np.hstack((X_test_categorical.toarray(), X_test[numerical_cols].to_numpy()))

# Train Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train_final, y_train)

# --- Jupyter-friendly input ---
Jstate = "Karnataka"
Jdistrict = "BAGALKOT"
Jseason = "Kharif"
Jcrops = "Rice"
Jarea = 197

# Build the sample as a one-row DataFrame with the training column names.
# A single np.array would coerce the numeric area to a string, and the encoder
# (fitted on a DataFrame) would receive input without feature names.
# If the dataset has more than one numeric column this raises immediately,
# instead of failing later with a feature-count mismatch at predict time.
user_input = pd.DataFrame(
    [[Jstate, Jdistrict, Jseason, Jcrops, Jarea]],
    columns=categorical_cols + numerical_cols,
)

# handle_unknown='ignore' encodes unseen categories as all zeros without any
# error, so surface that case explicitly (e.g. a typo or different casing).
for col, known_values in zip(categorical_cols, ohe.categories_):
    value = user_input.at[0, col]
    if value not in known_values:
        print(f"Warning: {col}={value!r} was not seen during training; "
              "it will be encoded as all zeros.")

# One-hot encode categorical columns
user_input_categorical = ohe.transform(user_input[categorical_cols])

# Combine categorical and numerical
user_input_final = np.hstack((
    user_input_categorical.toarray(),
    user_input[numerical_cols].to_numpy(dtype=float),
))

# Make prediction
prediction = model.predict(user_input_final)
print("Predicted Production:", prediction[0])


Predicted Production: 427.64




In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import joblib

# Purpose of this cell: train a RandomForestRegressor on the crop dataset
# extended with soil/vegetation features, then persist the fitted encoder and
# model with joblib.

# 1. Load your NEW dataset that includes the GEE features
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
df = pd.read_csv("D:/recomendation/crop_production_with_soil_data.csv")

# --- DATA PREPARATION ---
# Identify which columns are categorical (text) and which are numerical
categorical_cols = ['State_Name', 'District_Name', 'Season', 'Crop']
# Numerical feature columns used by the model
numerical_cols = ['Area', 'NDVI_mean', 'SM_surface', 'SM_rootzone', 'pH_top30cm', 'SOC_gkg_top30cm', 'WC33_vpct_top30cm']

# Fail early with a readable message if the CSV schema does not match the
# hand-maintained column lists above (otherwise this surfaces as a KeyError
# further down the pipeline).
required_cols = categorical_cols + numerical_cols + ['Production']
missing_cols = [col for col in required_cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Dataset is missing expected columns: {missing_cols}")

# Rows without a target value cannot be used to fit a regressor
df = df.dropna(subset=['Production'])

# Features (X) are everything except the target 'Production'
X = df.drop(['Production'], axis=1)
# Target (y) is 'Production'
y = df['Production']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- ENCODING CATEGORICAL FEATURES ---
# Create and fit the OneHotEncoder ONLY on the training data
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[categorical_cols])

# Transform both training and testing data
X_train_cat_encoded = ohe.transform(X_train[categorical_cols])
X_test_cat_encoded = ohe.transform(X_test[categorical_cols])

# Combine encoded categorical features with the numerical features.
# Layout is [one-hot block | numerical_cols in the order listed above]; any
# code that loads the saved model must build its input in the same order.
X_train_final = np.hstack((X_train_cat_encoded, X_train[numerical_cols].values))
X_test_final = np.hstack((X_test_cat_encoded, X_test[numerical_cols].values))

# --- MODEL TRAINING ---
print("Training the Random Forest model with soil data...")
# Initialize and train the model
soil_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
soil_model.fit(X_train_final, y_train)
print("Model training complete.")

# --- SAVE THE TRAINED OBJECTS ---
# Persist both objects needed at prediction time: the encoder (to reproduce
# the one-hot layout) and the model itself.

# Save the fitted OneHotEncoder
joblib.dump(ohe, 'soil_data_encoder.joblib')

# Save the trained RandomForest model
joblib.dump(soil_model, 'soil_data_model.joblib')

print("\nSuccessfully saved 'soil_data_encoder.joblib' and 'soil_data_model.joblib'")
print("You can now move these files to your Flask project's /models/ folder.")

✅ Model files saved successfully!
📁 Files created:
   - models/crop_production_model.pkl
   - models/one_hot_encoder.pkl
✅ Files verified - can be loaded successfully!


In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import joblib
import os

# Purpose of this cell: train the baseline RandomForestRegressor on the
# Karnataka crop dataset and save the fitted encoder and model under models/.

# --- 1. Data Loading and Preprocessing ---

# Create a models directory if it doesn't exist
os.makedirs('models', exist_ok=True)

# Load dataset
# NOTE(review): absolute, machine-specific path; consider a configurable data dir.
csv_path = "D:/recomendation/crop_production_karnataka.csv"
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError as err:
    # Raise instead of calling exit(): in a notebook, exit() ends the kernel
    # session rather than just stopping this cell.
    raise FileNotFoundError(
        f"Error: dataset not found at '{csv_path}'. "
        "Please place the CSV at that path or update csv_path."
    ) from err


# Drop Crop_Year column
df = df.drop(['Crop_Year'], axis=1)

# Handle potential missing values in Production (target variable)
df = df.dropna(subset=['Production'])


# Features and target
X = df.drop(['Production'], axis=1)
y = df['Production']

# Split train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- 2. Encoding Categorical Features ---

# Identify categorical columns
categorical_cols = ['State_Name', 'District_Name', 'Season', 'Crop']

# Initialize and fit OneHotEncoder (training split only)
ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
ohe.fit(X_train[categorical_cols])

# Save the fitted encoder
joblib.dump(ohe, 'models/encoder.joblib')
print("Encoder saved to models/encoder.joblib")


# Transform the training and testing data
X_train_categorical = ohe.transform(X_train[categorical_cols])
X_test_categorical = ohe.transform(X_test[categorical_cols])

# Combine encoded categorical features with numerical features.
# Layout is [one-hot block | remaining non-categorical columns]; any code that
# loads the saved model must build its input in the same order.
X_train_final = np.hstack((X_train_categorical, X_train.drop(categorical_cols, axis=1).values))
X_test_final = np.hstack((X_test_categorical, X_test.drop(categorical_cols, axis=1).values))


# --- 3. Model Training ---

# Initialize and train Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1) # Use n_jobs=-1 for faster training
model.fit(X_train_final, y_train)
print("Model training complete.")

# --- 4. Save the Trained Model ---

# Save the trained model to the models directory
joblib.dump(model, 'models/crop_yield_model.joblib')
print("Model saved to models/crop_yield_model.joblib")

Encoder saved to models/encoder.joblib
Model training complete.
Model saved to models/crop_yield_model.joblib
