In [2]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp311-cp311-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [3]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

In [6]:
data = pd.read_csv('/content/Expanded_Crop_price.csv')

In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1500 entries, 0 to 1499
Data columns (total 7 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   Vegetable                        1500 non-null   object
 1   Season                           1500 non-null   object
 2   Month                            1500 non-null   object
 3   Temp                             1500 non-null   int64 
 4   Deasaster Happen in last 3month  1500 non-null   object
 5   Vegetable condition              1500 non-null   object
 6   Price per kg                     1500 non-null   int64 
dtypes: int64(2), object(5)
memory usage: 82.2+ KB


In [8]:
def onehot_encode(df, column):
    df = df.copy()
    dummies = pd.get_dummies(df[column], prefix=column)
    df = pd.concat([df, dummies], axis=1)
    df = df.drop(column, axis=1)
    return df

In [9]:
def preprocess_inputs(df):
    df = df.copy()

    # Clean Vegetable condition column
    df['Vegetable condition'] = df['Vegetable condition'].replace({'scarp': 'scrap'})

    # Binary encoding
    df['Deasaster Happen in last 3month'] = df['Deasaster Happen in last 3month'].replace({'no': 0, 'yes': 1})

    # Ordinal encoding
    df['Month'] = df['Month'].replace({
        'jan': 1,
        'apr': 4,
        'july': 7,
        'sept': 9,
        'oct': 10,
        'dec': 12,
        'may': 5,
        'aug': 8,
        'june': 6,
        ' ': np.NaN,
        'march': 3
    })

    # Fill missing month values with column mode
    df['Month'] = df['Month'].fillna(df['Month'].mode()[0])

    # One-hot encoding
    for column in ['Vegetable', 'Season', 'Vegetable condition']:
        df = onehot_encode(df, column)

    # Split df into X and y
    y = df['Price per kg']
    X = df.drop('Price per kg', axis=1)

    # Train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, shuffle=True, random_state=1)

    # Scale X
    scaler = StandardScaler()
    scaler.fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
    X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

    return X_train, X_test, y_train, y_test

In [10]:
X_train, X_test, y_train, y_test = preprocess_inputs(data)

  df['Deasaster Happen in last 3month'] = df['Deasaster Happen in last 3month'].replace({'no': 0, 'yes': 1})
  df['Month'] = df['Month'].replace({


In [11]:
models = {
    "                     Linear Regression": LinearRegression(),
    " Linear Regression (L2 Regularization)": Ridge(),
    " Linear Regression (L1 Regularization)": Lasso(),
    "                   K-Nearest Neighbors": KNeighborsRegressor(),
    "                        Neural Network": MLPRegressor(),
    "Support Vector Machine (Linear Kernel)": LinearSVR(),
    "   Support Vector Machine (RBF Kernel)": SVR(),
    "                         Decision Tree": DecisionTreeRegressor(),
    "                         Random Forest": RandomForestRegressor(),
    "                     Gradient Boosting": GradientBoostingRegressor(),
    "                               XGBoost": XGBRegressor(),
    "                              LightGBM": LGBMRegressor(),
    "                              CatBoost": CatBoostRegressor(verbose=0)
}

for name, model in models.items():
    model.fit(X_train, y_train)
    print(name + " trained.")

                     Linear Regression trained.
 Linear Regression (L2 Regularization) trained.
 Linear Regression (L1 Regularization) trained.
                   K-Nearest Neighbors trained.




                        Neural Network trained.
Support Vector Machine (Linear Kernel) trained.
   Support Vector Machine (RBF Kernel) trained.
                         Decision Tree trained.
                         Random Forest trained.
                     Gradient Boosting trained.
                               XGBoost trained.
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007497 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 104
[LightGBM] [Info] Number of data points in the train set: 1050, number of used features: 27
[LightGBM] [Info] Start training from score 56.660952
                              LightGBM trained.
                              CatBoost trained.


In [12]:
for name, model in models.items():
    print(name + " R^2 Score: {:.5f}".format(model.score(X_test, y_test)))

                     Linear Regression R^2 Score: 0.74874
 Linear Regression (L2 Regularization) R^2 Score: 0.74877
 Linear Regression (L1 Regularization) R^2 Score: 0.74006
                   K-Nearest Neighbors R^2 Score: 0.91981
                        Neural Network R^2 Score: 0.90321
Support Vector Machine (Linear Kernel) R^2 Score: 0.68554
   Support Vector Machine (RBF Kernel) R^2 Score: 0.36586
                         Decision Tree R^2 Score: 0.92691
                         Random Forest R^2 Score: 0.92694
                     Gradient Boosting R^2 Score: 0.89686
                               XGBoost R^2 Score: 0.92691
                              LightGBM R^2 Score: 0.92577
                              CatBoost R^2 Score: 0.92691


In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load dataset
df = pd.read_csv("Expanded_Crop_price.csv")  # Update with correct file path

# Preprocess the data using the preprocess_inputs function
# This will handle categorical features and scaling
X_train, X_test, y_train, y_test = preprocess_inputs(df) # Call preprocess_inputs

# Train Random Forest (X_train, y_train are now preprocessed)
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Predictions
y_pred = model.predict(X_test)

# Evaluation Metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"✅ Mean Absolute Error (MAE): {mae:.2f}")
print(f"✅ Root Mean Squared Error (RMSE): {rmse:.2f}")
print(f"✅ R² Score: {r2:.5f}")

  df['Deasaster Happen in last 3month'] = df['Deasaster Happen in last 3month'].replace({'no': 0, 'yes': 1})
  df['Month'] = df['Month'].replace({


✅ Mean Absolute Error (MAE): 7.27
✅ Root Mean Squared Error (RMSE): 12.69
✅ R² Score: 0.92703


In [46]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

# Load dataset
df = pd.read_csv("/content/Expanded_Crop_price.csv")

# Ensure 'Price per kg' exists
if 'Price per kg' not in df.columns:
    raise ValueError("Error: 'Price per kg' column is missing from the dataset.")

# Convert 'Month' column to numerical format (Jan = 1, Feb = 2, ...)
month_mapping = {
    'jan': 1, 'feb': 2, 'mar': 3, 'apr': 4, 'may': 5, 'jun': 6,
    'jul': 7, 'aug': 8, 'sep': 9, 'oct': 10, 'nov': 11, 'dec': 12,
    'july': 7, 'sept': 9  # Handle alternate spellings
}
df['Month'] = df['Month'].map(month_mapping)

# Handle unrecognized month values
if df['Month'].isnull().sum() > 0:
    df['Month'] = df['Month'].fillna(df['Month'].mode()[0])

# Define features and target variable
X = df.drop('Price per kg', axis=1)
y = df['Price per kg']

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define categorical and numerical features
categorical_features = ['Vegetable', 'Season', 'Vegetable condition', 'Deasaster Happen in last 3month']
numerical_features = ['Month', 'Farm size'] if 'Farm size' in X_train.columns else ['Month']

# Ensure numerical features exist
missing_numerical_features = [feature for feature in numerical_features if feature not in X_train.columns]
if missing_numerical_features:
    raise ValueError(f"Error: Missing numerical features: {missing_numerical_features}")

# Create preprocessing pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_features)
    ])

# Fit and transform the data
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

# Train Random Forest Model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

print("✅ Model training completed successfully!")

# =========================
# 🚀 Interactive Menu for Predictions
# =========================

def predict_crop_price():
    print("\n💬 Enter details to predict crop price\n")

    # Dropdown options
    vegetable_options = df['Vegetable'].unique().tolist()
    season_options = df['Season'].unique().tolist()
    condition_options = df['Vegetable condition'].unique().tolist()
    disaster_options = df['Deasaster Happen in last 3month'].unique().tolist()

    # User Input
    vegetable = input(f"🌱 Select Vegetable {vegetable_options}: ").strip()
    season = input(f"🗓️ Select Season {season_options}: ").strip()
    condition = input(f"🥦 Select Vegetable Condition {condition_options}: ").strip()
    disaster = input(f"🌍 Any Disaster in Last 3 Months {disaster_options}: ").strip()

    # Month Input
    month_name = input("📅 Enter Month (e.g., jan, feb, mar, apr): ").strip().lower()
    month = month_mapping.get(month_name, None)
    if month is None:
        print(f"⚠️ Invalid month '{month_name}', defaulting to January.")
        month = 1



    # Create input dataframe
    input_data = pd.DataFrame({
        'Vegetable': [vegetable],
        'Season': [season],
        'Vegetable condition': [condition],
        'Deasaster Happen in last 3month': [disaster],
        'Month': [month]

    })

    # Preprocess input data
    input_data = preprocessor.transform(input_data)

    # Predict
    predicted_price = model.predict(input_data)[0]
    print(f"\n💰 Predicted Crop Price: ₹{predicted_price:.2f} per kg\n")

# Run prediction
predict_crop_price()


✅ Model training completed successfully!

💬 Enter details to predict crop price

🌱 Select Vegetable ['potato', 'tomato ', 'peas', 'pumkin', 'cucumber', 'pointed grourd ', 'Raddish', 'Bitter gourd', 'onion', 'garlic', 'cabage', 'califlower', 'chilly', 'okra', 'brinjal', 'ginger', 'radish']: potato
🗓️ Select Season ['winter', 'summer', 'monsoon', 'autumn', 'spring']: monsoon
🥦 Select Vegetable Condition ['fresh', 'scrap', 'avarage', 'scarp']: avarage
🌍 Any Disaster in Last 3 Months ['no', 'yes']: yes
📅 Enter Month (e.g., jan, feb, mar, apr): sept

💰 Predicted Crop Price: ₹56.02 per kg

