In [1]:
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
import os

# Load dataset. `X` holds the full raw frame (target column included); the
# model inputs are built separately in `X_encoded` below.
X = pd.read_csv('dataset/yield_df.csv')
y = X['hg/ha_yield']  # Target variable
X_encoded = X.drop(columns=['hg/ha_yield', 'Unnamed: 0', 'Year'])  # Drop target and unused columns

# Create the artifact folder up front: every joblib.dump below writes into it,
# and joblib.dump does not create missing parent directories.
os.makedirs('model1', exist_ok=True)

# Encode categorical variables (one encoder per column so each can be
# persisted and re-applied independently at prediction time)
label_encoder_item = LabelEncoder()
label_encoder_area = LabelEncoder()

X_encoded['Item'] = label_encoder_item.fit_transform(X['Item'])
X_encoded['Area'] = label_encoder_area.fit_transform(X['Area'])

# Save label encoders
joblib.dump(label_encoder_item, 'model1/label_encoder_Item.pkl')
joblib.dump(label_encoder_area, 'model1/label_encoder_Area.pkl')

# Save feature names for alignment in prediction
feature_names = list(X_encoded.columns)
joblib.dump(feature_names, 'model1/feature_names.pkl')

# Split dataset BEFORE scaling so that no statistic of the held-out rows
# influences preprocessing. Same test_size/random_state as before, so the
# row partition itself is unchanged.
X_train_unscaled, X_test_unscaled, y_train, y_test = train_test_split(
    X_encoded, y, test_size=0.3, random_state=42
)

# Standardize the data: fit mean/std on the training rows only, then apply
# the same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_unscaled)
X_test = scaler.transform(X_test_unscaled)

# Whole dataset transformed with the train-fitted scaler; kept so that the
# name `X_scaled` is still defined for any code that reads it.
X_scaled = scaler.transform(X_encoded)

# Save the scaler
joblib.dump(scaler, 'model1/scaler.pkl')

# Candidate regressors as (display name, estimator) pairs. The display name
# is used both in the printed report and to derive the pickle file name.
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(random_state=42)),
    (
        'Gradient Boost',
        GradientBoostingRegressor(
            n_estimators=100,
            learning_rate=0.1,
            max_depth=3,
            random_state=42,
        ),
    ),
    ('XGBoost', XGBRegressor(random_state=42)),
    ('KNN', KNeighborsRegressor(n_neighbors=5)),
    ('Decision Tree', DecisionTreeRegressor(random_state=42)),
    ('Bagging Regressor', BaggingRegressor(n_estimators=150, random_state=42)),
]

# Make sure the output folder is present before any model file is written.
os.makedirs('model1', exist_ok=True)

# Fit each candidate on the training split, persist it, then score it on the
# held-out split.
for name, model in models:
    model.fit(X_train, y_train)

    # File name is the display name lower-cased with spaces turned into
    # underscores, e.g. 'Random Forest' -> 'random_forest_model.pkl'.
    joblib.dump(model, f'model1/{name.replace(" ", "_").lower()}_model.pkl')

    # Held-out predictions feed both metrics reported below.
    y_pred = model.predict(X_test)
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    print(f"{name} - MSE: {mse}, R2: {r2}")


Linear Regression - MSE: 6815754155.058177, R2: 0.0804615757978091
Random Forest - MSE: 205550376.55959725, R2: 0.9722684438059568
Gradient Boost - MSE: 1296589446.4300666, R2: 0.8250723560029407
XGBoost - MSE: 268816032.61938447, R2: 0.9637330418011529
KNN - MSE: 345863849.3648106, R2: 0.9533382378826856
Decision Tree - MSE: 353911780.6526941, R2: 0.9522524619162717
Bagging Regressor - MSE: 206131736.16847897, R2: 0.972190010446055
