In [1]:
# 1. Import Libraries
import pandas as pd
import os
import pickle
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

# 2. Load the Processed Data
processed_data_dir = '../data/processed'


def _read_processed(filename):
    """Read one CSV file from ``processed_data_dir`` into a DataFrame."""
    return pd.read_csv(os.path.join(processed_data_dir, filename))


# Feature matrices stay as DataFrames; the target files are flattened to
# 1-D arrays with ``.values.ravel()`` before being passed to ``fit``.
X_train = _read_processed('X_train.csv')
X_test = _read_processed('X_test.csv')
y_train = _read_processed('y_train.csv').values.ravel()
y_test = _read_processed('y_test.csv').values.ravel()

# Recover the class names used in the classification report by fitting a
# fresh encoder on the raw 'label' column.
# NOTE(review): presumably this reproduces the encoding applied when the
# processed targets were written -- confirm against the preprocessing step.
raw_labels = pd.read_csv('../data/crop_data.csv')['label']
le = LabelEncoder()
le.fit(raw_labels)
class_names = le.classes_

# 3. Define and Train Models
def _with_scaler(classifier):
    """Wrap ``classifier`` in a pipeline that applies StandardScaler first."""
    return Pipeline([('scaler', StandardScaler()), ('clf', classifier)])


pipeline_lr = _with_scaler(LogisticRegression(random_state=42, max_iter=1000))
pipeline_rf = _with_scaler(RandomForestClassifier(random_state=42))
pipeline_svc = _with_scaler(SVC(random_state=42))

# Display names and pipelines are kept in matching order.
model_names = ['Logistic Regression', 'Random Forest', 'Support Vector Machine']
pipelines = [pipeline_lr, pipeline_rf, pipeline_svc]
trained_models = {}

# Fit each pipeline on the training split and register it under its name.
for model_label, pipe in zip(model_names, pipelines):
    print(f"--- Training {model_label} ---")
    pipe.fit(X_train, y_train)
    trained_models[model_label] = pipe
    print(f"{model_label} trained successfully.\n")

# 4. Evaluate Models
# Score every trained pipeline on the test split, print its accuracy and
# per-class report, and remember the pipeline with the highest accuracy.
# NOTE(review): the winner is picked using the same test split whose accuracy
# is then reported, so that figure is an optimistic estimate; consider a
# separate validation split or cross-validation for model selection.
best_accuracy = 0.0
best_model_name = ''
best_model_pipeline = None

for name, model in trained_models.items():
    print(f"--- Evaluating {name} ---")
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {accuracy:.4f}\n")
    print("Classification Report:")
    # NOTE(review): class_names must line up one-to-one with the labels that
    # occur in y_test / y_pred -- verify against the preprocessing step.
    print(classification_report(y_test, y_pred, target_names=class_names))

    # Accept the first model unconditionally so a pipeline is always selected,
    # even when every accuracy is 0.0; on ties the earlier model is kept.
    if best_model_pipeline is None or accuracy > best_accuracy:
        best_accuracy = accuracy
        best_model_name = name
        best_model_pipeline = model

print(f"\nBest performing model: {best_model_name} with an accuracy of {best_accuracy:.4f}")

# 5. Save the Best Model
# Serialize the selected pipeline (scaler + classifier) to disk with pickle.
# Fail loudly instead of silently writing a pickle of None when no model
# was selected.
if best_model_pipeline is None:
    raise RuntimeError("No trained model is available to save.")

models_dir = '../models'
# exist_ok=True creates the directory only when it is missing, avoiding the
# race between a separate existence check and the creation call.
os.makedirs(models_dir, exist_ok=True)

model_path = os.path.join(models_dir, 'best_model_crop.pkl')
# NOTE: pickle files can execute arbitrary code when loaded; only load this
# file from a trusted location.
with open(model_path, 'wb') as f:
    pickle.dump(best_model_pipeline, f)

print(f"Best model ({best_model_name}) saved to {model_path}")


--- Training Logistic Regression ---
Logistic Regression trained successfully.

--- Training Random Forest ---
Random Forest trained successfully.

--- Training Support Vector Machine ---
Support Vector Machine trained successfully.

--- Evaluating Logistic Regression ---
Accuracy: 0.9909

Classification Report:
              precision    recall  f1-score   support

      banana       1.00      1.00      1.00        40
   blackgram       0.97      0.95      0.96        40
    chickpea       1.00      1.00      1.00        40
 kidneybeans       1.00      1.00      1.00        40
      lentil       0.97      0.97      0.97        40
       maize       1.00      1.00      1.00        40
   mothbeans       0.97      0.97      0.97        40
    mungbean       0.98      1.00      0.99        40
  pigeonpeas       1.00      1.00      1.00        40
 pomegranate       1.00      1.00      1.00        40
        rice       1.00      1.00      1.00        40

    accuracy                        