In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print each file's full path, one per line.
for folder, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/crop-recommendation-dataset/Crop_Recommendation.csv


In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tabulate import tabulate
import xgboost as xgb  # Uncomment if you have XGBoost installed

# Load dataset
data = pd.read_csv('/kaggle/input/crop-recommendation-dataset/Crop_Recommendation.csv')

# Data preprocessing: select the feature columns and the target column
X = data[['Nitrogen', 'Phosphorus', 'Potassium', 'Temperature', 'Humidity', 'pH_Value', 'Rainfall']]
y = data['Crop']

# Encode target labels as integers (label_encoder.classes_ maps them back to crop names)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split BEFORE scaling. Fitting the scaler on the full dataset would let the
# test rows influence the mean/std used to transform the training rows
# (data leakage), which makes the held-out scores optimistic.
# The split indices depend only on the number of rows and random_state, so the
# same rows end up in train/test as when the pre-scaled matrix was split.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Scale the features: learn mean/std from the training rows only, then apply
# that same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix transformed with the train-fitted scaler. Kept so that any
# later cell referring to X_scaled keeps working; do not use it for evaluation.
X_scaled = scaler.transform(X)

def compute_metrics(y_true, y_pred):
    """Return (accuracy, precision, recall, f1) for a set of predictions.

    Precision, recall and F1 are averaged over classes with
    ``average='weighted'``, i.e. weighted by the number of true instances
    of each class.

    Parameters
    ----------
    y_true : array-like
        Ground-truth encoded labels.
    y_pred : array-like
        Predicted encoded labels, same length as ``y_true``.

    Returns
    -------
    tuple of float
        ``(accuracy, precision, recall, f1)``.
    """
    return (
        accuracy_score(y_true, y_pred),
        precision_score(y_true, y_pred, average='weighted'),
        recall_score(y_true, y_pred, average='weighted'),
        f1_score(y_true, y_pred, average='weighted'),
    )


# List of result rows, one per model: [name, accuracy, precision, recall, f1]
results = []

# Fitted (and, where applicable, tuned) estimator for each model name,
# so the voting ensemble below can reuse the selected hyperparameters.
best_estimators = {}

# Define models with hyperparameter tuning for some
models = {
    'Naive Bayes': GaussianNB(),
    'Logistic Regression': LogisticRegression(max_iter=300),
    'Random Forest': RandomForestClassifier(n_estimators=200, max_depth=10, random_state=42),
    'Decision Tree (Entropy)': DecisionTreeClassifier(criterion='entropy', max_depth=10, random_state=42),
    'Decision Tree (Gini Index)': DecisionTreeClassifier(criterion='gini', max_depth=10, random_state=42),
    # probability=True is required so the SVC exposes predict_proba for soft voting
    'Support Vector Classifier': SVC(C=1, kernel='rbf', gamma='scale', probability=True, random_state=42),
    'K-Nearest Neighbors': KNeighborsClassifier(n_neighbors=5),
    # Uncomment the line below if you have XGBoost installed
    # 'XGBoost': xgb.XGBClassifier(objective='multi:softmax', num_class=len(label_encoder.classes_), random_state=42),
}

# Hyperparameter grids; models without an entry here are fitted as defined above
grid_params = {
    'Logistic Regression': {'max_iter': [300, 500]},
    'Random Forest': {'n_estimators': [100, 200, 300], 'max_depth': [10, 15]},
    'Support Vector Classifier': {'C': [0.5, 1, 10], 'kernel': ['linear', 'rbf'], 'gamma': ['scale', 'auto']},
    'K-Nearest Neighbors': {'n_neighbors': [3, 5, 7]},
    # Uncomment the line below if you have XGBoost installed
    # 'XGBoost': {'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 6, 10], 'n_estimators': [100, 200]},
}

# Training, tuning, and evaluating models
for model_name, model in models.items():
    # Use GridSearchCV if model has hyperparameters to tune
    if model_name in grid_params:
        grid = GridSearchCV(model, grid_params[model_name], cv=5, scoring='accuracy', n_jobs=-1)
        grid.fit(X_train, y_train)
        # best_estimator_ is already refit on the whole training set
        best_model = grid.best_estimator_
    else:
        best_model = model
        best_model.fit(X_train, y_train)

    best_estimators[model_name] = best_model

    # Making predictions and evaluating performance on the held-out split
    y_pred = best_model.predict(X_test)
    accuracy, precision, recall, f1 = compute_metrics(y_test, y_pred)

    # Appending the results to the list
    results.append([model_name, accuracy, precision, recall, f1])

# Adding a Voting Classifier (Soft Voting).
# The ensemble members are the estimators selected above rather than a second,
# hard-coded set of hyperparameters, so the ensemble stays consistent with the
# grid search. VotingClassifier clones its estimators when fitting, so the
# standalone models stored in best_estimators are not modified.
voting_clf = VotingClassifier(estimators=[
    ('LogReg', best_estimators['Logistic Regression']),
    ('RandForest', best_estimators['Random Forest']),
    ('SVC', best_estimators['Support Vector Classifier'])
], voting='soft')

voting_clf.fit(X_train, y_train)
y_pred = voting_clf.predict(X_test)

# Evaluating the Voting Classifier
accuracy, precision, recall, f1 = compute_metrics(y_test, y_pred)
results.append(['Voting Classifier', accuracy, precision, recall, f1])

# Display results in tabular format
print(tabulate(results, headers=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'], tablefmt='grid'))


+----------------------------+------------+-------------+----------+------------+
| Model                      |   Accuracy |   Precision |   Recall |   F1 Score |
| Naive Bayes                |   0.995455 |    0.995818 | 0.995455 |   0.995423 |
+----------------------------+------------+-------------+----------+------------+
| Logistic Regression        |   0.963636 |    0.964442 | 0.963636 |   0.963512 |
+----------------------------+------------+-------------+----------+------------+
| Random Forest              |   0.993182 |    0.993735 | 0.993182 |   0.993175 |
+----------------------------+------------+-------------+----------+------------+
| Decision Tree (Entropy)    |   0.979545 |    0.980288 | 0.979545 |   0.979379 |
+----------------------------+------------+-------------+----------+------------+
| Decision Tree (Gini Index) |   0.986364 |    0.986806 | 0.986364 |   0.986315 |
+----------------------------+------------+-------------+----------+------------+
| Support Vector