In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import accuracy_score, r2_score
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.svm import SVC, SVR
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from xgboost import XGBClassifier, XGBRegressor

# --- Load the raw dataset and report its basic layout -----------------------
df = pd.read_csv("crop_data1.csv")
print("Dataset loaded successfully!")
print(f"Shape: {df.shape}")
print("\nColumn names:", list(df.columns))

# --- Discard every row that contains at least one missing value -------------
df = df.dropna(axis=0, how="any")
print(f"\nRemoved missing values. Remaining rows: {df.shape[0]}")

# --- Turn text (object-dtype) columns into integer codes --------------------
# A fresh LabelEncoder is fitted per column; after the loop `le` refers to the
# encoder of the last object column that was processed.
categorical_columns = df.select_dtypes(include=['object']).columns
for col in categorical_columns:
    le = LabelEncoder()
    le.fit(df[col])
    df[col] = le.transform(df[col])
print("\nEncoded categorical columns automatically.")

# Columns holding numeric data at this point (not consumed by the steps below).
numeric_cols = df.select_dtypes(include=[np.number]).columns

# The last column of the frame is treated as the prediction target, whatever
# the total number of columns is. A one-element Index (rather than a scalar)
# keeps `y` a DataFrame.
possible_targets = df.columns[-1:]

print(f"\nDetected possible target column(s): {list(possible_targets)}")

# Features are everything except the target column.
X = df.drop(columns=possible_targets)
y = df[possible_targets]

# Heuristic: a target with few distinct values, or with any integer dtype
# (e.g. label-encoded classes), is handled as a classification problem.
# `is_integer_dtype` is used instead of comparing against the string 'int',
# whose meaning depends on the platform's default integer width and can
# therefore miss an int64 label column.
is_classification = False
for col in y.columns:
    few_distinct_values = y[col].nunique() < 20
    if few_distinct_values or pd.api.types.is_integer_dtype(y[col]):
        is_classification = True

task_type = "Classification" if is_classification else "Regression"
print(f"\nDetected task type: {task_type}")

# Split BEFORE scaling so that the held-out rows never influence the scaler's
# mean/variance (fitting on the full data leaks test-set statistics into
# training and makes the reported scores optimistic).
# random_state=42 keeps the 80/20 split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Learn the scaling parameters from the training rows only, then apply the
# same transformation to the test rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# `X` keeps holding the scaled version of the full feature matrix, now
# expressed with the training-set statistics.
X = scaler.transform(X)

# Candidate estimators, keyed by the display name used in the score report.
# Which family is built depends on the task type detected earlier.
if is_classification:
    models = {
        "Random Forest": RandomForestClassifier(random_state=42),
        # max_iter raised to 1000 to give the solver room to converge.
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "SVM": SVC(),
        "KNN": KNeighborsClassifier(),
        # `use_label_encoder` is intentionally not passed: XGBoost reports it
        # as an unused parameter and emits a warning when it is supplied.
        # NOTE(review): 'logloss' is the binary log-loss metric; for a
        # multi-class target 'mlogloss' is presumably what is wanted. It only
        # matters if an eval_set is ever passed to fit() — confirm.
        "XGBoost": XGBClassifier(eval_metric='logloss')
    }
else:
    models = {
        "Random Forest": RandomForestRegressor(random_state=42),
        "Linear Regression": LinearRegression(),
        "SVR": SVR(),
        "KNN": KNeighborsRegressor(),
        "XGBoost": XGBRegressor(eval_metric='rmse')
    }

# Train every candidate model, score it on the held-out split, and report the
# best one (accuracy for classification, R² for regression).

# A single-column target is flattened to 1-D once, up front; the value does
# not change between models.
fit_target = y_train.values.ravel() if y.shape[1] == 1 else y_train

scores = {}
for name in models:
    model = models[name]
    model.fit(X_train, fit_target)
    preds = model.predict(X_test)

    if not is_classification:
        score = r2_score(y_test, preds)
        print(f"{name} R² Score: {score:.4f}")
    else:
        score = accuracy_score(y_test, preds)
        print(f"{name} Accuracy: {score:.4f}")

    scores[name] = score

# Name of the model with the highest score (first one wins on ties).
best_model = max(scores, key=lambda model_name: scores[model_name])
print(f"\nBest Model: {best_model}")
print(f"Best {('Accuracy' if is_classification else 'R² Score')}: {scores[best_model]:.4f}")
print("\nTraining complete. Ready for predictions!")


Dataset loaded successfully!
Shape: (2200, 8)

Column names: ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall', 'label']

Removed missing values. Remaining rows: 2200

Encoded categorical columns automatically.

Detected possible target column(s): ['label']

Detected task type: Classification
Random Forest Accuracy: 0.9932
Logistic Regression Accuracy: 0.9636
SVM Accuracy: 0.9682
KNN Accuracy: 0.9568


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost Accuracy: 0.9864

Best Model: Random Forest
Best Accuracy: 0.9932

Training complete. Ready for predictions!
