In [1]:
# 1. Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import train_test_split
from sklearn import metrics

ModuleNotFoundError: No module named 'seaborn'

In [None]:
# 2. Read the soil measurements CSV into a DataFrame
crops = pd.read_csv("soil_measures.csv")

# Quick sanity checks: dimensions, per-column null counts, and the distinct
# values of the `crop` column
print("Dataset shape:", crops.shape)
print("Missing values:\n", crops.isnull().sum())
print("Unique crops:", crops["crop"].unique())

In [None]:
# 3. Separate the target column (y) from the predictor columns (X)
y = crops["crop"]
X = crops.drop(columns=["crop"])

# Hold out 20% of the rows for testing. Stratifying on y keeps the class
# proportions similar in both partitions; the fixed seed makes the split
# repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#PART A: Single Feature Analysis
# Fit one logistic regression per individual feature and record its weighted
# F1-score on the held-out test set, to compare how predictive each feature
# is on its own.

# Maps feature name -> weighted F1-score of the model trained on that feature only
feature_performance = {}

for feature in ["N", "P", "K", "ph"]:
    # `multi_class` is deprecated since scikit-learn 1.5 and scheduled for
    # removal. With the default lbfgs solver, a target with more than two
    # classes is already fit with the multinomial loss, so the argument is
    # omitted (assumes `crop` has more than two classes -- TODO confirm).
    log_reg = LogisticRegression(max_iter=500)

    # Double brackets select a one-column DataFrame (2-D) rather than a Series
    log_reg.fit(X_train[[feature]], y_train)
    y_pred = log_reg.predict(X_test[[feature]])

    # average="weighted": per-class F1 averaged with weights given by each
    # class's support in y_test
    f1 = metrics.f1_score(y_test, y_pred, average="weighted")
    feature_performance[feature] = f1
    print(f"F1-score using only {feature}: {f1:.3f}")

In [None]:
#Bar chart for single feature performance
# Bar labels and heights come from the same dict, so they stay in matching order
feature_names = list(feature_performance)
feature_scores = list(feature_performance.values())

plt.figure(figsize=(6, 4))
sns.barplot(x=feature_names, y=feature_scores)
plt.ylabel("F1-score")
plt.title("Single Feature Predictive Power (F1-score)")
plt.show()

# The feature whose single-feature model scored the highest F1
best_feature = max(feature_performance.items(), key=lambda item: item[1])[0]
print("Best single feature:", best_feature)


In [None]:
#PART B-

#MODELS
# Candidate classifiers, keyed by the display name used in printouts and plots.
# `multi_class` is no longer passed to LogisticRegression: it is deprecated
# since scikit-learn 1.5, and with the default lbfgs solver a target with more
# than two classes is already fit with the multinomial loss.
models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42)
}

# Maps model name -> weighted F1-score on the test set; starts empty here
model_performance = {}


In [None]:
# Train every candidate on all features and score it on the held-out test split
for name, model in models.items():
    # fit() returns the estimator itself, so prediction can be chained onto it
    y_pred = model.fit(X_train, y_train).predict(X_test)

    f1 = metrics.f1_score(y_test, y_pred, average="weighted")
    model_performance[name] = f1
    print(f"F1-score with {name}: {f1:.3f}")

# Compare model performance
model_names = list(model_performance)
model_scores = list(model_performance.values())

plt.figure(figsize=(6, 4))
sns.barplot(x=model_names, y=model_scores)
plt.ylabel("F1-score")
plt.title("Model Comparison (All Features)")
# Tilt the tick labels so the longer model names do not overlap
plt.xticks(rotation=20)
plt.show()

In [None]:
# Pick the model name with the highest weighted F1-score recorded in
# model_performance, then look up the corresponding fitted estimator.
best_model_name = max(model_performance, key=model_performance.get)
best_model = models[best_model_name]

# The leading character is U+2705 (check mark); it was previously stored as
# mis-decoded bytes and printed as garbage.
print(f"✅ Best model is {best_model_name} with F1-score = {model_performance[best_model_name]:.3f}")