In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, balanced_accuracy_score

# Read the soil measurements into a DataFrame
crops = pd.read_csv("soil_measures.csv")

# Name of the label column; every other column is treated as a predictor
target = 'crop'

# Report the number of null entries found in each column
missing_values = crops.isna().sum()
print("Missing values:\n", missing_values)

# List the distinct labels that occur in the label column
unique_crop_types = crops[target].unique()
print("Unique crop types:\n", unique_crop_types)

# Separate the labels from the predictors
y = crops[target]
X = crops.drop(columns=target)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Candidate predictors; each one is evaluated on its own
features = ["N", "P", "K", "ph"]

# Maps feature name -> {"F1 Score": ..., "Balanced Accuracy": ...}, both
# measured on the held-out test split
features_dict = {}

# Fit and score one single-feature classifier per candidate predictor
for feature in features:
    # Double brackets keep a one-column DataFrame (2-D) rather than a Series,
    # which is the input shape the estimator is fitted on
    X_train_feature = X_train[[feature]]
    X_test_feature = X_test[[feature]]

    # The `multi_class` argument is deprecated since scikit-learn 1.5 and is
    # scheduled for removal. With the default lbfgs solver and more than two
    # target classes the default already fits a multinomial model, so the
    # argument is omitted to avoid the FutureWarning / future TypeError.
    # NOTE(review): assumes the target has more than two classes — confirm.
    log_reg = LogisticRegression(max_iter=500)  # Increase max_iter if needed

    # Train on the single selected column only
    log_reg.fit(X_train_feature, y_train)

    # Predict labels for the held-out rows
    y_pred = log_reg.predict(X_test_feature)

    # Weighted F1 averages per-class F1 by class support; balanced accuracy
    # is the mean of per-class recall
    f1 = f1_score(y_test, y_pred, average="weighted")
    balanced_accuracy = balanced_accuracy_score(y_test, y_pred)

    # Record both metrics under this feature's name
    features_dict[feature] = {"F1 Score": f1, "Balanced Accuracy": balanced_accuracy}

# Report both recorded metrics for every evaluated feature
for feature in features_dict:
    scores = features_dict[feature]
    print(f"F1-score for {feature}: {scores['F1 Score']:.4f}")
    print(f"Balanced Accuracy for {feature}: {scores['Balanced Accuracy']:.4f}")

# Metric used to rank the features, e.g., "F1 Score"
comparison_metric = "F1 Score"


def _score_for(name):
    """Return the comparison metric stored for the feature called *name*."""
    return features_dict[name][comparison_metric]


# Pick the feature with the highest value of the comparison metric
# (on ties, the feature inserted first wins)
best_feature = max(features_dict, key=_score_for)
best_score = _score_for(best_feature)

# Single-entry mapping: best feature name -> its score
best_predictive_feature = {best_feature: best_score}

# Show the winning feature together with its score
print("Best Predictive Feature:", best_predictive_feature)

# Refit on every row of the dataset, using only the best-scoring feature.
# The `multi_class` argument is deprecated since scikit-learn 1.5 and is
# scheduled for removal; with the default lbfgs solver and more than two
# target classes the default already fits a multinomial model, so the
# argument is omitted.
# NOTE(review): assumes the target has more than two classes — confirm.
best_feature_model = LogisticRegression(max_iter=500)
best_feature_model.fit(crops[[best_feature]], crops[target])

def predict_crop(N, P, K, ph):
    """Predict a crop label from one set of soil readings.

    All four readings are accepted, but only the column named by the
    module-level ``best_feature`` is passed to ``best_feature_model``.

    Returns the first (and only) element of the model's prediction array.
    """
    # Build a one-row frame whose columns carry the same names as the inputs
    readings = {"N": N, "P": P, "K": K, "ph": ph}
    sample = pd.DataFrame([readings])

    # Keep only the column the final model was fitted on
    predicted = best_feature_model.predict(sample[[best_feature]])
    return predicted[0]

# Example usage: prompt for the four soil readings, then print the prediction
def _read_float(prompt):
    """Show *prompt* on the console and return the reply parsed as a float."""
    return float(input(prompt))


N = _read_float("Enter value for N: ")
P = _read_float("Enter value for P: ")
K = _read_float("Enter value for K: ")
ph = _read_float("Enter value for ph: ")

predicted_crop = predict_crop(N, P, K, ph)
print("Predicted crop type:", predicted_crop)
