In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

In [10]:
df = pd.read_csv('.\\dataset\\updated.csv')

# Display the first few rows to check the data
print(df.head())

    pH  Humidity  Air Temp                         Plant Type
0  7.0      68.0      28.0  Warm-Season Fruits and Vegetables
1  7.0      51.0      24.0  Warm-Season Fruits and Vegetables
2  7.2      45.0      24.0  Warm-Season Fruits and Vegetables
3  7.0      40.0      22.0  Warm-Season Fruits and Vegetables
4  8.0      50.0      24.0          Herbs and Aromatic Plants


In [11]:
X = df[['pH', 'Humidity', 'Air Temp']]  # Features
y = df['Plant Type']  # Target

# Check for missing data
print(df.isnull().sum())

pH            0
Humidity      0
Air Temp      0
Plant Type    0
dtype: int64


In [12]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.3, random_state=42)

In [14]:
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

In [15]:
# Predict on the training data
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training Accuracy Score:", train_accuracy)

Training Accuracy Score: 1.0


In [16]:
y_pred = model.predict(X_test)

# Evaluate the performance
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=le.classes_))

Accuracy Score: 1.0

Classification Report:
                                    precision    recall  f1-score   support

           Berries and Evergreens       1.00      1.00      1.00        30
         Drought-Resistant Plants       1.00      1.00      1.00        69
        Herbs and Aromatic Plants       1.00      1.00      1.00        61
 Leafy Greens and Root Vegetables       1.00      1.00      1.00       103
      Tropical and Wetland Plants       1.00      1.00      1.00       142
Warm-Season Fruits and Vegetables       1.00      1.00      1.00       169

                         accuracy                           1.00       574
                        macro avg       1.00      1.00      1.00       574
                     weighted avg       1.00      1.00      1.00       574



In [17]:
def predict_plant_type(ph_value, humidity, air_temp):
    if ph_value < 4.0 or ph_value > 9.5:
        return 'Invalid pH value'
    if humidity < 0 or humidity > 100:
        return 'Invalid humidity value'
    if air_temp < -10 or air_temp > 50:  # Assumed reasonable range
        return 'Invalid air temperature value'
    
    # Predict the plant type using the model
    prediction_encoded = model.predict([[ph_value, humidity, air_temp]])
    # Convert numeric prediction back to original label
    predicted_label = le.inverse_transform(prediction_encoded)
    return predicted_label[0]

# Test the function with sample inputs
ph_value = 7.1
humidity = 65
air_temp = 25
print(f"Predicted plant type: {predict_plant_type(ph_value, humidity, air_temp)}")

Predicted plant type: Warm-Season Fruits and Vegetables




In [18]:
from sklearn.model_selection import cross_val_score

# Perform cross-validation
cv_scores = cross_val_score(model, X, y_encoded, cv=5)  # 5-fold cross-validation

print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())


Cross-Validation Scores: [1.        1.        0.9947644 1.        1.       ]
Mean Cross-Validation Score: 0.9989528795811518


In [19]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

# Get predictions from cross-validation
from sklearn.model_selection import cross_val_predict

y_pred_cv = cross_val_predict(model, X, y_encoded, cv=5)

# Compute confusion matrix and additional metrics
conf_matrix = confusion_matrix(y_encoded, y_pred_cv)
precision = precision_score(y_encoded, y_pred_cv, average='weighted')
recall = recall_score(y_encoded, y_pred_cv, average='weighted')
f1 = f1_score(y_encoded, y_pred_cv, average='weighted')

print("Confusion Matrix:\n", conf_matrix)
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1)


Confusion Matrix:
 [[102   0   0   0   0   0]
 [  0 259   0   0   0   0]
 [  0   0 214   0   0   2]
 [  0   0   0 362   0   0]
 [  0   0   0   0 439   0]
 [  0   0   0   0   0 534]]
Precision Score: 0.998957877974146
Recall Score: 0.9989539748953975
F1 Score: 0.9989525198724308


In [20]:
import pickle
with open('pedosphere_model.pickle','wb') as f:
    pickle.dump(model,f)

In [21]:
import json
columns={
    'data_columns':[col.lower() for col in X.columns]
}
with open("columns.json","w")as f:
    f.write(json.dumps(columns))