In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import pickle

In [2]:
# Load the cleaned dataset. Forward slashes keep the relative path portable:
# they are accepted on Windows as well as Linux/macOS, whereas a
# backslash-separated path is treated as one literal filename outside Windows.
df = pd.read_csv('./datasets/cleaned_output_data.csv')

# Display the first few rows to check the data
print(df.head())

FileNotFoundError: [Errno 2] No such file or directory: '.\\datasets\\cleaned_output_data.csv'

In [81]:
# Discard every row that contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [82]:
# Display the DataFrame (the cell's last expression is rendered as its output).
df

Unnamed: 0,Dissolved_Oxygen,Temperature,pH,Plant_Type
0,4.4,22.9,7.5,"Tropical, Neutral Lake with Moderate Oxygen"
1,4.4,22.9,7.5,"Tropical, Neutral Lake with Moderate Oxygen"
2,4.4,22.9,7.5,"Tropical, Neutral Lake with Moderate Oxygen"
3,4.4,22.9,7.5,"Tropical, Neutral Lake with Moderate Oxygen"
4,8.0,10.0,7.0,"Cool, Neutral Stream with High Oxygen"
...,...,...,...,...
4991,5.5,22.0,7.0,"Tropical, Neutral Lake with Moderate Oxygen"
4992,5.5,23.0,7.0,"Tropical, Neutral Lake with Moderate Oxygen"
4993,3.9,21.0,6.3,"Warm, Slightly Acidic Pond with Low to Moderat..."
4994,4.6,18.0,7.0,"Temperate, Slightly Alkaline Lake with Low Oxygen"


In [83]:
# Feature matrix: the three water-quality measurements.
X = df.loc[:, ['Dissolved_Oxygen', 'Temperature', 'pH']]
# Target vector: the plant-type label.
y = df.loc[:, 'Plant_Type']

# Report the number of missing values per column.
print(df.isna().sum())

Dissolved_Oxygen    0
Temperature         0
pH                  0
Plant_Type          0
dtype: int64


In [84]:
# Fit the encoder on the target labels, then map each label to its integer code.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

In [85]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=42,
)

In [86]:
# Random forest of 100 trees, seeded for repeatable results.
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [87]:
# Predict on the training data
# Score the model on the same rows it was fitted on.
y_train_pred = model.predict(X_train)
train_accuracy = accuracy_score(y_true=y_train, y_pred=y_train_pred)
print("Training Accuracy Score:", train_accuracy)

Training Accuracy Score: 1.0


In [88]:
# Predict on the held-out test rows.
y_pred = model.predict(X_test)

# Evaluate the performance.
# `labels` is passed explicitly so the integer codes always line up with
# `le.classes_`: the split is not stratified, so a rare class could be absent
# from the test set, in which case `target_names` alone would raise a
# ValueError because the number of names and observed classes differ.
all_class_codes = list(range(len(le.classes_)))
print("Accuracy Score:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(
        y_test,
        y_pred,
        labels=all_class_codes,
        target_names=le.classes_,
    ),
)

Accuracy Score: 1.0

Classification Report:
                                                           precision    recall  f1-score   support

              Cold, Alkaline Stream with Very Low Oxygen       1.00      1.00      1.00         7
                Cold, Neutral River with Moderate Oxygen       1.00      1.00      1.00        81
                 Cool, Acidic River with Moderate Oxygen       1.00      1.00      1.00        50
                   Cool, Neutral Stream with High Oxygen       1.00      1.00      1.00       492
                      Hot, Neutral Pond with High Oxygen       1.00      1.00      1.00       214
Temperate, Slightly Acidic Wetland with Very High Oxygen       1.00      1.00      1.00        51
       Temperate, Slightly Alkaline Lake with Low Oxygen       1.00      1.00      1.00       220
             Tropical, Neutral Lake with Moderate Oxygen       1.00      1.00      1.00       212
  Warm, Slightly Acidic Pond with Low to Moderate Oxygen       1.00     

In [89]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation of the classifier over the full dataset.
cv_scores = cross_val_score(estimator=model, X=X, y=y_encoded, cv=5)

# Per-fold scores followed by their average.
print("Cross-Validation Scores:", cv_scores)
print("Mean Cross-Validation Score:", cv_scores.mean())

Cross-Validation Scores: [1.         1.         1.         0.998999   0.99399399]
Mean Cross-Validation Score: 0.9985985985985986


In [90]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions: each row is predicted by a model that did not see it.
y_pred_cv = cross_val_predict(model, X, y_encoded, cv=5)

# Confusion matrix plus support-weighted precision, recall and F1.
conf_matrix = confusion_matrix(y_encoded, y_pred_cv)
precision, recall, f1 = (
    metric(y_encoded, y_pred_cv, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print("Confusion Matrix:\n", conf_matrix)
print("Precision Score:", precision)
print("Recall Score:", recall)
print("F1 Score:", f1)

Confusion Matrix:
 [[  21    0    0    0    0    0    0    0    0    0]
 [   0  289    0    0    0    0    6    0    0    0]
 [   0    0  137    0    0    0    0    0    0    0]
 [   0    0    0 1616    0    0    0    0    0    0]
 [   0    0    0    0  738    0    0    0    0    0]
 [   0    0    0    0    0  189    0    0    0    0]
 [   0    0    0    0    0    0  729    0    0    0]
 [   0    0    0    0    0    0    0  755    0    0]
 [   0    0    0    0    0    0    0    1  163    0]
 [   0    0    0    0    0    0    0    0    0  352]]
Precision Score: 0.9986089476267332
Recall Score: 0.9985988791032826
F1 Score: 0.9985949309485299


In [93]:
def predict_plant_type(Dissolved_Oxygen, Temperature, pH):
    """
    Predict the plant type from dissolved oxygen, temperature, and pH.

    Each input is first checked against a fixed valid range. No scaling is
    applied: the values are handed to the trained model unchanged.

    Parameters
    ----------
    Dissolved_Oxygen : float
        Dissolved oxygen; must lie in [1.0, 15.0] (mg/L).
    Temperature : float
        Temperature; must lie in [5, 30] (°C).
    pH : float
        pH value; must lie in [4.0, 9.5].

    Returns
    -------
    str
        The decoded plant-type label on success, or an 'Invalid ...' message
        naming the first input that failed validation.
    """
    # Validate the input ranges
    if not (1.0 <= Dissolved_Oxygen <= 15.0):
        return 'Invalid Dissolved Oxygen value. Must be between 1 and 15 mg/L.'
    if not (5 <= Temperature <= 30):
        return 'Invalid Temperature value. Must be between 5°C and 30°C.'
    if not (4.0 <= pH <= 9.5):
        return 'Invalid pH value. Must be between 4.0 and 9.5.'

    # The model was fitted on a DataFrame with named columns, so predict from a
    # one-row frame carrying the same names in the same order. A bare nested
    # list makes scikit-learn warn about missing feature names and silently
    # depends on positional column order.
    features = pd.DataFrame(
        [[Dissolved_Oxygen, Temperature, pH]],
        columns=['Dissolved_Oxygen', 'Temperature', 'pH'],
    )
    prediction_encoded = model.predict(features)
    # Convert numeric prediction back to original label
    predicted_label = le.inverse_transform(prediction_encoded)
    return predicted_label[0]

# Try the predictor on one in-range sample reading.
Dissolved_Oxygen, Temperature, pH = 12, 11, 7.2

predicted_plant_type = predict_plant_type(
    Dissolved_Oxygen=Dissolved_Oxygen,
    Temperature=Temperature,
    pH=pH,
)
print(f"Predicted plant type: {predicted_plant_type}")


Predicted plant type: Cool, Neutral Stream with High Oxygen




In [94]:
# Persist the trained classifier.
with open('hydrosphere_model.pickle','wb') as f:
    pickle.dump(model,f)

# The classifier predicts integer class codes, so the fitted LabelEncoder is
# saved as well; without it a consumer of the pickled model cannot map
# predictions back to plant-type names.
# NOTE: only unpickle these files from a trusted source — pickle.load can
# execute arbitrary code.
with open('label_encoder.pickle', 'wb') as f:
    pickle.dump(le, f)

In [95]:
import json

# Record the lower-cased feature column names alongside the model.
columns = {'data_columns': [name.lower() for name in X.columns]}

with open("columns.json", "w") as columns_file:
    json.dump(columns, columns_file)