In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [3]:
# Set the number of rows to generate (100,000 rows)
num_rows = 100000

# Generate synthetic data for DRASTIC parameters
depth_to_water = np.random.uniform(5, 50, num_rows)  # Depth to water table in meters
net_recharge = np.random.uniform(50, 300, num_rows)  # Net recharge in mm/year
aquifer_media = np.random.choice([1, 2, 3, 4, 5], num_rows)  # Categorical values for aquifer media
soil_media = np.random.choice([1, 2, 3, 4], num_rows)  # Categorical values for soil media
topography = np.random.uniform(0, 10, num_rows)  # Slope percentage
vadose_zone = np.random.choice([1, 2, 3, 4, 5], num_rows)  # Categorical values for vadose zone
hydraulic_conductivity = np.random.uniform(0.01, 0.1, num_rows)  # Conductivity in m/s


In [4]:
# Calculate the DRASTIC index with weighted parameters
drastic_index = (depth_to_water * 5 +
                 net_recharge * 4 +
                 aquifer_media * 3 +
                 soil_media * 2 +
                 topography * 1 +
                 vadose_zone * 5 +
                 hydraulic_conductivity * 3)

# Normalize the DRASTIC index between 0 and 1
drastic_index = (drastic_index - drastic_index.min()) / (drastic_index.max() - drastic_index.min())

In [5]:
# Create a DataFrame with all the parameters and the DRASTIC index
synthetic_data = pd.DataFrame({
    'Depth_to_Water': depth_to_water,
    'Net_Recharge': net_recharge,
    'Aquifer_Media': aquifer_media,
    'Soil_Media': soil_media,
    'Topography': topography,
    'Vadose_Zone': vadose_zone,
    'Hydraulic_Conductivity': hydraulic_conductivity,
    'DRASTIC_Index': drastic_index
})

# Save the synthetic data to a CSV file
synthetic_data.to_csv('synthetic_drastric_data.csv', index=False)

print("100,000 rows of synthetic data have been saved to 'synthetic_drastric_data.csv'.")

100,000 rows of synthetic data have been saved to 'synthetic_drastric_data.csv'.


In [6]:
# Step 3: Generate synthetic contamination levels based on DRASTIC values
# You can make contamination a function of DRASTIC index + some noise
contamination_level = drastic_index * 100 + np.random.normal(0, 10, num_rows)  # Add some noise
# Create a DataFrame with all the parameters, DRASTIC index, and contamination levels
synthetic_data = pd.DataFrame({
    'Depth_to_Water': depth_to_water,
    'Net_Recharge': net_recharge,
    'Aquifer_Media': aquifer_media,
    'Soil_Media': soil_media,
    'Topography': topography,
    'Vadose_Zone': vadose_zone,
    'Hydraulic_Conductivity': hydraulic_conductivity,
    'DRASTIC_Index': drastic_index,
    'Contamination_Level': contamination_level  # Target variable for regression
})
# Save the synthetic data to a CSV file (optional)
synthetic_data.to_csv('synthetic_drastric_data_with_contamination.csv', index=False)


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

# Assuming you already have the 'data' DataFrame with DRASTIC parameters and contamination levels

# Step 1: Split data into features (X) and target (y)
X = synthetic_data.drop(['DRASTIC_Index', 'Contamination_Level'], axis=1)  # Features: DRASTIC parameters
y = synthetic_data['Contamination_Level']  # Target: Contamination level

# Step 2: Split into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Initialize and train a Random Forest model
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Step 4: Make predictions on test data
y_pred = model.predict(X_test)

# Step 5: Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Step 6: Create a DataFrame with Actual vs Predicted contamination levels
comparison_df = pd.DataFrame({'Actual_Contamination': y_test, 'Predicted_Contamination': y_pred})

# Step 7: Save the comparison to a CSV file
comparison_df.to_csv('actual_vs_predicted_contamination.csv', index=False)

print("Actual vs Predicted contamination levels have been saved to 'actual_vs_predicted_contamination.csv'")

Mean Squared Error: 106.17665991520298
R-squared: 0.8424473404241759
Actual vs Predicted contamination levels have been saved to 'actual_vs_predicted_contamination.csv'


In [8]:
## Step 2: Categorize contamination levels (low, medium, high)
synthetic_data['Contamination_Category'] = pd.qcut(synthetic_data['Contamination_Level'], 3, labels=['low', 'medium', 'high'])

# Step 3: Split the data into features (X) and target (y)
X = synthetic_data.drop(['Contamination_Level', 'Contamination_Category'], axis=1)
y = synthetic_data['Contamination_Category']  # Target is the contamination category (low, medium, high)

# Step 4: Split into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [9]:
# Initialize the models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
svm_model = SVC(random_state=42)
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# Dictionary to store results for each model
results = {}

In [10]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Function to train and evaluate each model
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # Calculate classification scores
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')
    
    # Calculate errors (treating categories as numeric for error calculations)
    y_test_numeric = y_test.cat.codes  # Convert categories to numeric for error calculation
    y_pred_numeric = pd.Categorical(y_pred, categories=y_test.cat.categories).codes
    
    rmse = np.sqrt(mse)
    
    # Calculate R-squared
    r2 = r2_score(y_test_numeric, y_pred_numeric)
    
    return accuracy, f1, precision, recall, rmse, r2

# Evaluate Random Forest
results['Random Forest'] = evaluate_model(rf_model, X_train, X_test, y_train, y_test)

# Evaluate SVM
results['SVM'] = evaluate_model(svm_model, X_train, X_test, y_train, y_test)

# Evaluate Logistic Regression
results['Logistic Regression'] = evaluate_model(log_reg_model, X_train, X_test, y_train, y_test)

In [11]:
# Step 6: Print Results
print("Model Comparison:")
for model_name, (accuracy, f1, precision, recall, rmse, r2) in results.items():
    print(f"\n{model_name}:")
    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1 Score: {f1:.4f}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall: {recall:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R-squared: {r2:.4f}")

Model Comparison:

Random Forest:
  Accuracy: 0.7967
  F1 Score: 0.7965
  Precision: 0.7963
  Recall: 0.7967
  RMSE: 10.3042
  R-squared: 0.6933

SVM:
  Accuracy: 0.8008
  F1 Score: 0.8000
  Precision: 0.7994
  Recall: 0.8008
  RMSE: 10.3042
  R-squared: 0.6998

Logistic Regression:
  Accuracy: 0.8007
  F1 Score: 0.8003
  Precision: 0.7999
  Recall: 0.8007
  RMSE: 10.3042
  R-squared: 0.6998


In [12]:
import joblib
import numpy as np

# Step 1: Save models after training (assuming the models are trained)
# Note: Run this block after training models in your notebook or script.

# Uncomment these lines when saving trained models
joblib.dump(rf_model, 'random_forest_model.joblib')
joblib.dump(svm_model, 'svm_model.joblib')
joblib.dump(log_reg_model, 'logistic_regression_model.joblib')

# Step 2: Load saved models
rf_model = joblib.load('random_forest_model.joblib')
svm_model = joblib.load('svm_model.joblib')
log_reg_model = joblib.load('logistic_regression_model.joblib')

# Step 3: Function to take user input and predict vulnerability
def predict_vulnerability(depth_to_water, net_recharge, aquifer_media, soil_media, topography, vadose_zone, hydraulic_conductivity):
    # Create an input array
    input_data = np.array([[depth_to_water, net_recharge, aquifer_media, soil_media, topography, vadose_zone, hydraulic_conductivity]])

    # Predictions from each model
    rf_prediction = rf_model.predict(input_data)[0]
    svm_prediction = svm_model.predict(input_data)[0]
    log_reg_prediction = log_reg_model.predict(input_data)[0]

    # Display predictions
    print("Predictions for Groundwater Vulnerability (Low, Medium, High):")
    print(f"Random Forest Prediction: {rf_prediction}")
    print(f"SVM Prediction: {svm_prediction}")
    print(f"Logistic Regression Prediction: {log_reg_prediction}")

# Step 4: Get user input and make predictions
def main():
    print("Enter the DRASTIC parameters to predict vulnerability:")
    depth_to_water = float(input("Depth to Water (e.g., 25): "))
    net_recharge = float(input("Net Recharge (e.g., 150): "))
    aquifer_media = int(input("Aquifer Media (1-5, e.g., 3): "))
    soil_media = int(input("Soil Media (1-4, e.g., 2): "))
    topography = float(input("Topography (e.g., 5): "))
    vadose_zone = int(input("Vadose Zone (1-5, e.g., 4): "))
    hydraulic_conductivity = float(input("Hydraulic Conductivity (e.g., 0.05): "))

    # Make predictions
    predict_vulnerability(depth_to_water, net_recharge, aquifer_media, soil_media, topography, vadose_zone, hydraulic_conductivity)

# Run the main function
if __name__ == "__main__":
    main()

Enter the DRASTIC parameters to predict vulnerability:


Depth to Water (e.g., 25):  6.878527377717779
Net Recharge (e.g., 150):  168.02644995842303
Aquifer Media (1-5, e.g., 3):  4
Soil Media (1-4, e.g., 2):  3
Topography (e.g., 5):  2.396136594242184
Vadose Zone (1-5, e.g., 4):  3
Hydraulic Conductivity (e.g., 0.05):  0.07250199393968913




ValueError: X has 7 features, but RandomForestClassifier is expecting 8 features as input.

In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
import joblib

# Assuming 'data' is your DataFrame with DRASTIC parameters and contamination category
# Define features (X) and target (y), excluding 'Contamination_Category' from X
X = synthetic_data[['Depth_to_Water', 'Net_Recharge', 'Aquifer_Media', 'Soil_Media', 'Topography', 'Vadose_Zone', 'Hydraulic_Conductivity']]
y = synthetic_data['Contamination_Category']  # Target variable

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the models
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
svm_model = SVC(random_state=42)
log_reg_model = LogisticRegression(max_iter=1000, random_state=42)

# Fit each model
rf_model.fit(X_train, y_train)
svm_model.fit(X_train, y_train)
log_reg_model.fit(X_train, y_train)

# Save the retrained models for later use
joblib.dump(rf_model, 'random_forest_model.joblib')
joblib.dump(svm_model, 'svm_model.joblib')
joblib.dump(log_reg_model, 'logistic_regression_model.joblib')

['logistic_regression_model.joblib']

In [18]:
import joblib
import numpy as np

# Load the retrained models
rf_model = joblib.load('random_forest_model.joblib')
svm_model = joblib.load('svm_model.joblib')
log_reg_model = joblib.load('logistic_regression_model.joblib')

# Function to predict vulnerability based on user input (7 DRASTIC parameters)
def predict_vulnerability(depth_to_water, net_recharge, aquifer_media, soil_media, topography, vadose_zone, hydraulic_conductivity):
    # Create an input array with the 7 DRASTIC parameters
    input_data = np.array([[depth_to_water, net_recharge, aquifer_media, soil_media, topography, vadose_zone, hydraulic_conductivity]])

    # Make predictions from each model
    rf_prediction = rf_model.predict(input_data)[0]
    svm_prediction = svm_model.predict(input_data)[0]
    log_reg_prediction = log_reg_model.predict(input_data)[0]

    # Display predictions
    print("Predictions for Groundwater Vulnerability (Low, Medium, High):")
    print(f"Random Forest Prediction: {rf_prediction}")
    print(f"SVM Prediction: {svm_prediction}")
    print(f"Logistic Regression Prediction: {log_reg_prediction}")

# Main function to get user input and make predictions
def main():
    print("Enter the DRASTIC parameters to predict vulnerability:")
    depth_to_water = float(input("Depth to Water (e.g., 25): "))
    net_recharge = float(input("Net Recharge (e.g., 150): "))
    aquifer_media = int(input("Aquifer Media (1-5, e.g., 3): "))
    soil_media = int(input("Soil Media (1-4, e.g., 2): "))
    topography = float(input("Topography (e.g., 5): "))
    vadose_zone = int(input("Vadose Zone (1-5, e.g., 4): "))
    hydraulic_conductivity = float(input("Hydraulic Conductivity (e.g., 0.05): "))

    # Make predictions
    predict_vulnerability(depth_to_water, net_recharge, aquifer_media, soil_media, topography, vadose_zone, hydraulic_conductivity)

# Run the main function
if __name__ == "__main__":
    main()

Enter the DRASTIC parameters to predict vulnerability:


Depth to Water (e.g., 25):  6.878527377717779
Net Recharge (e.g., 150):  168.02644995842303
Aquifer Media (1-5, e.g., 3):  4
Soil Media (1-4, e.g., 2):  3
Topography (e.g., 5):  2.396136594242184
Vadose Zone (1-5, e.g., 4):  3
Hydraulic Conductivity (e.g., 0.05):  0.07250199393968913


Predictions for Groundwater Vulnerability (Low, Medium, High):
Random Forest Prediction: medium
SVM Prediction: medium
Logistic Regression Prediction: medium


