In [1]:
# ## 1. Import Necessary Libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ## 2. Load and Preprocess the Data
# Load the dataset from the CSV file
df = pd.read_csv('water_potability.csv')

# Handle missing values by filling them with the mean of their respective columns
# This is a common strategy to ensure the model has complete data to train on.
df['ph'].fillna(df['ph'].mean(), inplace=True)
df['Sulfate'].fillna(df['Sulfate'].mean(), inplace=True)
df['Trihalomethanes'].fillna(df['Trihalomethanes'].mean(), inplace=True)

# ## 3. Prepare Data for Modeling
# Define the features (X) and the target variable (y)
X = df.drop('Potability', axis=1)
y = df['Potability']

# Scale the features. Scaling is crucial for many ML algorithms.
# It standardizes the data to have a mean of 0 and a standard deviation of 1.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ## 4. Train the Random Forest Model
# We'll train the model on the entire dataset to capture as much information as possible.
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_scaled, y)

# ## 5. Save the Model and the Scaler
# We save the trained model and the scaler to disk.
# Our Streamlit app will load these files to make predictions.
joblib.dump(model, 'water_potability_model.joblib')
joblib.dump(scaler, 'scaler.joblib')

print("Model and scaler have been saved successfully!")
print("The files 'water_potability_model.joblib' and 'scaler.joblib' are now in your project folder.")

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['ph'].fillna(df['ph'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Sulfate'].fillna(df['Sulfate'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are sett

Model and scaler have been saved successfully!
The files 'water_potability_model.joblib' and 'scaler.joblib' are now in your project folder.
