In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import accuracy_score, mean_squared_error, r2_score
from sklearn.impute import SimpleImputer

# Load dataset
data = pd.read_csv("water_potability.csv")
print(data.head())

# Separate features and target
X = data.drop("Potability", axis=1)
y = data["Potability"]

# --- Impute missing values (replace NaN with mean) ---
imputer = SimpleImputer(strategy="mean")
X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

# Split data into train & test
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=2
)

# --- Decision Tree Classifier ---
dt = DecisionTreeClassifier(criterion="entropy", random_state=42)
dt.fit(x_train, y_train)
y_dt_pred = dt.predict(x_test)

accuracy_dt = accuracy_score(y_test, y_dt_pred)
print("Decision Tree Accuracy:", accuracy_dt)

# --- Linear Regression (comparison only) ---
lrr = LinearRegression()
lrr.fit(x_train, y_train)
y_lrr_pred = lrr.predict(x_test)

mse = mean_squared_error(y_test, y_lrr_pred)
r2 = r2_score(y_test, y_lrr_pred)

print("Linear Regression MSE:", mse)
print("Linear Regression R2:", r2)


         ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0       NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1  3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2  8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
3  8.316766  214.373394  22018.417441     8.059332  356.886136    363.266516   
4  9.092223  181.101509  17978.986339     6.546600  310.135738    398.410813   

   Organic_carbon  Trihalomethanes  Turbidity  Potability  
0       10.379783        86.990970   2.963135           0  
1       15.180013        56.329076   4.500656           0  
2       16.868637        66.420093   3.055934           0  
3       18.436524       100.341674   4.628771           0  
4       11.558279        31.997993   4.075075           0  
Decision Tree Accuracy: 0.6082317073170732
Linear Regression MSE: 0.23453646592610178
Linear Regression R2: -0.005684438872198161
