In [1]:
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, roc_auc_score,
                             precision_recall_curve, auc, confusion_matrix)

In [2]:
import joblib

In [3]:
# Read the water-quality table from the CSV in the working directory.
df1 = pd.read_csv(filepath_or_buffer="water_quality.csv")
# Preview the first five rows as this cell's output.
df1.head(n=5)

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.99097,2.963135,0
1,3.71608,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.5466,310.135738,398.410813,11.558279,31.997993,4.075075,0


In [25]:
# Keep at most the first 3000 rows and renumber them from 0.
df1 = df1.iloc[:3000].reset_index(drop=True)

In [26]:
# Bare expression: the notebook renders df1 as this cell's output.
df1

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity,Potability
0,,204.890455,20791.318981,7.300212,368.516441,564.308654,10.379783,86.990970,2.963135,0
1,3.716080,129.422921,18630.057858,6.635246,,592.885359,15.180013,56.329076,4.500656,0
2,8.099124,224.236259,19909.541732,9.275884,,418.606213,16.868637,66.420093,3.055934,0
3,8.316766,214.373394,22018.417441,8.059332,356.886136,363.266516,18.436524,100.341674,4.628771,0
4,9.092223,181.101509,17978.986339,6.546600,310.135738,398.410813,11.558279,31.997993,4.075075,0
...,...,...,...,...,...,...,...,...,...,...
1995,,209.866095,20116.911103,6.941062,370.383694,338.957198,11.342562,51.159669,3.619546,1
1996,7.800421,210.670304,19873.300918,7.284676,,448.595641,7.024747,55.803503,3.345910,1
1997,7.264069,216.622392,19412.112723,6.932458,,301.953551,19.210991,67.712585,3.960282,1
1998,7.544306,211.051146,34359.400797,8.166793,365.812313,447.520655,18.553478,60.162746,3.714096,1


In [6]:
# Names of the input columns used as model features.
FEATURES = [
    'ph',
    'Hardness',
    'Solids',
    'Chloramines',
    'Sulfate',
    'Conductivity',
    'Organic_carbon',
    'Trihalomethanes',
    'Turbidity',
]
# Name of the label column (0 or 1).
TARGET = 'Potability'
# Seed shared by the steps that take a random_state.
RANDOM_STATE = 42
# Fraction of rows held out for testing.
TEST_SIZE = 0.2
# Intended row cap. NOTE(review): the visible cells call head(3000) directly
# and never read this constant - confirm which value is meant.
N_ROWS = 1000

In [27]:
# Number of missing values in each feature column and in the target.
print(df1.loc[:, FEATURES + [TARGET]].isna().sum())
# First 10 rows that have a missing value in at least one feature column.
print(df1.loc[df1[FEATURES].isna().any(axis="columns")].head(10))

ph                 303
Hardness             0
Solids               0
Chloramines          0
Sulfate            471
Conductivity         0
Organic_carbon       0
Trihalomethanes    100
Turbidity            0
Potability           0
dtype: int64
          ph    Hardness        Solids  Chloramines     Sulfate  Conductivity  \
0        NaN  204.890455  20791.318981     7.300212  368.516441    564.308654   
1   3.716080  129.422921  18630.057858     6.635246         NaN    592.885359   
2   8.099124  224.236259  19909.541732     9.275884         NaN    418.606213   
8        NaN  118.988579  14285.583854     7.804174  268.646941    389.375566   
11  7.974522  218.693300  18767.656682     8.110385         NaN    364.098230   
13       NaN  150.174923  27331.361962     6.838223  299.415781    379.761835   
14  7.496232  205.344982  28388.004887     5.072558         NaN    444.645352   
16  7.051786  211.049406  30980.600787    10.094796         NaN    315.141267   
18  8.975464  279.357167  19

In [28]:
# Take at most the first 3000 rows, discard every row with a missing value
# in any feature column or in the target, then renumber from 0.
df = (
    df1.iloc[:3000]
    .dropna(subset=[*FEATURES, TARGET])
    .reset_index(drop=True)
)
print("Rows after drop:", df.shape[0])
# df is the frame that the later train_test_split cell works from.

Rows after drop: 1230


In [13]:
# Feature matrix: the FEATURES columns, copied so later edits do not touch df.
X = df.loc[:, FEATURES].copy()
# Labels: the TARGET column cast to int, likewise copied.
y = df.loc[:, TARGET].astype(int).copy()

In [14]:
# Hold out TEST_SIZE of the rows for evaluation. Passing y as `stratify`
# requests a split stratified on the labels; the seed makes it repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=RANDOM_STATE,
    test_size=TEST_SIZE,
)

In [15]:
# Bare expression: the notebook renders the held-out feature rows as output.
X_test

Unnamed: 0,ph,Hardness,Solids,Chloramines,Sulfate,Conductivity,Organic_carbon,Trihalomethanes,Turbidity
307,7.815122,190.313904,20229.108150,9.186893,335.638726,379.222873,14.979348,73.425428,3.096163
931,6.948265,225.231760,6874.586016,10.116984,429.045620,386.582083,15.845193,59.474013,4.095612
1153,9.678051,188.368741,26402.703957,4.074141,395.904208,325.001116,16.917994,91.416880,3.029700
1040,6.320375,165.821545,20481.642071,7.605958,354.289035,422.341048,14.793960,46.043065,2.728617
13,6.514415,198.767351,21218.702871,8.670937,323.596349,413.290450,14.900000,79.847843,5.200885
...,...,...,...,...,...,...,...,...,...
1196,6.967697,177.055890,22886.965735,6.430935,315.282902,507.790062,15.288038,35.767601,4.819637
660,7.775386,193.077168,15704.482093,7.881197,324.336203,301.753477,13.378165,89.051957,3.309472
854,8.397248,199.495811,16772.226246,8.117218,343.403764,402.464906,17.236047,75.853549,2.835794
354,6.624573,172.055471,14877.289737,7.079934,338.441277,405.818097,15.656149,58.560531,4.333721


In [20]:
# Preprocessing and classifier chained into a single estimator, so the
# imputer and scaler are fitted only on the data passed to pipe.fit.
pipe = Pipeline([
    # Fill any missing feature values with the column median.
    ('imputer', SimpleImputer(strategy='median')),
    # Standardise the features before the linear model.
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(
        solver='saga',       # good general solver; supports l1 & l2
        penalty='l2',        # change to 'l1' only if you want L1 sparsity
        C=1.0,               # regularization strength (smaller -> stronger reg)
        class_weight='balanced',  # optional if classes imbalanced
        max_iter=10000,      # generous iteration cap for the solver
        tol=1e-4,            # stopping tolerance
        # Use the notebook-wide seed (also used for the train/test split)
        # instead of a separate literal, so the two cannot drift apart.
        random_state=RANDOM_STATE,
        verbose=0
    ))
])

In [21]:
# Fit every pipeline step on the training split; the fitted pipeline is
# returned and rendered as the cell output.
pipe.fit(X=X_train, y=y_train)

0,1,2
,steps,"[('imputer', ...), ('scaler', ...), ...]"
,transform_input,
,memory,
,verbose,False

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,copy,True
,with_mean,True
,with_std,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'saga'
,max_iter,10000


In [23]:
# Pull the classifier step out of the pipeline by its step name.
clf = pipe['clf']
# Report the solver's iteration count; prints None if the attribute is absent.
print(
    "n_iter_ (per class or overall):",
    getattr(clf, "n_iter_", None),
)

n_iter_ (per class or overall): [19]


In [24]:
# Score the pipeline fitted above (`pipe`) on the held-out split; for a
# classifier, score() returns mean accuracy. No cell in this notebook
# defines `model`, so that name would raise NameError on a fresh kernel.
pipe.score(X_test, y_test)

0.5894308943089431