In [191]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [192]:
df = pd.read_csv("./Datasets/stations_full.csv")

In [193]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1611 entries, 0 to 1610
Data columns (total 21 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   station_code            1611 non-null   int64  
 1   monitoring_location     1605 non-null   object 
 2   state_name              1487 non-null   object 
 3   temp_min                1604 non-null   float64
 4   temp_max                1603 non-null   float64
 5   do_min                  1603 non-null   float64
 6   do_max                  1603 non-null   float64
 7   ph_min                  1603 non-null   float64
 8   ph_max                  1603 non-null   float64
 9   conductivity_min        1603 non-null   float64
 10  conductivity_max        1603 non-null   float64
 11  bod_min                 1603 non-null   float64
 12  bod_max                 1603 non-null   float64
 13  nitrate_min             1555 non-null   float64
 14  nitrate_max             1555 non-null   

In [194]:
df.describe()

Unnamed: 0,station_code,temp_min,temp_max,do_min,do_max,ph_min,ph_max,conductivity_min,conductivity_max,bod_min,bod_max,nitrate_min,nitrate_max,fecal_coliform_min,fecal_coliform_max,total_coliform_min,total_coliform_max,fecal_streptococci_min,fecal_streptococci_max
count,1611.0,1604.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1603.0,1555.0,1555.0,1427.0,1426.0,1387.0,1387.0,843.0,842.0
mean,4530.629423,21.865711,27.234747,5.914348,7.925452,12.796457,20.944404,282.973674,1163.535434,196.846394,671.600119,372.19555,13378.43,4010.604,78789.1,6287.742,359522.9,115.423013,1369.296437
std,5972.035952,108.908983,6.240395,2.11129,1.913152,43.723717,104.006886,897.563781,4121.771732,5204.456361,15444.5135,6766.65152,371210.2,60091.06,917688.5,89055.03,5335435.0,770.670783,19745.649075
min,1.0,0.3,1.1,0.3,0.3,1.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,1.8,2.0
25%,1819.5,15.0,24.0,5.2,7.1,7.0,7.9,115.0,245.0,1.0,1.7,0.3,0.665,2.0,30.0,33.0,170.0,2.0,2.0
50%,2953.0,19.0,29.0,6.3,7.9,7.2,8.2,199.0,441.0,1.1,2.6,0.32,1.43,27.0,210.0,170.0,1300.0,2.0,6.0
75%,4427.5,22.0,31.1,7.1,9.0,7.5,8.4,315.5,769.0,2.0,3.8,0.62,3.38,350.0,2300.0,1100.0,4900.0,17.0,217.5
max,30089.0,3836.0,39.0,28.2,28.0,755.0,1878.0,34400.0,54200.0,170000.0,470000.0,230000.0,14000000.0,2200000.0,24000000.0,3200000.0,160000000.0,17000.0,540000.0


In [195]:
df.isna().sum()

station_code                0
monitoring_location         6
state_name                124
temp_min                    7
temp_max                    8
do_min                      8
do_max                      8
ph_min                      8
ph_max                      8
conductivity_min            8
conductivity_max            8
bod_min                     8
bod_max                     8
nitrate_min                56
nitrate_max                56
fecal_coliform_min        184
fecal_coliform_max        185
total_coliform_min        224
total_coliform_max        224
fecal_streptococci_min    768
fecal_streptococci_max    769
dtype: int64

In [196]:
categorical_cols = ['monitoring_location', 'state_name']
numerical_cols = []

In [197]:
for col in df.columns:
    if col not in categorical_cols and col != 'station_code':
        numerical_cols.append(col)

In [198]:
print(categorical_cols)
print(numerical_cols)

['monitoring_location', 'state_name']
['temp_min', 'temp_max', 'do_min', 'do_max', 'ph_min', 'ph_max', 'conductivity_min', 'conductivity_max', 'bod_min', 'bod_max', 'nitrate_min', 'nitrate_max', 'fecal_coliform_min', 'fecal_coliform_max', 'total_coliform_min', 'total_coliform_max', 'fecal_streptococci_min', 'fecal_streptococci_max']


In [199]:
df_cleaned = df.dropna(subset=['state_name', 'monitoring_location'])

In [200]:
df_cleaned = df_cleaned.drop_duplicates()

In [201]:
si =SimpleImputer(strategy="mean")

In [202]:
df_impute = si.fit_transform(df[numerical_cols])

In [203]:
df_impute = pd.DataFrame(df_impute, columns=numerical_cols)

In [204]:
df_impute.isna().sum()

temp_min                  0
temp_max                  0
do_min                    0
do_max                    0
ph_min                    0
ph_max                    0
conductivity_min          0
conductivity_max          0
bod_min                   0
bod_max                   0
nitrate_min               0
nitrate_max               0
fecal_coliform_min        0
fecal_coliform_max        0
total_coliform_min        0
total_coliform_max        0
fecal_streptococci_min    0
fecal_streptococci_max    0
dtype: int64

In [205]:
Q1 = df_impute.quantile(0.25)
Q3 = df_impute.quantile(0.75)
IQR = Q3 - Q1

outliers = ((df_impute < (Q1 - 1.5 * IQR)) | (df_impute > (Q3 + 1.5 * IQR)))
outlier_count = outliers.sum().sort_values(ascending=False)
outlier_count

fecal_coliform_max        345
fecal_coliform_min        294
total_coliform_max        278
nitrate_min               237
bod_max                   220
nitrate_max               209
conductivity_max          162
bod_min                   142
conductivity_min          137
do_min                    127
total_coliform_min        104
ph_min                     92
ph_max                     88
do_max                     84
temp_max                   80
temp_min                   56
fecal_streptococci_min     33
fecal_streptococci_max     23
dtype: int64

In [206]:
df_capped = df_impute.copy()
for col in df_capped.columns:
    lower = Q1[col] - 1.5 * IQR[col]
    upper = Q3[col] + 1.5 * IQR[col]
    df_capped[col] = np.clip(df_capped[col], lower, upper)

In [207]:
Q1 = df_capped.quantile(0.25)
Q3 = df_capped.quantile(0.75)
IQR = Q3 - Q1

outliers = ((df_capped < (Q1 - 1.5 * IQR)) | (df_capped > (Q3 + 1.5 * IQR)))
outlier_count = outliers.sum().sort_values(ascending=False)
outlier_count

temp_min                  0
temp_max                  0
do_min                    0
do_max                    0
ph_min                    0
ph_max                    0
conductivity_min          0
conductivity_max          0
bod_min                   0
bod_max                   0
nitrate_min               0
nitrate_max               0
fecal_coliform_min        0
fecal_coliform_max        0
total_coliform_min        0
total_coliform_max        0
fecal_streptococci_min    0
fecal_streptococci_max    0
dtype: int64

In [208]:
from sklearn.preprocessing import OneHotEncoder

In [209]:
ohe = OneHotEncoder(drop='first', handle_unknown='ignore', sparse_output=False)

In [210]:
df_encode = ohe.fit_transform(df[categorical_cols])

In [211]:
df_encode

array([[0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [212]:
df_encode = pd.DataFrame(df_encode, columns=ohe.get_feature_names_out(categorical_cols))

In [213]:
df_final = pd.concat([df_impute, df_encode], axis=1)

In [214]:
df_final.sample(10)

Unnamed: 0,temp_min,temp_max,do_min,do_max,ph_min,ph_max,conductivity_min,conductivity_max,bod_min,bod_max,...,state_name_TULI NAGALAND,state_name_U/S OF TLAWNG BRIDGE MIZORAM,state_name_UDALGURI ASSAM,state_name_UTTAR PRADESH,state_name_UTTARAKHAND,state_name_VALLUR (M) ANDHRA PRADESH,state_name_VENGANUR (V) ANDHRA PRADESH,state_name_WAYANAD KERALA,state_name_WEST BENGAL,state_name_nan
983,21.2,25.4,7.0,7.8,7.1,8.0,256.0,642.0,1.3,1.9,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
30,11.0,34.0,7.7,9.1,7.7,8.3,198.0,359.0,1.0,1.2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
204,3.0,10.0,7.6,8.5,7.2,8.1,88.0,221.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
149,8.4,22.8,5.3,9.4,7.6,7.9,121.0,310.0,2.0,3.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
216,12.0,23.0,9.6,11.0,7.1,8.3,70.0,154.0,1.0,1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
553,14.0,30.0,0.3,3.9,7.2,7.8,249.0,980.0,3.2,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
166,19.0,31.0,6.4,8.2,7.4,8.0,340.0,476.0,1.0,2.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
195,9.0,17.0,8.9,9.6,7.2,8.0,77.0,480.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1194,9.0,23.0,7.8,8.2,7.2,7.3,1.1,2.4,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1340,25.0,27.0,7.1,7.7,7.2,8.2,772.0,950.0,2.4,2.8,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [215]:
df_final['temp_mean'] = df[['temp_min', 'temp_max']].median(axis=1)
df_final['do_mean'] = df[['do_min', 'do_max']].median(axis=1)
df_final['ph_mean'] = df[['ph_min', 'ph_max']].median(axis=1)
df_final['conductivity_mean'] = df[['conductivity_min', 'conductivity_max']].median(axis=1)
df_final['bod_mean'] = df[['bod_min', 'bod_max']].median(axis=1)
df_final['fecal_coliform_mean'] = df[['fecal_coliform_min', 'fecal_coliform_max']].median(axis=1)

In [216]:
df_final[df_final['conductivity_mean']>10000]['conductivity_mean'].count()

np.int64(19)

In [217]:
WHO_RANGES = {
    'conductivity': (20, 220),  # µS/cm
    'ph': (6.5, 8.5),            # pH
    'do': (5, 14.6),             # mg/L
    'temp': (0, 35),             # °C
    'bod': (0, 10),              # mg/L
    'fecal_coliform': (0, 500)   # MPN/100mL
}

In [218]:
for param, (min_val, max_val) in WHO_RANGES.items():
    col = f"{param}_mean"
    deviation_col = f"{param}_deviation"
    df_final[deviation_col] = df_final[col].apply(
        lambda x: 0 if min_val <= x <= max_val 
        else min_val - x if x < min_val 
        else x - max_val
    )

In [219]:
IDEAL_VALUES = {
    'conductivity': 150,  # midpoint
    'ph': 7.0,
    'do': 10.0,           # ideal DO
    'temp': 25,           # ambient ideal
    'bod': 3,             # ideal BOD
    'fecal_coliform': 0
}

In [220]:
for param, (min_val, max_val) in WHO_RANGES.items():
    col = f"{param}_mean"
    ideal = IDEAL_VALUES[param]
    Qi_col = f"{param}_Qi"
    df_final[Qi_col] = ((df_final[col] - ideal) / (max_val - ideal)) * 100
    df_final[Qi_col] = df_final[Qi_col].abs()  # keep positive

In [221]:
weights = {
    'conductivity': 1,
    'ph': 1,
    'do': 1,
    'temp': 1,
    'bod': 1,
    'fecal_coliform': 1
}

In [222]:
df_final['WQI'] = 0
total_weight = sum(weights.values())

for param in WHO_RANGES.keys():
    df_final['WQI'] += df_final[f'{param}_Qi'] * weights[param]

df_final['WQI'] = df_final['WQI'] / total_weight

In [223]:
def classify_wqi(wqi):
    if wqi <= 25:
        return 'Excellent'
    elif wqi <= 50:
        return 'Good'
    elif wqi <= 75:
        return 'Poor'
    elif wqi <= 100:
        return 'Very Poor'
    else:
        return 'Unsuitable'

df_final['WQI_Class'] = df_final['WQI'].apply(classify_wqi)

In [224]:
# [['temp_mean', 'ph_mean', 'do_mean', 'bod_mean', 'conductivity_mean', 'fecal_coliform_mean']]

In [225]:
df_final['WQI'].unique()

array([          nan,   56.31732229,   82.51656315, ...,   87.21152519,
        272.59744651, 2446.55734507])

In [226]:
df_final['WQI_Class'].value_counts(normalize=True)*100

WQI_Class
Unsuitable    60.831782
Poor          15.332092
Good          11.980137
Very Poor     10.738672
Excellent      1.117318
Name: proportion, dtype: float64

In [227]:
df_final.isna().sum()

temp_min               0
temp_max               0
do_min                 0
do_max                 0
ph_min                 0
                    ... 
temp_Qi                7
bod_Qi                 8
fecal_coliform_Qi    184
WQI                  184
WQI_Class              0
Length: 1752, dtype: int64

In [228]:
# features and label
X = df_final[['temp_mean', 'ph_mean', 'do_mean', 'bod_mean', 'conductivity_mean', 'fecal_coliform_mean']]
y = df_final['WQI_Class']

In [229]:
# encode labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y_encoded = le.fit_transform(y)

In [230]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

In [231]:
X_train_imputed = si.fit_transform(X_train)
X_test_imputed = si.transform(X_test)

In [232]:
# train SVM
svm = SVC(kernel='rbf', C=1, gamma='scale')
svm.fit(X_train_imputed, y_train)

0,1,2
,C,1
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [233]:
y_pred = svm.predict(X_test_imputed)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average='weighted')
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
confusion = confusion_matrix(y_test, y_pred)

print(f'Accuracy Score: {acc}\n Precision Score: {precision_score}\n Recall Score: {recall}')

Accuracy Score: 0.6037151702786377
 Precision Score: <function precision_score at 0x7f81bf6e4900>
 Recall Score: 0.6037151702786377


  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
