In [97]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

# Load only the identifier columns and the min/max measurement columns that
# the following cells use from the cleaned stations file.
df = pd.read_csv(
    'stations_cleaned.csv',
    usecols=[
        # station identity
        'station_code', 'monitoring_location', 'state_name',
        # min/max measurement pairs
        'temp_min', 'temp_max',
        'do_min', 'do_max',
        'ph_min', 'ph_max',
        'conductivity_min', 'conductivity_max',
        'bod_min', 'bod_max',
        'nitrate_min', 'nitrate_max',
        'fecal_coliform_min', 'fecal_coliform_max',
        'total_coliform_min', 'total_coliform_max',
    ],
)

In [111]:
# Read the sample extract with every column. No other cell visible in this
# notebook references df1.
df1 = pd.read_csv(
    "./stations_cleaned_sample.csv",
)

In [98]:
# Columns compared between neighbouring rows when labelling mixing points;
# they are also the first five model features, so the order is kept fixed.
pollutant_cols = [
    'bod_max',
    'nitrate_max',
    'fecal_coliform_max',
    'ph_max',
    'conductivity_max',
]

In [99]:
# Label "pollution mixing points": a row is flagged (1) when the row directly
# before it, after ordering by station_code, has the same station_code and at
# least one pollutant column is more than 1.5x that previous row's value.
#
# kind='stable' keeps rows that share a station_code in their original
# relative order. The default sort algorithm does not guarantee this, so
# which row counts as "previous" would otherwise be arbitrary.
df = df.sort_values('station_code', kind='stable')

# Compare every row with its predecessor column-wise via shift() instead of a
# Python loop over df.iloc. Comparisons against NaN evaluate to False, so the
# first row (no predecessor) and rows with missing values are not flagged.
same_station_as_prev = df['station_code'].eq(df['station_code'].shift())
prev_pollutants = df[pollutant_cols].shift()
any_pollutant_jump = df[pollutant_cols].gt(prev_pollutants * 1.5).any(axis=1)

df['is_pollution_mixing_point'] = (
    same_station_as_prev & any_pollutant_jump
).astype('int64')

# NOTE(review): the classification report recorded in this notebook contains
# only class 0, which suggests station_code is unique per row in the input and
# therefore no row is ever flagged -- confirm that station_code is the
# intended grouping key for "consecutive" readings.


In [100]:
# Model inputs: the pollutant columns followed by max temperature and max
# dissolved oxygen. Target: the label column built in the previous cell.
X = df.loc[:, [*pollutant_cols, 'temp_max', 'do_max']]
y = df.loc[:, 'is_pollution_mixing_point']

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [101]:
# Random forest with 100 trees; the fixed seed keeps the fit reproducible.
clf = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
# Kept as the cell's last expression so the fitted estimator is rendered as
# the cell output.
clf.fit(X_train, y_train)

0,1,2
,n_estimators,100
,criterion,'gini'
,max_depth,
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [109]:
# Evaluate the fitted classifier on the held-out split.
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Score every row of df. The cells that build 'treatment_series' and 'output'
# read df['predicted_mixing_point'], so this assignment has to run; with it
# commented out a fresh Restart & Run All fails with a KeyError on that column.
df['predicted_mixing_point'] = clf.predict(X)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       320

    accuracy                           1.00       320
   macro avg       1.00      1.00      1.00       320
weighted avg       1.00      1.00      1.00       320



In [103]:
# Default limits used by recommend_treatment. The four '*_max' entries are
# upper limits for the column of the same name; 'ph_low'/'ph_high' bound the
# accepted range for 'ph_max'.
DEFAULT_TREATMENT_LIMITS = {
    'bod_max': 3,
    'nitrate_max': 10,
    'fecal_coliform_max': 1000,
    'conductivity_max': 2000,
    'ph_low': 6.5,
    'ph_high': 8.5,
}

# (column, treatment step) pairs, in the order the steps appear in the result.
_EXCEEDANCE_TREATMENTS = (
    ('bod_max', "Aeration/Oxidation (for BOD reduction)"),
    ('nitrate_max', "Ion exchange or biological denitrification"),
    ('fecal_coliform_max', "Chlorination or UV disinfection (for coliform removal)"),
    ('conductivity_max', "Reverse Osmosis (RO) or Electro-dialysis"),
)


def recommend_treatment(row, limits=None):
    """Return the treatment steps suggested for one station row.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series or a dict)
        Must provide 'bod_max', 'nitrate_max', 'fecal_coliform_max',
        'conductivity_max' and 'ph_max'.
    limits : dict, optional
        Overrides for individual entries of DEFAULT_TREATMENT_LIMITS. Keys
        that are not given keep their default value. With the default of
        None the function uses exactly the built-in limits.

    Returns
    -------
    str
        The applicable steps joined with " → " in a fixed order (BOD,
        nitrate, coliform, conductivity, pH), or
        "No special treatment needed" when no limit is violated. A NaN
        reading never triggers a step because comparisons with NaN are False.

    Raises
    ------
    KeyError
        If `row` lacks one of the required columns.

    NOTE(review): the default limits look like raw-unit thresholds, while the
    df displayed later in this notebook shows ph_max around 1.16 and
    conductivity around 6, i.e. presumably transformed values. Confirm that
    the limits match the scale of the data, or pass `limits`.
    """
    active = dict(DEFAULT_TREATMENT_LIMITS)
    if limits:
        active.update(limits)

    # One step per column whose value exceeds its upper limit.
    treatments = [
        step
        for column, step in _EXCEEDANCE_TREATMENTS
        if row[column] > active[column]
    ]

    # pH is the only two-sided check: outside [ph_low, ph_high] needs dosing.
    ph_value = row['ph_max']
    if ph_value < active['ph_low'] or ph_value > active['ph_high']:
        treatments.append("Chemical dosing for pH adjustment")

    return " → ".join(treatments) if treatments else "No special treatment needed"

In [None]:
# This cell reads df['predicted_mixing_point']. Create it from the fitted
# classifier when no earlier cell has done so, so the cell also works on a
# fresh kernel run instead of failing on the missing column.
if 'predicted_mixing_point' not in df.columns:
    df['predicted_mixing_point'] = clf.predict(X)

# Rows predicted as mixing points get a treatment recommendation; all other
# rows get an empty string.
df['treatment_series'] = df.apply(
    lambda row: recommend_treatment(row) if row['predicted_mixing_point'] == 1 else "",
    axis=1,
)

KeyError: 'bod_max'

In [105]:
# Display df as this cell's output to inspect the columns added by the
# preceding cells.
df

Unnamed: 0,station_code,monitoring_location,state_name,temp_min,temp_max,do_min,do_max,ph_min,ph_max,conductivity_min,...,bod_max,nitrate_min,nitrate_max,fecal_coliform_min,fecal_coliform_max,total_coliform_min,total_coliform_max,is_pollution_mixing_point,predicted_mixing_point,treatment_series
944,1,RIVER SABARMATI AT DHAROI DAM,GUJARAT,1.474144,28.0,1.335001,1.335001,1.158782,1.158782,6.329721,...,1.026672,0.799206,0.799206,7.378384,7.378384,7.378384,7.378384,0,0,
945,2,RIVER SABARMATI AT AHMEDABAD AT V.N. BRIDGE,GUJARAT,1.466077,30.0,0.262364,0.788457,1.128774,1.162283,6.548219,...,1.734031,0.290033,1.447463,7.378384,7.378384,7.378384,7.378384,0,0,
920,4,RIVER MAHI AT SEVALIA,GUJARAT,1.448822,32.0,2.140066,2.219203,1.158782,1.179052,5.575949,...,0.526589,0.349748,0.482782,2.079442,5.484797,3.135494,6.216606,0,0,
921,5,RIVER MAHI AT VASAD,GUJARAT,1.439569,33.0,2.140066,2.251292,1.155229,1.179052,5.771441,...,0.526589,0.433291,0.565772,1.098612,2.708050,3.091042,3.931826,0,0,
1006,7,RIVER NARMADA AT GARUDESHWAR,GUJARAT,1.466077,34.0,2.116256,2.240710,1.162283,1.175792,5.252273,...,0.526589,0.245008,0.472697,1.098612,3.178054,3.044522,4.553877,0,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,30085,RIVER RAVI AT MADHOPUR,PUNJAB,1.372307,24.0,2.302585,2.397895,1.158782,1.175792,5.153292,...,0.605904,0.284655,0.706084,3.637586,10.239996,5.993961,10.239996,0,0,
442,30086,"RIVER CHAMBAL AT GWALIOR ROAD BRIDGE, UDI, ETA...",UTTAR PRADESH,1.372307,31.0,2.128232,2.332144,1.128774,1.162283,5.840642,...,1.103568,0.245008,0.425678,1.098612,10.404293,1.098612,10.799596,0,0,
65,30087,"RIVER SUTLEJ U/S AT OLINDA, NEAR BHAKRA DAM, H.P.",HIMACHAL PRADESH,1.385227,23.0,2.079442,2.459589,1.158782,1.175792,5.411646,...,0.526589,0.300591,0.808329,1.098612,7.863651,2.639057,10.896758,0,0,
66,30088,"RIVER SUTLEJ D/S OLINDA, NEAR BHAKRA DAM, H.P.",HIMACHAL PRADESH,1.378870,23.0,2.091864,2.433613,1.162283,1.172487,5.384495,...,0.526589,0.325890,0.563108,1.609438,7.696667,2.708050,11.429555,0,0,


In [106]:
# Keep only the rows predicted as mixing points, reduced to the station
# identity columns and the recommended treatment.
output = df.loc[
    df['predicted_mixing_point'] == 1,
    ['station_code', 'monitoring_location', 'state_name', 'treatment_series'],
]

In [107]:
# Display the selected rows; the frame is empty when no row has
# predicted_mixing_point == 1.
output

Unnamed: 0,station_code,monitoring_location,state_name,treatment_series


In [108]:
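# Export step, currently disabled: uncomment to write the flagged stations
# and their treatment recommendations to CSV and preview the first rows.
# output.to_csv("detected_pollution_mixing_points_and_treatments.csv", index=False)
# print(output.head(10))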