In [27]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, classification_report, confusion_matrix

In [28]:
# Read the CSV that sits next to this notebook into a DataFrame.
df = pd.read_csv("./stations_cleaned_sample.csv")

# Keep a plain-list snapshot of the header names exactly as they were read.
originalColumns = list(df.columns)

# Display (n_rows, n_columns) of the loaded frame.
df.shape

(1611, 21)

In [29]:
# Peek at 10 randomly chosen rows. No random_state is passed, so a different
# set of rows is drawn every time this cell runs.
df.sample(n=10)

Unnamed: 0,station_code,monitoring_location,state_name,temp_min,temp_max,do_min,do_max,ph_min,ph_max,conductivity_min,...,bod_min,bod_max,nitrate_min,nitrate_max,fecal_coliform_min,fecal_coliform_max,total_coliform_min,total_coliform_max,fecal_streptococci_min,fecal_streptococci_max
1180,1303,RIVER BRAHMANI AT TALCHER U/S,ODISHA,22.0,28.0,6.8,9.0,7.1,8.2,136.0,...,1.1,2.4,0.32,0.67,130.0,1300.0,490.0,2800.0,,
464,1370,RIVER KSHIPRA AT TRIVENISANGAM (1 KM. D/S OF S...,MADHYA PRADESH,22.0,27.0,7.0,7.6,7.4,7.9,333.0,...,2.4,6.0,0.32,1.94,2.0,1600.0,210.0,1600.0,2.0,6.0
870,4723,RIVER TEESTA AT JALPESH,WEST BENGAL,11.0,29.0,6.0,8.6,6.9,7.7,52.0,...,1.0,2.8,0.3,0.57,1100.0,7000.0,2100.0,50000.0,17.0,130.0
643,2383,RIVER DAMODAR U/S JAMADOBA WATER WORKS,JHARKHAND,19.0,34.0,7.3,7.8,7.4,7.6,1.1,...,0.0,0.0,,,,,,,,
925,1531,RIVER MORA BHARALI AT TEZPUR,ASSAM ASSAM,19.0,30.0,3.8,5.4,6.9,7.4,148.0,...,3.0,3.3,0.9,2.0,360.0,940.0,1100.0,2000.0,120.0,290.0
385,2619,RIVER PABBAR D/S SWARAKUDDU,HIMACHAL PRADESH,6.3,10.0,7.7,8.9,7.0,8.4,37.0,...,1.0,1.0,0.32,1.36,7.0,47.0,26.0,350.0,2.0,2.0
779,3769,RIVER BHOGDOI (PUJADUBI GHAT) INTAKE POINT OF ...,JORHAT ASSAM,21.0,33.0,6.0,8.0,6.6,7.7,138.0,...,1.6,2.8,0.6,1.4,300.0,730.0,700.0,1100.0,110.0,280.0
1224,2158,RIVER GODAVARI AT U/S OF PAITHAN AT PAITHAN IN...,MAHARASHTRA,22.0,36.0,6.0,7.3,8.5,9.0,612.0,...,2.4,7.5,0.3,1.45,2.0,17.0,2.0,47.0,2.0,2.0
1586,2385,RIVER SUBARNAREKHA AT NAMKUM ROAD BRIDGE,JHARKHAND,11.0,25.0,4.6,6.7,6.5,6.8,2.0,...,0.0,0.0,,,,,,,,
1002,4590,RIVER NARMADA AT UP-STREAM BEFORE CONFLUENCE O...,MADHYA PRADESH,14.0,26.0,5.8,11.0,7.4,8.5,184.0,...,1.0,2.1,0.96,6.16,2.0,48.0,150.0,350.0,2.0,2.0


In [30]:
# Normalise every header: trim surrounding whitespace, lowercase it, and turn
# spaces into underscores.
df.columns = df.columns.map(
    lambda label: label.strip().lower().replace(" ", "_")
)

In [31]:
# Display the column labels of df as they stand at this point in the notebook.
df.columns

Index(['station_code', 'monitoring_location', 'state_name', 'temp_min',
       'temp_max', 'do_min', 'do_max', 'ph_min', 'ph_max', 'conductivity_min',
       'conductivity_max', 'bod_min', 'bod_max', 'nitrate_min', 'nitrate_max',
       'fecal_coliform_min', 'fecal_coliform_max', 'total_coliform_min',
       'total_coliform_max', 'fecal_streptococci_min',
       'fecal_streptococci_max'],
      dtype='object')

In [32]:
# Drop rows that are exact duplicates across all columns, keeping the first
# occurrence (the drop_duplicates defaults). Surviving rows keep their original
# index labels. Rebinding the name instead of using inplace=True avoids
# mutating the frame object in place and keeps the step chainable.
df = df.drop_duplicates()

In [33]:
# Count missing values per column: flag each cell as missing / not missing,
# then sum the flags down the rows.
df.isna().sum(axis=0)

station_code                0
monitoring_location         6
state_name                124
temp_min                    7
temp_max                    8
do_min                      8
do_max                      8
ph_min                      8
ph_max                      8
conductivity_min            8
conductivity_max            8
bod_min                     8
bod_max                     8
nitrate_min                56
nitrate_max                56
fecal_coliform_min        184
fecal_coliform_max        185
total_coliform_min        224
total_coliform_max        224
fecal_streptococci_min    766
fecal_streptococci_max    767
dtype: int64