# Importing libraries


In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings

: 

# Reading Dataset

In [None]:
# Load the raw water-quality readings; the file is decoded as ISO-8859-1
# and read in a single pass (low_memory=False).
df = pd.read_csv(
    'water_dataX.csv',
    encoding='ISO-8859-1',
    low_memory=False,
)

# Analyse the data

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Summary statistics for the columns pandas currently treats as describable.
df.describe()

In [None]:
# Column names, non-null counts and dtypes of the raw frame.
df.info()

In [None]:
# (rows, columns) of the raw frame.
df.shape

**Handling Missing Values**

In [None]:
# Per column: does it contain at least one missing value?
df.isnull().any()

In [None]:
# Per column: number of missing values before any type conversion.
df.isnull().sum()

In [None]:
# Inspect the dtypes as parsed by read_csv, before the numeric conversion below.
df.dtypes

In [None]:
# Measurement columns that must be numeric. Entries that cannot be parsed
# are turned into NaN (errors='coerce') so they can be imputed later.
measurement_columns = [
    'Temp',
    'D.O. (mg/l)',
    'PH',
    'B.O.D. (mg/l)',
    'CONDUCTIVITY (µmhos/cm)',
    'NITRATENAN N+ NITRITENANN (mg/l)',
    'TOTAL COLIFORM (MPN/100ml)Mean',
]
for column_name in measurement_columns:
    df[column_name] = pd.to_numeric(df[column_name], errors='coerce')
df.dtypes

In [None]:
# Missing-value counts again: coercion to numeric may have introduced new NaNs.
df.isnull().sum()

In [None]:
# Impute missing readings with the mean of their own column.
# The filled Series is assigned back to the frame instead of calling
# fillna(inplace=True) on df[col]: the in-place form operates on an
# intermediate object, emits a FutureWarning on pandas 2.x and leaves df
# unchanged once copy-on-write is active.
columns_to_impute = [
    'Temp',
    'D.O. (mg/l)',
    'PH',
    'CONDUCTIVITY (µmhos/cm)',
    'B.O.D. (mg/l)',
    'NITRATENAN N+ NITRITENANN (mg/l)',
    'TOTAL COLIFORM (MPN/100ml)Mean',
]
for column_name in columns_to_impute:
    df[column_name] = df[column_name].fillna(df[column_name].mean())

In [None]:
# Remove the fecal-coliform column from the frame (in place).
df.drop(columns=["FECAL COLIFORM (MPN/100ml)"], inplace=True)

In [None]:
# Replace the verbose source headers with short names used in the rest of
# the notebook.
short_names = {
    'D.O. (mg/l)': 'do',
    'CONDUCTIVITY (µmhos/cm)': 'co',
    'B.O.D. (mg/l)': 'bod',
    'NITRATENAN N+ NITRITENANN (mg/l)': 'na',
    'TOTAL COLIFORM (MPN/100ml)Mean': 'tc',
    'STATION CODE': 'station',
    'LOCATIONS': 'location',
    'STATE': 'state',
    'PH': 'ph',
}
df = df.rename(columns=short_names)

**Water Quality Index (WQI) Calculation**

In [None]:
#calculation of pH
def _ph_rating(x):
    """Return the 0-100 quality rating for a single pH reading.

    Bands are nested around the ideal range and checked from best to worst:
    [7, 8.5] -> 100, [6.8, 8.6] -> 80, [6.7, 8.8] -> 60, [6.5, 9] -> 40,
    anything else (including NaN) -> 0.

    Nesting the bands closes the hole the previous thresholds left open:
    readings strictly between 6.9 and 7 matched no band and were rated 0.
    Every value the old bands covered keeps its rating.
    """
    if 7 <= x <= 8.5:
        return 100
    if 6.8 <= x <= 8.6:
        return 80
    if 6.7 <= x <= 8.8:
        return 60
    if 6.5 <= x <= 9:
        return 40
    return 0


df['npH'] = df.ph.apply(_ph_rating)

In [None]:
#calculation of dissolved oxygen
def _do_rating(x):
    """Return the 0-100 quality rating for a dissolved-oxygen reading.

    x >= 6 -> 100, 5 < x < 6 -> 80, 4 < x <= 5 -> 60, 3 <= x <= 4 -> 40,
    anything else (including NaN) -> 0.

    The previous thresholds (5.1-6, 4.1-5, 3-4) left the intervals (5, 5.1)
    and (4, 4.1) uncovered, so e.g. 5.05 was rated 0. The lower bounds are
    now open-ended at 5 and 4; ratings at the old boundaries are unchanged
    (5 -> 60, 4 -> 40).
    """
    if x >= 6:
        return 100
    if x > 5:
        return 80
    if x > 4:
        return 60
    if x >= 3:
        return 40
    return 0


df['ndo'] = df.do.apply(_do_rating)

In [None]:
#calculation of total coliform
# (upper bound, rating) pairs, checked from the best band downwards.
_COLIFORM_BANDS = ((5, 100), (50, 80), (500, 60), (10000, 40))


def _coliform_rating(x):
    """Rate a total-coliform value; negatives, NaN and values above 10000 get 0."""
    if not x >= 0:
        return 0
    for upper_bound, rating in _COLIFORM_BANDS:
        if x <= upper_bound:
            return rating
    return 0


df['nco'] = df['tc'].apply(_coliform_rating)

In [None]:
#calculation of B.O.D.
def _bod_rating(x):
    """Rate a B.O.D. value: 0-3 -> 100, 3-6 -> 80, 6-80 -> 60, 80-125 -> 40, else 0."""
    if 0 <= x <= 3:
        rating = 100
    elif 3 <= x <= 6:
        rating = 80
    elif 6 <= x <= 80:
        rating = 60
    elif 80 <= x <= 125:
        rating = 40
    else:
        # Negative, NaN or above 125.
        rating = 0
    return rating


df['nbdo'] = df['bod'].apply(_bod_rating)

In [None]:
#calculation of electric conductivity
# (upper bound, rating) pairs, checked from the best band downwards.
_CONDUCTIVITY_BANDS = ((75, 100), (150, 80), (225, 60), (300, 40))


def _conductivity_rating(x):
    """Rate a conductivity value; negatives, NaN and values above 300 get 0."""
    if x >= 0:
        return next(
            (rating for upper_bound, rating in _CONDUCTIVITY_BANDS if x <= upper_bound),
            0,
        )
    return 0


df['nec'] = df['co'].apply(_conductivity_rating)

In [None]:
#calculation of nitrate
def _nitrate_rating(x):
    """Rate a nitrate value: 0-20 -> 100, 20-50 -> 80, 50-100 -> 60, 100-200 -> 40, else 0."""
    if 0 <= x <= 20:
        return 100
    if 20 < x <= 50:
        return 80
    if 50 < x <= 100:
        return 60
    if 100 < x <= 200:
        return 40
    # Negative, NaN or above 200.
    return 0


df['nna'] = df['na'].apply(_nitrate_rating)

In [None]:
#Calculation of Water Quality Index WQI
# (weighted column, rating column, weight) — each rating is scaled by its
# weight and the six weighted terms are summed into 'wqi'.
weighted_terms = [
    ('wph', 'npH', 0.165),
    ('wdo', 'ndo', 0.281),
    ('wbdo', 'nbdo', 0.234),
    ('wec', 'nec', 0.009),
    ('wna', 'nna', 0.028),
    ('wco', 'nco', 0.281),
]
for weighted_col, rating_col, weight in weighted_terms:
    df[weighted_col] = df[rating_col] * weight
# 'wqi' is added last, so it ends up as the final column of the frame.
df['wqi'] = df['wph'] + df['wdo'] + df['wbdo'] + df['wec'] + df['wna'] + df['wco']
df

In [None]:
#Calculation of overall WQI for each year
# Mean WQI per distinct value of the 'year' column.
wqi_by_year = df.groupby('year')['wqi']
average = wqi_by_year.mean()
average.head()

# Data Visualization


**Univariate analysis**


In [None]:
# Preview the frame, now including the rating, weighted and wqi columns.
df.head()

In [None]:
# Distribution plot of the Temp column -> used for checking the asymmetry of the curve.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases
# (histplot/displot are the documented replacements) — confirm the target version.
sns.distplot(df.Temp)


In [None]:
# Box plot of the Temp column. The Series is passed as x= explicitly:
# seaborn >= 0.12 binds a lone positional vector to `data` instead of `x`,
# which changes how the plot is drawn.
sns.boxplot(x=df.Temp)


In [None]:
# Number of rows per year. Passed as x= because seaborn >= 0.12 binds a
# positional vector to `data`, not to the variable being counted.
sns.countplot(x=df.year)


In [None]:
# Distribution plot of the ph column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the target version.
sns.distplot(df.ph)


In [None]:
# Number of rows per pH rating. Passed as x= because seaborn >= 0.12 binds
# a positional vector to `data`, not to the variable being counted.
sns.countplot(x=df.npH)


In [None]:
# Number of rows per B.O.D. rating. Passed as x= because seaborn >= 0.12
# binds a positional vector to `data`, not to the variable being counted.
sns.countplot(x=df.nbdo)


In [None]:
# Distribution plot of the bod column.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases — confirm the target version.
sns.distplot(df.bod)


**Bivariate analysis**


In [None]:
# Dissolved oxygen against pH. x/y are passed by keyword: seaborn >= 0.12
# accepts only `data` positionally, so two positional vectors raise TypeError.
sns.lineplot(x=df.ph, y=df.do)


In [None]:
# pH against temperature. x/y are passed by keyword: seaborn >= 0.12
# accepts only `data` positionally, so two positional vectors raise TypeError.
sns.lineplot(x=df.Temp, y=df.ph)


In [None]:
# Dissolved oxygen against temperature. x/y are passed by keyword: seaborn
# >= 0.12 accepts only `data` positionally, so two positional vectors raise
# TypeError.
sns.scatterplot(x=df.Temp, y=df.do)


**Multivariate analysis**


In [None]:
# sns.pairplot(df)

In [None]:
# One histogram per column that pandas can plot, drawn on an 8x8 inch figure.
df.hist(figsize=(8,8))

In [None]:
plt.figure(figsize=(17,8))
# Correlate the numeric columns only. At this point the frame still holds
# the location/station/state columns (they are dropped further down), and
# DataFrame.corr() raises on non-numeric columns in pandas >= 2.0 instead
# of silently skipping them.
numeric_df = df.select_dtypes(include='number')
sns.heatmap(numeric_df.corr(), annot=True)


# Model Training

**Splitting Dependent and Independent Columns**

In [None]:
# Drop the location/station/state columns before the feature matrix is built.
# (A df.head() call used to precede this line; as it was not the last
# expression of the cell its result was never displayed, so it was removed.)
df.drop(columns=['location', 'station', 'state'], inplace=True)

In [None]:
# Preview the frame after the location/station/state columns were removed.
df.head()

In [None]:
# Feature matrix: the six columns at positions 1..6 of df, as a NumPy array.
# NOTE(review): the selection is positional and depends on the current column
# order — presumably do, ph, co, bod, na, tc; confirm against df.columns.
x=df.iloc[:,1:7].values


In [None]:
# (rows, features) of the feature matrix.
x.shape

In [None]:
# Target: the last column of df. The `-1:` slice keeps the result 2-D, so y
# is a column vector of shape (rows, 1).
# NOTE(review): the last column is presumably 'wqi' (the most recently added
# one) — confirm against df.columns.
y=df.iloc[:,-1:].values
y.shape

In [None]:
# Plain-text dump of the feature matrix.
print(x)

In [None]:
# Plain-text dump of the target array.
print(y)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; random_state pins the shuffle so the
# split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=10,
)

**Random Forest Regression**


In [None]:
#Feature Scaling
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training rows only, then apply the
# same transformation to both splits.
sc = StandardScaler()
sc.fit(x_train)
x_train = sc.transform(x_train)
x_test = sc.transform(x_test)

In [None]:
# Plain-text dump of the scaled test features.
print(x_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

# Ten-tree random forest with a fixed seed for repeatable results.
regressor = RandomForestRegressor(n_estimators = 10, random_state = 0)
# y_train comes from a 2-D slice of the frame, i.e. a column vector (n, 1).
# scikit-learn expects a 1-D target for single-output regression and emits a
# DataConversionWarning otherwise, so flatten it here; ravel() is a no-op
# for a target that is already 1-D.
regressor.fit(x_train, y_train.ravel())
y_pred = regressor.predict(x_test)

# Model Evaluation

In [None]:
from sklearn import metrics

# Error metrics on the held-out split; RMSE is derived from the MSE value.
mae = metrics.mean_absolute_error(y_test, y_pred)
mse = metrics.mean_squared_error(y_test, y_pred)
print('MAE:',mae)
print('MSE:',mse)
print('RMSE:',np.sqrt(mse))


In [None]:
# Coefficient of determination (R^2) of the predictions on the test split.
metrics.r2_score(y_test, y_pred)


# Save The Model

In [None]:
import os
import pickle

# open() does not create missing directories, so make sure 'model/' exists.
os.makedirs('model', exist_ok=True)

# Context managers close (and flush) the file before it is read back; the
# bare open(...) calls used previously never closed their handles.
with open('model/ibm.pkl', 'wb') as model_file:
    pickle.dump(regressor, model_file)

# Round-trip check: reload the model that was just written.
# NOTE(review): the regressor was fitted on features transformed by the
# StandardScaler `sc`, which is not persisted here — whoever loads this file
# will presumably need the same scaler; confirm with the consumer.
# Only unpickle files you created yourself: pickle.load can execute
# arbitrary code from an untrusted file.
with open('model/ibm.pkl', 'rb') as model_file:
    model = pickle.load(model_file)