In [14]:
# Doing all the imports

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

In [15]:
YEARS_FOR_PREDICTION = [2022, 2023, 2024, 2025, 2026, 2027, 2028, 2029, 2030]

This is the prediction model that will return the best prediction model according to the data given to it

In [16]:
# Data is expected to be n*2 np array with the first column being the values and the second column being the years

def predictParameterModel(data):
    # print(data)
    last_data = data[-1,:]
    actual_param = last_data[0]
    data = data[:-1,:]
    values = data[:,0]
    year = data[:,1]
    degree_list = [1,2,3,4,5]

    min = 10000
    best_degree = 0
    for degree in degree_list:
        coefficients = np.polyfit(year, values, degree)
        poly_fit = np.poly1d(coefficients)
        predicted_param = poly_fit(int(last_data[1]))
        loss = abs(predicted_param-actual_param)
        if loss < min:
            min = loss
            best_degree = degree

    coefficients = np.polyfit(year, values, best_degree)
    poly_fit = np.poly1d(coefficients)

    return poly_fit

In [17]:
# Getting the data
data = pd.read_csv('ganga_data.csv')

# Filter rows where the LOCATIONS column contains the word "GANGA"
data = data[data['LOCATIONS'].str.contains(' GANGA ', case=False)]
no_of_rows = data.shape[0]

In [18]:
nan_indices = data[data['STATE'] == 'NAN'].index.tolist()
data = data.drop(nan_indices)
STATE_NAME_LIST = sorted(data['STATE'].unique())
YEAR_LIST = [2022, 2023, 2024, 2025, 2026]

In [19]:
# Data pre-processing

data.fillna(0, inplace=True)
# print(data.dtypes)

data['Temp'] = pd.to_numeric(data['Temp'], errors = 'coerce')
data['D.O. (mg/l)'] = pd.to_numeric(data['D.O. (mg/l)'], errors = 'coerce')
data['PH'] = pd.to_numeric(data['PH'], errors='coerce')
data['B.O.D. (mg/l)'] = pd.to_numeric(data['B.O.D. (mg/l)'], errors='coerce')
data['CONDUCTIVITY'] = pd.to_numeric(data['CONDUCTIVITY'], errors='coerce')
data['NITRATENAN N+ NITRITENANN (mg/l)'] = pd.to_numeric(data['NITRATENAN N+ NITRITENANN (mg/l)'], errors='coerce')
data['TOTAL COLIFORM (MPN/100ml)Mean'] = pd.to_numeric(data['TOTAL COLIFORM (MPN/100ml)Mean'], errors='coerce')

start = 0
end = no_of_rows

station = data.iloc[start:end, 0]
location = data.iloc[start:end ,1]
state = data.iloc[start:end, 2]
do = data.iloc[start:end, 4].astype(np.float64)
ph = data.iloc[start:end, 5]
co = data.iloc [start:end, 6].astype(np.float64)   
year = data.iloc[start:end, 11]
tc = data.iloc[start:end, 10].astype(np.float64)
bod = data.iloc[start:end, 7].astype(np.float64)
na = data.iloc[start:end, 8].astype(np.float64)

mean = {}
mean['do'] = do.mean()
mean['ph'] = ph.mean()
mean['co'] = co.mean()
mean['bod'] = bod.mean()
mean['tc'] = tc.mean()
mean['na'] = na.mean()
# print(data.columns)
data = pd.concat([station,location,state,do,ph,co,bod,na,tc,year], axis=1)
# print(data.columns)
data.columns = ['station','location','state','do','ph','co','bod','na','tc','year']

In [20]:
# Adding more columns/features to the data using the already given features

#calulation of Ph
data['npH']=data.ph.apply(lambda x: (100 if (8.5>=x>=7)  
                                 else(80 if  (8.6>=x>=8.5) or (6.9>=x>=6.8) 
                                      else(60 if (8.8>=x>=8.6) or (6.8>=x>=6.7) 
                                          else(40 if (9>=x>=8.8) or (6.7>=x>=6.5)
                                              else 0)))))
#calculation of dissolved oxygen
data['ndo']=data.do.apply(lambda x:(100 if (x>=6)  
                                 else(80 if  (6>=x>=5.1) 
                                      else(60 if (5>=x>=4.1)
                                          else(40 if (4>=x>=3) 
                                              else 0)))))
#calculation of total coliform
data['nco']=data.tc.apply(lambda x:(100 if (5>=x>=0)  
                                 else(80 if  (50>=x>=5) 
                                      else(60 if (500>=x>=50)
                                          else(40 if (10000>=x>=500) 
                                              else 0)))))
#calc of B.D.O
data['nbdo']=data.bod.apply(lambda x:(100 if (3>=x>=0)  
                                 else(80 if  (6>=x>=3) 
                                      else(60 if (80>=x>=6)
                                          else(40 if (125>=x>=80) 
                                              else 0)))))
#calculation of electrical conductivity
data['nec']=data.co.apply(lambda x:(100 if (75>=x>=0)  
                                 else(80 if  (150>=x>=75) 
                                      else(60 if (225>=x>=150)
                                          else(40 if (300>=x>=225) 
                                              else 0)))))
#Calulation of nitrate
data['nna']=data.na.apply(lambda x:(100 if (20>=x>=0)  
                                 else(80 if  (50>=x>=20) 
                                      else(60 if (100>=x>=50)
                                          else(40 if (200>=x>=100) 
                                              else 0)))))

data['wph']=data.npH * 0.165
data['wdo']=data.ndo * 0.281
data['wbdo']=data.nbdo * 0.234
data['wec']=data.nec* 0.009
data['wna']=data.nna * 0.028
data['wco']=data.nco * 0.281
data['wqi']=data.wph+data.wdo+data.wbdo+data.wec+data.wna+data.wco 
wqi_values = data['wqi']

# Specify the columns to drop
columns_to_drop = ['location', 'station']

# Drop the specified columns
data.drop(columns=columns_to_drop, inplace=True)

print(data.dtypes)

output_filename = 'ganga_wqi_values.txt'

# Write the 'wqi' values to the output file
wqi_values.to_csv(output_filename, index=False, header=False)

state     object
do       float64
ph       float64
co       float64
bod      float64
na       float64
tc       float64
year       int64
npH        int64
ndo        int64
nco        int64
nbdo       int64
nec        int64
nna        int64
wph      float64
wdo      float64
wbdo     float64
wec      float64
wna      float64
wco      float64
wqi      float64
dtype: object


In [21]:
data_for_states = data.groupby('state').mean()
states_ganga = data_for_states.index

In [22]:
cols_i_need = ['wqi','year']

ganga_state_wise = {}

for state in states_ganga:
    ag = data[data['state'] == state][cols_i_need].groupby('year').mean()
    # ag = ag.assign(state=state)

    ganga_state_wise[state] = ag

In [23]:
for state in states_ganga:
    print(state)
    print(ganga_state_wise[state])

ARUNACHAL PRADESH
        wqi
year       
2021  82.04
BIHAR
            wqi
year           
2016  82.040000
2017  70.980000
2018  70.872000
2019  70.845000
2020  70.800000
2021  70.232083
HIMACHAL PRADESH
        wqi
year       
2019  87.66
2020  87.66
2021  90.47
JHARKHAND
            wqi
year           
2019  98.900000
2020  99.200000
2021  99.015714
UTTAR PRADESH
            wqi
year           
2016  44.580000
2017  67.852000
2018  68.173333
2019  79.348889
2020  69.105455
2021  75.048600
UTTARAKHAND
            wqi
year           
2016  98.120000
2017  88.560000
2018  88.560000
2019  96.110000
2020  90.920000
2021  90.202581
WEST BENGAL
            wqi
year           
2016  70.800000
2018  70.800000
2019  67.470000
2020  68.460000
2021  68.973333


In [24]:
predictions = {}
model_up = None
for state in states_ganga:
    if state == 'ARUNACHAL PRADESH':
        continue
    wqi_values = ganga_state_wise[state]['wqi']
    years = ganga_state_wise[state].index
    np_data = np.column_stack((wqi_values, years))
    model = predictParameterModel(np_data)

    if state == 'UTTAR PRADESH':
        model_up = model

    wqi_list = []
    for year in YEARS_FOR_PREDICTION:
        wqi = model(year)
        wqi_list.append(wqi)
    predictions[state] = wqi_list

  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)
  coefficients = np.polyfit(year, values, degree)


In [25]:
# Create a DataFrame from the given data
df = pd.DataFrame(predictions)

# Transpose the DataFrame and reset the index to have the states as a column
df = df.T.reset_index()

# Rename the columns
df.columns = ['STATE', '2022', '2023', '2024', '2025', '2026', '2027', '2028', '2029', '2030']

# Save the DataFrame to a CSV file
df.to_csv('ganga_state_data.csv', index=False)

In [26]:
model_up

poly1d([ 6.05477980e+00, -1.21527337e+04])