In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn.feature_selection as fs
import pickle
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import train_test_split
import sklearn.preprocessing as preprocessing
from sklearn.svm import SVR, NuSVR
from sklearn.pipeline import make_pipeline
from datetime import datetime

In [6]:
gemstat

Unnamed: 0,Station,Date,Time,Parameter,Param_Value,Unit,Comment,datetime
0,IND00073,2020-11-30,23:00,O2-Dis,7.10,mg/l,Unknown,2020-11-3023:00
1,IND00073,2020-02-02,23:00,O2-Dis,8.00,mg/l,Unknown,2020-02-0223:00
2,IND00073,2018-04-01,22:00,BOD,1.20,mg/l,Unknown,2018-04-0122:00
3,IND00073,2019-04-03,22:00,pH,7.70,---,Unknown,2019-04-0322:00
4,IND00073,2020-04-30,22:00,TURB,0.60,NTU,Unknown,2020-04-3022:00
...,...,...,...,...,...,...,...,...
12798,IND02679,2021-07-21,22:00,O2-Dis,6.39,mg/l,Unknown,2021-07-2122:00
12799,IND02679,2021-07-21,22:00,pH,7.91,---,Unknown,2021-07-2122:00
12800,IND02679,2021-07-21,22:00,TURB,0.00,NTU,Unknown,2021-07-2122:00
12801,IND02679,2021-05-31,22:00,TEMP,27.00,°C,Unknown,2021-05-3122:00


In [64]:
gemstat = pd.read_csv("../datasets/GEMStat/India-Lake-2023-07-04_13-45/samples.csv",delimiter = ';')
column_names = ['Station','Date','Time','Unknown','Parameter','Unknown2','blank','Param_Value','Unit','Comment']
gemstat.columns = column_names
gemstat['Datetime']=  pd.to_datetime(gemstat['Date'] + ' ' + gemstat['Time'])

gemstat= gemstat.drop(['Unknown','Unknown2','blank', 'Date','Time','Unit','Comment'],axis=1)


In [65]:
#print(gemstat['Station'].unique())

In [66]:
print(gemstat.loc[gemstat['Parameter']=='pH'].count())

Station        4072
Parameter      4072
Param_Value    4072
Datetime       4072
dtype: int64


In [67]:
o2manual = gemstat.loc[gemstat['Parameter']=='O2-Dis'].sort_values('Datetime',ascending=False).rename(columns={'Param_Value':'O2-Dis'})
tempmanual = gemstat.loc[gemstat['Parameter']=='TEMP'].sort_values('Datetime',ascending=False).rename(columns={'Param_Value':'TEMP'})
phmanual = gemstat.loc[gemstat['Parameter']=='pH'].sort_values('Datetime',ascending=False).rename(columns={'Param_Value':'pH'})


gemmerge = o2manual
gemmerge= gemmerge.merge(tempmanual,how='outer',on=['Station','Datetime'])
gemmerge= gemmerge.merge(phmanual, how='inner', on=['Station','Datetime'])

gemmerge = gemmerge[['Station','Datetime','O2-Dis','TEMP','pH']]

In [68]:
gemmerge


Unnamed: 0,Station,Datetime,O2-Dis,TEMP,pH
0,IND02679,2021-09-20 22:00:00,6.30,25.0,7.52
1,IND02679,2021-09-12 22:00:00,7.29,26.0,8.16
2,IND02679,2021-08-31 22:00:00,6.80,25.0,7.71
3,IND02315,2021-08-22 22:00:00,6.28,23.0,8.10
4,IND02679,2021-08-22 22:00:00,6.43,26.0,7.60
...,...,...,...,...,...
4034,IND02315,2008-09-30 22:00:00,8.65,29.6,8.60
4035,IND02315,2008-07-31 22:00:00,8.91,28.0,8.10
4036,IND02315,2019-10-02 22:00:00,,22.0,7.99
4037,IND02315,2019-10-02 22:00:00,,22.0,7.99


In [74]:
ind_meta = pd.read_excel("../datasets/GEMStat/India-Lake-2023-07-04_13-45/metadata.xlsx")
ind_meta = ind_meta.rename(columns = {'GEMS Station Number':'Station','Water Type':'Water_Type'})[['Station','Water_Type','Latitude','Longitude']]

Index(['Station', 'Water_Type', 'Latitude', 'Longitude'], dtype='object')

In [76]:
gemmerge.merge(ind_meta,how='left',on='Station')

Unnamed: 0,Station,Datetime,O2-Dis,TEMP,pH,Water_Type,Latitude,Longitude
0,IND02679,2021-09-20 22:00:00,6.30,25.0,7.52,Reservoir station,22.940278,79.923056
1,IND02679,2021-09-12 22:00:00,7.29,26.0,8.16,Reservoir station,22.940278,79.923056
2,IND02679,2021-08-31 22:00:00,6.80,25.0,7.71,Reservoir station,22.940278,79.923056
3,IND02315,2021-08-22 22:00:00,6.28,23.0,8.10,Lake station,24.345833,77.112500
4,IND02679,2021-08-22 22:00:00,6.43,26.0,7.60,Reservoir station,22.940278,79.923056
...,...,...,...,...,...,...,...,...
4034,IND02315,2008-09-30 22:00:00,8.65,29.6,8.60,Lake station,24.345833,77.112500
4035,IND02315,2008-07-31 22:00:00,8.91,28.0,8.10,Lake station,24.345833,77.112500
4036,IND02315,2019-10-02 22:00:00,,22.0,7.99,Lake station,24.345833,77.112500
4037,IND02315,2019-10-02 22:00:00,,22.0,7.99,Lake station,24.345833,77.112500
