## A program enabling the selection of optimal stations from among the existing networks of measurement stations to determine the spatial average value of the measured environmental parameter in the studied area.

In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from itertools import combinations

# --- Configuration and data loading ---------------------------------------
file_name=r'./data/SST_day.csv'         # input file name (including path)
df=pd.read_csv(file_name)
number_of_stations=3                    # define the number of measurement points
dependent_variable='AVG'
# Candidate station columns are taken from the third column onwards.
# The dependent variable is removed explicitly in case it lies in that slice;
# otherwise it could be selected as a "station" and yield a trivially perfect fit.
# (No-op when the dependent variable is not among these columns.)
independent_variables=df.columns[2:].drop(dependent_variable, errors='ignore')
y = df[dependent_variable]

# --- Exhaustive search over station subsets --------------------------------
# Lazily enumerate every subset of `number_of_stations` candidate columns.
independent_station_combinations = combinations(independent_variables, number_of_stations)
best_MSE=np.inf
best_regr=None
best_match=None
for combination in independent_station_combinations:
    x = df[list(combination)]
    # A fresh estimator is created for every candidate: refitting one shared
    # instance would make `best_regr` silently track the *last* fit instead of
    # the best one, because only a reference to the object is stored.
    regr = linear_model.LinearRegression()
    regr.fit(x, y)
    # Score the fit on the same data it was trained on (in-sample MSE).
    y_pred = regr.predict(x)
    MSE=mean_squared_error(y, y_pred)
    # Strict '<' keeps the first combination encountered in case of a tie.
    if MSE<best_MSE:
        best_MSE=MSE
        best_regr=regr
        best_match=combination
if best_match is None:
    # Happens when no combination could be formed (e.g. more stations requested
    # than candidate columns available); fail here with a clear message rather
    # than later with an obscure error on `list(None)`.
    raise ValueError(
        f"No combination of {number_of_stations} stations could be evaluated: "
        f"{len(independent_variables)} candidate columns are available."
    )
print(f"Optimal combination of measuring stations: {best_match}")

Optimal combination of measuring stations: ('VSJ_20925', 'F80', 'F18')


## Calculate statistics

In [2]:
import statsmodels.api as sm
# Refit the selected station combination with statsmodels OLS to obtain
# R^2, adjusted R^2 and the standard error of every coefficient.
x = df[list(best_match)]
# add_constant places the intercept column first (its default), so position 0
# of params/bse is the intercept and positions 1.. follow `best_match` order.
x = sm.add_constant(x)
model = sm.OLS(y, x)
results = model.fit()
print("RMSE\tR^2\tR^2_adj")
# RMSE comes from the in-sample MSE found during the combination search.
print(f"{np.sqrt(best_MSE):.4f}\t{results.rsquared:.4f}\t{results.rsquared_adj:.4f}")
# params/bse are label-indexed pandas Series here (exog is a DataFrame), so
# positional access must go through .iloc; integer keys via [] are deprecated.
# NOTE: this first row is the intercept (value and standard error), printed
# under the "Station" label.
print(f"Station\t{results.params.iloc[0]:.4f}\t{results.bse.iloc[0]:.4f}")
for i in range(len(best_match)):
    print(f"{best_match[i]}\t{results.params.iloc[i+1]:.4f}\t{results.bse.iloc[i+1]:.4f} ")

RMSE	R^2	R^2_adj
0.2636	0.9979	0.9979
Station	-0.2228	0.0090
VSJ_20925	0.3149	0.0019 
F80	0.4375	0.0034 
F18	0.2530	0.0029 
