### Getting the results of the model trained using the unsupervised approach on all the HCS locations

#### Data Preparation

In [1]:
import os 
import sys
import pandas as pd 
import numpy as np
import pickle 
import json
from keras.models import load_model
from sklearn.model_selection import train_test_split
from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from sklearn.ensemble import VotingRegressor
from sklearn.preprocessing import StandardScaler
from NN_unsupervised_all_sensors import NUM_NEAREST_SENSORS, model_save_path,NUM_LAST_DAYS

NUM_LAST_DAYS = 12, NUM_NEAREST_SENSORS = 10, EPOCHS = 25


In [2]:
import warnings
# Install a blanket 'ignore' filter: with no category/module arguments it
# applies to every warning raised by the cells that follow.
# NOTE(review): this also hides warnings from the DataFrame assignments and
# model loading below — consider restricting the filter to a specific category.
warnings.filterwarnings('ignore')

In [3]:
def get_location_specific_lcs_data(location, df):
    '''
    Select the LCS columns that belong to one location.

    location : string, matched as a substring of each column name
    df : DataFrame whose column names embed the location name

    returns the sub-DataFrame made of every column whose name contains
    `location`, in the original column order (no columns if nothing matches)
    '''
    matching_columns = [name for name in df.columns if location in name]
    return df[matching_columns]

In [4]:
def get_location_specific_hcs_data(location_index, df):
    '''
    Select the columns of a single HCS station.

    location_index : int (or int-like string), the station index to extract
    df : DataFrame whose column names all end in '_<station index>'

    returns the sub-DataFrame of the columns whose numeric suffix equals
    `location_index`, in the original column order
    '''
    # The station index is whatever follows the last underscore of the name.
    station_columns = [
        name
        for name in list(df.columns)
        if int(name.rsplit('_', 1)[-1]) == int(location_index)
    ]
    return df[station_columns]

In [5]:
def get_k_closest_sensors_data(input_loc, df, k=NUM_NEAREST_SENSORS):

    '''
    Keep only the k sensors that lie closest to a reference location.

    input_loc : pair of (lat, long) of hcs location
    df : holds n sensors, each occupying 5 consecutive columns
         [pm, temp, rh, lat, long]; coordinates are read from the first row
    k : number of nearest sensors to keep

    returns a dataframe with the 5 columns of each of the k nearest sensors,
    ordered from nearest to farthest (distance is |dlat| + |dlong|)
    '''

    num_sensors = df.shape[1] // 5

    # L1 distance between the reference point and every sensor.
    sensors_dist = []
    for sensor in range(num_sensors):
        first_col = sensor * 5
        sensor_lat = df.iloc[0, first_col + 3]
        sensor_long = df.iloc[0, first_col + 4]
        sensors_dist.append(abs(input_loc[0] - sensor_lat) + abs(input_loc[1] - sensor_long))

    # Stable sort, so ties keep the original sensor order.
    nearest_sensors = sorted(range(num_sensors), key=sensors_dist.__getitem__)[:k]

    columns_to_return = []
    for sensor in nearest_sensors:
        columns_to_return.extend(df.columns[sensor * 5:sensor * 5 + 5].tolist())

    return df[columns_to_return]

In [6]:
def normalize_test_prep_data(lcs_data, hcs_data):
    '''
    Build the standardized test features and the target for one HCS station.

    lcs_data : DataFrame of the selected LCS sensors, 5 consecutive columns per
               sensor in the order [pm, temp, rh, lat, long]
    hcs_data : DataFrame of one HCS station, read by position:
               2 = PM2.5 (target), 3 = RH, 4 = AT, 7 = lat, 8 = long

    returns (test_X, test_y): test_X is the standardized feature matrix in
    which every sensor's temp / rh / lat / long are expressed relative to the
    HCS values, test_y is the HCS PM2.5 column
    '''

    columns_per_sensor = 5
    num_sensors = lcs_data.shape[1] // columns_per_sensor

    # Work on a copy so the caller's frame is left untouched.
    relative = lcs_data.copy()

    # Getting rel values for EVERY sensor. The previous loop used
    # range(0, num_sensors, 5) together with i*5, so it strode twice and only
    # converted sensors 0, 5, 10, ... leaving the others as absolute values.
    # NOTE(review): confirm the training pipeline applies the same per-sensor transform.
    for sensor in range(num_sensors):
        base = sensor * columns_per_sensor
        relative.iloc[:, base + 1] = relative.iloc[:, base + 1] - hcs_data.iloc[:, 4]  # temp
        relative.iloc[:, base + 2] = relative.iloc[:, base + 2] - hcs_data.iloc[:, 3]  # rh
        relative.iloc[:, base + 3] = relative.iloc[:, base + 3] - hcs_data.iloc[:, 7]  # lat
        relative.iloc[:, base + 4] = relative.iloc[:, base + 4] - hcs_data.iloc[:, 8]  # long

    test_X = relative.values
    test_y = hcs_data.values[:, 2]

    # NOTE(review): the scaler is fitted on the test features themselves, not
    # reused from training — verify this matches how the model was trained.
    test_scaler = StandardScaler()
    test_X = test_scaler.fit_transform(test_X)

    return test_X, test_y

In [7]:
# HCS index -> name lookup (presumably the inverse of the table below).
with open("data/hcsIndex2Name.json", "r") as index_to_name_file:
    hcsIndex2Name = json.load(index_to_name_file)

# HCS name -> index lookup; later cells query it with lower-cased names.
with open("data/hcsName2Index.json", "r") as name_to_index_file:
    hcsName2Index = json.load(name_to_index_file)

In [8]:
# Read the prepared LCS table and drop its first two columns in one step, so
# that only the per-sensor columns remain.
lcs_data = pd.read_csv('data/AQI_LCS_data_prep.csv').iloc[:, 2:]
print(f"lcs_data.shape = {lcs_data.shape}")
lcs_data.head()

lcs_data.shape = (510, 1235)


Unnamed: 0,PM25_1201230029_ARARIA_ARARIA,Temp_1201230029_ARARIA_ARARIA,RH_1201230029_ARARIA_ARARIA,lat_PM25_1201230029_ARARIA_ARARIA,long_PM25_1201230029_ARARIA_ARARIA,PM25_1201230033_KURSAKANTA_ARARIA,Temp_1201230033_KURSAKANTA_ARARIA,RH_1201230033_KURSAKANTA_ARARIA,lat_PM25_1201230033_KURSAKANTA_ARARIA,long_PM25_1201230033_KURSAKANTA_ARARIA,...,PM25_BCDDC247BFE3_BHITAHA_WCHAMPARAN,Temp_BCDDC247BFE3_BHITAHA_WCHAMPARAN,RH_BCDDC247BFE3_BHITAHA_WCHAMPARAN,lat_PM25_BCDDC247BFE3_BHITAHA_WCHAMPARAN,long_PM25_BCDDC247BFE3_BHITAHA_WCHAMPARAN,PM25_DC4F22364445_PIPRASI_WCHAMPARAN,Temp_DC4F22364445_PIPRASI_WCHAMPARAN,RH_DC4F22364445_PIPRASI_WCHAMPARAN,lat_PM25_DC4F22364445_PIPRASI_WCHAMPARAN,long_PM25_DC4F22364445_PIPRASI_WCHAMPARAN
0,8.0,32.0,98.0,26.148,87.457,26.0,30.5,90.0,26.359,87.443,...,14.55,38.25,50.36,26.907,84.136,18.02,35.59,57.3,27.06,84.023
1,6.0,33.0,98.0,26.148,87.457,20.5,30.5,90.0,26.359,87.443,...,11.22,38.82,50.36,26.907,84.136,13.92,36.46,57.3,27.06,84.023
2,6.5,34.5,98.0,26.148,87.457,12.5,30.0,90.0,26.359,87.443,...,15.26,38.46,50.36,26.907,84.136,14.37,35.9,57.3,27.06,84.023
3,13.0,33.5,98.0,26.148,87.457,15.5,30.0,88.0,26.359,87.443,...,37.81,39.47,47.59,26.907,84.136,11.13,36.68,56.93,27.06,84.023
4,10.0,34.0,98.0,26.148,87.457,10.0,32.0,82.5,26.359,87.443,...,15.55,39.96,46.83,26.907,84.136,11.48,38.89,51.65,27.06,84.023


In [9]:
locations_to_test_loc = ['BHAGALPUR', 'KATIHAR', 'SAMASTIPUR', 'SIWAN', 'HAJIPUR']

# Position of every test location inside the list above.
location_to_test_loc_index_dict = dict(
    zip(locations_to_test_loc, range(len(locations_to_test_loc)))
)
print(location_to_test_loc_index_dict)

# HCS index of every test location; the lookup table is keyed by lower-case name.
locations_to_test_index = [hcsName2Index[name.lower()] for name in locations_to_test_loc]

print(locations_to_test_index)

{'BHAGALPUR': 0, 'KATIHAR': 1, 'SAMASTIPUR': 2, 'SIWAN': 3, 'HAJIPUR': 4}
[3, 8, 16, 18, 17]


In [10]:
# Read the cleaned HCS table and drop its first column in one step, leaving
# only the per-station columns.
hcs_data = pd.read_csv('data/hcsdatacleaned.csv').iloc[:, 1:]
print(f"hcs_data.shape = {hcs_data.shape}")
hcs_data.head()

hcs_data.shape = (510, 99)


Unnamed: 0,From Date_3,To Date_3,PM2.5_3,RH_3,AT_3,WS_3,WD_3,lat_3,long_3,From Date_4,...,long_17,From Date_18,To Date_18,PM2.5_18,RH_18,AT_18,WS_18,WD_18,lat_18,long_18
0,26-07-2023 09:00,26-07-2023 09:15,12.0,76.0,31.2,3.0,229.0,25.262,87.011,26-07-2023 09:00,...,85.245,26-07-2023 09:00,26-07-2023 09:15,39.0,64.0,34.4,1.5,19.0,26.227,84.357
1,26-07-2023 10:00,26-07-2023 10:15,7.0,73.0,31.9,3.0,356.0,25.262,87.011,26-07-2023 10:00,...,85.245,26-07-2023 10:00,26-07-2023 10:15,25.0,62.0,35.1,1.6,29.0,26.227,84.357
2,26-07-2023 11:00,26-07-2023 11:15,2.0,69.0,32.7,3.5,353.0,25.262,87.011,26-07-2023 11:00,...,85.245,26-07-2023 11:00,26-07-2023 11:15,28.0,57.0,36.1,1.7,12.0,26.227,84.357
3,26-07-2023 12:00,26-07-2023 12:15,6.0,64.0,34.1,3.4,354.0,25.262,87.011,26-07-2023 12:00,...,85.245,26-07-2023 12:00,26-07-2023 12:15,25.0,56.0,36.9,1.5,31.0,26.227,84.357
4,26-07-2023 13:00,26-07-2023 13:15,15.0,62.0,34.7,3.3,358.0,25.262,87.011,26-07-2023 13:00,...,85.245,26-07-2023 13:00,26-07-2023 13:15,23.0,57.0,37.2,1.5,354.0,26.227,84.357


In [11]:
# One HCS sub-frame per test location, in the same order as locations_to_test_loc.
hcs_data_list = []
for station_index in locations_to_test_index:
    hcs_data_list.append(get_location_specific_hcs_data(station_index, hcs_data))

# Sanity check: report the shape of every per-location frame.
for i in range(len(hcs_data_list)):
    x = hcs_data_list[i]
    print(f"i = {i}, location = {locations_to_test_loc[i]}, x.shape = {x.shape}")

i = 0, location = BHAGALPUR, x.shape = (510, 9)
i = 1, location = KATIHAR, x.shape = (510, 9)
i = 2, location = SAMASTIPUR, x.shape = (510, 9)
i = 3, location = SIWAN, x.shape = (510, 9)
i = 4, location = HAJIPUR, x.shape = (510, 9)


In [12]:
# preparing the test data in the form of dictionary
test_data_dict = {}
test_data_dict_save_path = 'data/hcs_test_data_dict.pickle'

for i, hcs_x in enumerate(hcs_data_list):
    
    hcs_name = locations_to_test_loc[i]
    hcs_loc = (hcs_x.iloc[0, 7], hcs_x.iloc[0, 8])
    lcs_x = get_k_closest_sensors_data(hcs_loc, lcs_data)
    test_X, test_y = normalize_test_prep_data(lcs_x, hcs_x)
    test_data_dict[hcs_name] = (test_X, test_y)


for x, (y, z) in test_data_dict.items():
    print(f"x = {x}, y.shape = {y.shape}, z.shape = {z.shape}")
    
with open(test_data_dict_save_path, 'wb') as f: 
    pickle.dump(test_data_dict, f)

x = BHAGALPUR, y.shape = (510, 50), z.shape = (510,)
x = KATIHAR, y.shape = (510, 50), z.shape = (510,)
x = SAMASTIPUR, y.shape = (510, 50), z.shape = (510,)
x = SIWAN, y.shape = (510, 50), z.shape = (510,)
x = HAJIPUR, y.shape = (510, 50), z.shape = (510,)


#### Evaluation

In [13]:

# Model path derived from NUM_LAST_DAYS (overrides the value imported from
# NN_unsupervised_all_sensors).
model_save_path = f"models/NN/unsupervised_lcs_all_data_{NUM_LAST_DAYS}.h5"

# Reload the prepared test data written by the data-preparation section.
# pickle must only be used on trusted files; this one is produced by this notebook.
with open(test_data_dict_save_path, 'rb') as f:
    test_data_dict = pickle.load(f)

# The model itself is loaded in a later cell. The bare `model` expression that
# used to end this cell ran before `model` was assigned (NameError on a fresh
# kernel), and the commented-out pickle-based loader was dead code, so both
# were removed.

OSError: No file or directory found at models/NN/unsupervised_lcs_all_data_15.h5

In [15]:
# Prefer the trained model under the project-relative models/NN directory (the
# same relative layout the other cells use for data/ and logs/), so the
# notebook is not tied to one machine.
model_path = os.path.join("models", "NN", "unsupervised_lcs_all_data_12days_25epochs.h5")
if not os.path.exists(model_path):
    # NOTE(review): machine-specific absolute path, kept only as a
    # backward-compatible fallback when the relative file is not found from
    # the current working directory.
    model_path = "/Users/architaggarwal/Documents/Projects/AQI/Air-Quality-Index-Calibration/Air-Quality-Index-Calibration_Validation_Exp_Redo/models/NN/unsupervised_lcs_all_data_12days_25epochs.h5"

model = load_model(model_path)

In [16]:
# Show the repr of the model object loaded by load_model.
model

<keras.engine.sequential.Sequential at 0x178f66b50>

In [18]:
# Show the NUM_LAST_DAYS value currently in the kernel; it is interpolated
# into model_save_path and results_save_path.
NUM_LAST_DAYS

15

In [17]:
# Score the loaded model, without any fine-tuning, on every test HCS location
# and store the R2 / MAE per location as JSON.
no_finetuning_results = {}
results_save_path = f"logs/NN_nofinetuning_results/scores_{NUM_LAST_DAYS}.json"

for loc, (X_test, y_test) in test_data_dict.items():

    y_test_hat = model.predict(X_test)
    no_finetuning_results[loc] = {'r2_score': r2_score(y_test, y_test_hat), "mae_score": mean_absolute_error(y_test, y_test_hat)}

# Create the log directory first: open(..., 'w') does not create missing
# parent directories and would otherwise fail on a fresh checkout.
os.makedirs(os.path.dirname(results_save_path), exist_ok=True)

with open(results_save_path, 'w') as f:
    json.dump(no_finetuning_results, f, indent=4)



2024-03-05 23:44:07.607353: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz




#### Calculating average scores of various models

In [9]:
locations_to_test_loc = ['BHAGALPUR', 'KATIHAR', 'SAMASTIPUR', 'SIWAN', 'HAJIPUR']
results_path = "logs/no_finetuning_unsupervised_lcs/linear_regression_scores_12.json"

# Per-location scores stored as {location: {'r2_score': ..., 'mae_score': ...}}.
with open(results_path, 'r') as results_file:
    results = json.load(results_file)

# Add each metric up location by location, then divide by the location count.
avg_r2, avg_mae = 0, 0
for loc in locations_to_test_loc:
    loc_scores = results[loc]
    avg_r2 = avg_r2 + loc_scores['r2_score']
    avg_mae = avg_mae + loc_scores['mae_score']

num_locations = len(locations_to_test_loc)
avg_r2 = avg_r2 / num_locations
avg_mae = avg_mae / num_locations

print(f"avg_r2 = {avg_r2}, avg_mae = {avg_mae}")

avg_r2 = -1.1617635656855219, avg_mae = 17.17479043538025


In [2]:
locations_to_test_loc = ['BHAGALPUR', 'KATIHAR', 'SAMASTIPUR', 'SIWAN', 'HAJIPUR']
results_path = "logs/NN_no_finetuning_unsupervised_lcs/scores_20.json"

# Per-location scores stored as {location: {'r2_score': ..., 'mae_score': ...}}.
with open(results_path, 'r') as results_file:
    results = json.load(results_file)

# Add each metric up location by location, then divide by the location count.
avg_r2, avg_mae = 0, 0
for loc in locations_to_test_loc:
    loc_scores = results[loc]
    avg_r2 = avg_r2 + loc_scores['r2_score']
    avg_mae = avg_mae + loc_scores['mae_score']

num_locations = len(locations_to_test_loc)
avg_r2 = avg_r2 / num_locations
avg_mae = avg_mae / num_locations

print(f"avg_r2 = {avg_r2}, avg_mae = {avg_mae}")

avg_r2 = -0.007956132028374464, avg_mae = 11.196490180155275
