In [1]:
from random import sample
from time import time
import pandas as pd
import pymongo
from sklearn import ensemble
import numpy as np
import os
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import mean_squared_error
from math import sqrt

import random

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

import pickle

<h3><u>GROUPING CLUSTERS FROM CSV</u></h3>

In [2]:
df_clusters = pd.read_csv("~/ucc-21/clusters-noaa.csv")

In [3]:
time1 = time()

df_clusters

gk = df_clusters.groupby('cluster_id')

gk

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f1ce66f1d90>

In [4]:
parent_maps = {}
child_to_parent = {}

for name, group in gk:
    row = group[group.distance == group.distance.min()]
    row_max = group[group.distance == group.distance.max()]
    
    children = list(group.gis_join)
    distances = list(group.distance)
    
    dist_min = row['distance'].item()
    dist_max = row_max['distance'].item()
    
    pg = str(row['gis_join'].item())
    
    parent_index = children.index(pg)
    children.pop(parent_index)
    distances.pop(parent_index)
    
    inner_dict = {}
    inner_dict['dist_min'] = dist_min
    inner_dict['dist_max'] = dist_max
    inner_dict['children'] = children
    inner_dict['distances'] = distances
    
    parent_maps[pg] = inner_dict
    
    for c in children:
        child_to_parent[c] = pg
    
                           
print(parent_maps)
print(child_to_parent)

{'G5501210': {'dist_min': 46.36862986120119, 'dist_max': 862.0309874525391, 'children': ['G3700970', 'G3900970', 'G1801390', 'G4000990', 'G2701190', 'G3900370', 'G2700190', 'G2600450', 'G2701070', 'G5500210', 'G2900910', 'G4700650', 'G5500650', 'G4001330', 'G4802370', 'G3900270', 'G2701570', 'G5501310', 'G2600750', 'G5501370', 'G0100490', 'G4001370', 'G3900910', 'G5500950', 'G2600930', 'G5500150', 'G5500350', 'G2901790', 'G3700450', 'G1900190', 'G5500630', 'G1801770', 'G1800870', 'G5501090', 'G3900210', 'G1801510', 'G1900430', 'G4800990', 'G4000190', 'G5501030', 'G1800350', 'G3901170', 'G5500470', 'G0500970', 'G0501270', 'G4000670', 'G5501050', 'G0501130', 'G5500530', 'G5501410', 'G3800670', 'G1900050', 'G2701630', 'G2900930', 'G2701230', 'G2601490', 'G3900710', 'G2700790', 'G1302950', 'G5500550', 'G3900110', 'G4001250', 'G2701150', 'G4804850', 'G5501110', 'G1800750', 'G5500770', 'G2701710', 'G2700690', 'G5500010', 'G2601170', 'G5500810', 'G5501350', 'G1800410', 'G3901590', 'G5500930',

<h3><u>CONSTANTS AND HELPER FUNCTIONS</u></h3>

In [5]:
sample_min = 0.05
sample_max = 0.25

query_collection = "noaa_nam_2"

mongo_urls = [
    'mongodb://lattice-100:27018/',
    'mongodb://lattice-101:27018/',
    'mongodb://lattice-102:27018/',
    'mongodb://lattice-103:27018/',
    'mongodb://lattice-104:27018/'
]

mongo_db_name = "sustaindb"
query_fild = "gis_join"
train_test = 0.8


training_labels = ["mean_sea_level_pressure_pascal",
                   "surface_pressure_surface_level_pascal",
                   "10_metre_u_wind_component_meters_per_second",
                   "10_metre_v_wind_component_meters_per_second",
                   "soil_temperature_kelvin"]

target_labels = ["pressure_pascal"]


# QUERY projection
client_projection = {}
for val in training_labels:
    client_projection[val] = 1
for val in target_labels:
    client_projection[val] = 1
    
    

<h1><u>MODELING</u></h1>

In [6]:
saved_models = {}

# ACTUAL QUERYING
def query_sustaindb(query_gisjoin, sustain_db):
    sustain_collection = sustain_db[query_collection]
    client_query = {query_fild: query_gisjoin}
    query_results = list(sustain_collection.find(client_query, client_projection)) 
    return list(query_results)

# SAMPLE FROM QUERY RESULTS
def data_sampling(query_results, exhaustive, sample_percent=1):
    if exhaustive:
        all_data = query_results
    else:
        data_size = int(len(query_results) * sample_percent)
        all_data = sample(query_results, data_size)

    return pd.DataFrame(all_data)

# GET SAMPLE % BASED ON DISTANCE FROM CENTROID
def get_sample_percent(gis_join):
    parent_gis = child_to_parent[gis_join]
    inner_dict = parent_maps[parent_gis]
    d_max = inner_dict['dist_max']
    d_min = inner_dict['dist_min']
    children = inner_dict['children']
    distances = inner_dict['distances']
    
    my_index = children.index(gis_join)
    my_distance = distances[my_index]
    
    frac = (my_distance - d_min)/(d_max - d_min)
    
    perc = sample_min + (sample_max - sample_min) * frac
    
    perc *= 100
    perc = int(perc)
    perc = perc - (perc % 5)
    
    perc = perc/100
    return perc


# GET PERCENTAGE DISTANCE FROM CENTROID
def get_distance_percentage(gis_join):
    parent_gis = child_to_parent[gis_join]
    inner_dict = parent_maps[parent_gis]
    d_max = inner_dict['dist_max']
    d_min = inner_dict['dist_min']
    children = inner_dict['children']
    distances = inner_dict['distances']
    
    my_index = children.index(gis_join)
    my_distance = distances[my_index]
    
    frac = (my_distance - d_min)/(d_max - d_min)
    
    return frac * 100

def exhaustive_training(X,Y, gis_join):
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    
    param_grid = {'max_depth': [2, 3], 'min_samples_split': [15, 20, 50]}
#     base_est = ensemble.RandomForestRegressor(random_state=0)
    base_est = ensemble.GradientBoostingRegressor(random_state=0)
    sh = HalvingGridSearchCV(base_est, param_grid, cv=5, verbose=1, 
                             factor=2, resource='n_estimators', max_resources=600).fit(X, pd.Series.ravel(Y))
    
    clf_best = sh.best_estimator_
    rmse = sqrt(mean_squared_error(pd.Series.ravel(y_test), clf_best.predict(X_test)))
    
    print("PARENT GISJOIN: ", gis_join, "RMSE:", rmse)
    return clf_best
    

def sampled_training(X, Y, gis_join, saved_models):
    parent_gis = child_to_parent[gis_join]
    clf = saved_models[parent_gis]
    
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
    
    clf.fit(X_train, pd.Series.ravel(y_train))

    rmse = sqrt(mean_squared_error(pd.Series.ravel(y_test), clf.predict(X_test)))
    
    print("CHILD GISJOIN: ", gis_join, "RMSE:", rmse)
    return clf
    

def train_gisjoin(gis_join, exhaustive=True, saved_models={}):
    mongo_url = mongo_urls[random.randint(0, len(mongo_urls) - 1)]
    sustainclient = pymongo.MongoClient(mongo_url)
    sustain_db = sustainclient[mongo_db_name]

    sample_percent = 1
    if not exhaustive:
        #print("SAMPLED CHILD TRAINING.....")
        sample_percent = get_sample_percent(gis_join)
        
        
    #QUERY
    results = query_sustaindb(gis_join, sustain_db)
    
    df_sampled = data_sampling(results, exhaustive, sample_percent)
    
    Y = df_sampled.loc[:,target_labels]
    X = df_sampled.loc[:, training_labels]
    #print(X.shape, Y.shape)
    
    if exhaustive:
        clf = exhaustive_training(X,Y, gis_join)
    else:
        clf = sampled_training(X,Y, gis_join, saved_models)
    
    #saved_models[gis_join] = clf
    return (gis_join,clf)
    
#'G1303070': 'G0800010'
#train_gisjoin('G0800010', True)
#train_gisjoin('G1303070', False)



In [7]:
import dask
from dask import delayed
from dask.distributed import Client

client = Client('localhost:9000')


+-------------+---------------+----------------+----------------+
| Package     | client        | scheduler      | workers        |
+-------------+---------------+----------------+----------------+
| blosc       | 1.10.4        | 1.10.2         | 1.10.2         |
| dask        | 2021.07.0     | 2021.08.0      | 2021.08.0      |
| distributed | 2021.07.0     | 2021.08.0      | 2021.08.0      |
| numpy       | 1.21.0        | 1.21.1         | 1.21.1         |
| python      | 3.8.6.final.0 | 3.8.10.final.0 | 3.8.10.final.0 |
+-------------+---------------+----------------+----------------+


In [8]:
outputs = []
time1 = time()
# TRAINING PARENTS FIRST
for pk in parent_maps.keys():
    ret = delayed(train_gisjoin)(pk, True)
    outputs.append(ret)

futures = dask.persist(*outputs)  # trigger computation in the background
results = dask.compute(*futures)
print(f'Time to train one GISJOIN: {time() - time1} s')

Time to train one GISJOIN: 234.45221519470215 s


In [9]:
print(results)
print(f'No. of results: {len(results)}')

(('G5501210', GradientBoostingRegressor(max_depth=2, min_samples_split=15, n_estimators=600,
                          random_state=0)), ('G1600390', GradientBoostingRegressor(min_samples_split=50, n_estimators=600,
                          random_state=0)), ('G1301930', GradientBoostingRegressor(min_samples_split=20, n_estimators=600,
                          random_state=0)), ('G3500030', GradientBoostingRegressor(max_depth=2, min_samples_split=20, n_estimators=600,
                          random_state=0)), ('G2001930', GradientBoostingRegressor(max_depth=2, min_samples_split=20, n_estimators=600,
                          random_state=0)), ('G1800450', GradientBoostingRegressor(max_depth=2, min_samples_split=50, n_estimators=600,
                          random_state=0)), ('G2001630', GradientBoostingRegressor(max_depth=2, min_samples_split=20, n_estimators=600,
                          random_state=0)), ('G1201010', GradientBoostingRegressor(min_samples_split=50, n_estimators

In [10]:
for sm in results:
    (gis_join, model) = sm
    saved_models[gis_join] = model

print(saved_models)
pickle.dump(saved_models, open('parent_models.pkl', 'wb'))

{'G5501210': GradientBoostingRegressor(max_depth=2, min_samples_split=15, n_estimators=600,
                          random_state=0), 'G1600390': GradientBoostingRegressor(min_samples_split=50, n_estimators=600,
                          random_state=0), 'G1301930': GradientBoostingRegressor(min_samples_split=20, n_estimators=600,
                          random_state=0), 'G3500030': GradientBoostingRegressor(max_depth=2, min_samples_split=20, n_estimators=600,
                          random_state=0), 'G2001930': GradientBoostingRegressor(max_depth=2, min_samples_split=20, n_estimators=600,
                          random_state=0), 'G1800450': GradientBoostingRegressor(max_depth=2, min_samples_split=50, n_estimators=600,
                          random_state=0), 'G2001630': GradientBoostingRegressor(max_depth=2, min_samples_split=20, n_estimators=600,
                          random_state=0), 'G1201010': GradientBoostingRegressor(min_samples_split=50, n_estimators=600,
         

In [11]:
time2 = time()
print(f'Time Taken to build parent models: {time2 - time1} s')

Time Taken to build parent models: 239.24467301368713 s


distributed.client - ERROR - Failed to reconnect to scheduler after 10.00 seconds, closing client
_GatheringFuture exception was never retrieved
future: <_GatheringFuture finished exception=CancelledError()>
asyncio.exceptions.CancelledError
