In [1]:
from random import sample
from time import time
import pandas as pd
import pymongo
from sklearn import ensemble
import numpy as np
import os
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import mean_squared_error
from math import sqrt

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV

<h3><u>GROUPING CLUSTERS FROM CSV</u></h3>

In [2]:
# Load the cluster assignments; later cells read the cluster_id, gis_join and
# distance columns of this frame.
df_clusters = pd.read_csv("/tmp/clusters_demo.csv")

In [3]:
# Display the loaded cluster table.
df_clusters

# Group the rows by cluster_id so each cluster can be processed on its own.
gk = df_clusters.groupby('cluster_id')

# Display the groupby handle (only its repr is shown).
gk

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7f4b99469f98>

In [5]:
# parent_maps:     parent gis_join -> {'dist_min', 'dist_max', 'children', 'distances'}
# child_to_parent: child gis_join  -> parent gis_join
parent_maps = {}
child_to_parent = {}

# Within every cluster the member with the smallest `distance` becomes the
# parent; all remaining members are recorded as its children.
for name, group in gk:
    children = group['gis_join'].tolist()
    distances = group['distance'].tolist()

    # list.index() returns the first match, so a cluster in which several
    # members tie for the smallest or largest distance still resolves to one
    # row (filtering the frame and calling Series.item() raised ValueError
    # whenever more than one row matched).
    parent_index = distances.index(group['distance'].min())
    dist_min = distances[parent_index]
    dist_max = distances[distances.index(group['distance'].max())]

    # Remove the parent by position instead of searching the list for
    # str(gis_join), which fails when the column holds non-string ids.
    pg = str(children.pop(parent_index))
    distances.pop(parent_index)

    parent_maps[pg] = {
        'dist_min': dist_min,
        'dist_max': dist_max,
        'children': children,
        'distances': distances,
    }

    for c in children:
        child_to_parent[c] = pg

print(parent_maps)
print(child_to_parent)

{'G0800010': {'dist_min': 402.1741858065082, 'dist_max': 416.4204983057752, 'children': ['G1303070', 'G4701150', 'G1201210'], 'distances': [413.07465175307607, 409.1042783051329, 416.4204983057752]}, 'G3701430': {'dist_min': 410.7905596873448, 'dist_max': 412.12121359745873, 'children': ['G3701070', 'G4801550', 'G4001210', 'G0600090'], 'distances': [411.08717608030713, 412.12121359745873, 411.2011681070961, 411.6608777411628]}}
{'G1303070': 'G0800010', 'G4701150': 'G0800010', 'G1201210': 'G0800010', 'G3701070': 'G3701430', 'G4801550': 'G3701430', 'G4001210': 'G3701430', 'G0600090': 'G3701430'}


<h3><u>CONSTANTS AND HELPER FUNCTIONS</u></h3>

In [6]:
# Lower / upper bound of the fraction of a child's records used for sampled training.
sample_min, sample_max = 0.05, 0.25

# MongoDB connection and query settings.
mongo_url = "mongodb://lattice-100:27018/"
mongo_db_name = "sustaindb"
query_collection = "macav2"
query_fild = "gis_join"
train_test = 0.8

# Feature columns and the regression target.
training_labels = [
    "min_surface_downwelling_shortwave_flux_in_air",
    "max_surface_downwelling_shortwave_flux_in_air",
    "max_specific_humidity",
    "min_max_air_temperature",
    "max_max_air_temperature",
]
target_labels = ["max_min_air_temperature"]

# QUERY projection: request every feature column plus the target column.
client_projection = {}
for val in training_labels + target_labels:
    client_projection[val] = 1
    
    

<h1><u>MODELING</u></h1>

In [10]:
# Trained estimators keyed by GISJOIN; filled from the dask results in later cells.
saved_models = {}

# ACTUAL QUERYING
def query_sustaindb(query_gisjoin, sustain_db):
    """Fetch every projected document for one GISJOIN.

    Looks up the `query_collection` collection in `sustain_db`, filters on
    `query_fild == query_gisjoin`, applies `client_projection`, and returns
    the matches as a list.
    """
    cursor = sustain_db[query_collection].find(
        {query_fild: query_gisjoin}, client_projection
    )
    return list(cursor)

# SAMPLE FROM QUERY RESULTS
def data_sampling(query_results, exhaustive, sample_percent=1):
    """Turn query results into a DataFrame, optionally down-sampled.

    When `exhaustive` is true every record is kept. Otherwise
    `int(len(query_results) * sample_percent)` records are drawn with
    `random.sample` (without replacement).
    """
    if not exhaustive:
        keep = int(len(query_results) * sample_percent)
        return pd.DataFrame(sample(query_results, keep))
    return pd.DataFrame(query_results)

# GET SAMPLE % BASED ON DISTANCE FROM CENTROID
def get_sample_percent(gis_join):
    """Return the fraction of a child's records to sample for training.

    The child's distance is placed linearly between its cluster's `dist_min`
    and `dist_max`, mapped onto [sample_min, sample_max], and floored to a
    multiple of 5 percent.

    Args:
        gis_join: A child GISJOIN present in `child_to_parent`.

    Returns:
        The sampling fraction as a float (for example 0.15).

    Raises:
        KeyError: If `gis_join` has no entry in `child_to_parent`.
        ValueError: If `gis_join` is not listed among its parent's children.
    """
    inner_dict = parent_maps[child_to_parent[gis_join]]
    d_max = inner_dict['dist_max']
    d_min = inner_dict['dist_min']
    my_distance = inner_dict['distances'][inner_dict['children'].index(gis_join)]

    spread = d_max - d_min
    # A cluster whose members are all equidistant has zero spread; dividing
    # would raise ZeroDivisionError, so use the low end of the range instead.
    frac = (my_distance - d_min) / spread if spread else 0.0

    perc = sample_min + (sample_max - sample_min) * frac

    # Truncate to a whole percent, then floor to the nearest multiple of 5.
    perc = int(perc * 100)
    perc -= perc % 5
    return perc / 100

def exhaustive_training(X,Y, gis_join):
    """Tune and fit a random-forest regressor for a parent GISJOIN.

    The data is split 80/20. A successive-halving grid search over
    `max_depth` and `min_samples_split` (using `n_estimators` as the
    resource, up to 600) runs on the training split only, and the RMSE of
    the best estimator on the held-out split is printed.

    Args:
        X: Feature frame.
        Y: Single-column target frame.
        gis_join: Identifier used only in the printed report.

    Returns:
        The best estimator found by the search, refit on the training split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

    param_grid = {'max_depth': [2, 3], 'min_samples_split': [15, 20, 50]}
    base_est = ensemble.RandomForestRegressor(random_state=0)
    sh = HalvingGridSearchCV(base_est, param_grid, cv=5, verbose=1,
                             factor=2, resource='n_estimators', max_resources=600)
    # Fit on the training split only: fitting on the full X/Y let the model see
    # the held-out rows, which made the RMSE below optimistically biased.
    # np.ravel flattens the one-column target frame to the 1-D shape expected.
    sh.fit(X_train, np.ravel(y_train))

    clf_best = sh.best_estimator_
    rmse = sqrt(mean_squared_error(np.ravel(y_test), clf_best.predict(X_test)))

    print("PARENT GISJOIN: ",gis_join, "RMSE:", rmse)
    return clf_best
    

def sampled_training(X, Y, gis_join, saved_models):
    """Fit a child model using its parent's tuned hyper-parameters.

    Args:
        X: Feature frame (already down-sampled by the caller).
        Y: Single-column target frame.
        gis_join: Child GISJOIN; its parent is looked up in `child_to_parent`.
        saved_models: Mapping of parent GISJOIN to a trained estimator.

    Returns:
        A new estimator fitted on an 80% training split of X/Y.

    Raises:
        KeyError: If the child has no parent entry or the parent has no
            model in `saved_models`.
    """
    # Local import keeps this block self-contained.
    from sklearn.base import clone

    parent_gis = child_to_parent[gis_join]
    # clone() gives an unfitted estimator with the parent's parameters. Fitting
    # the parent object directly overwrote the parent's own fitted model and
    # made every child of that parent share one estimator instance.
    clf = clone(saved_models[parent_gis])

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

    # np.ravel flattens the one-column target frame to 1-D.
    clf.fit(X_train, np.ravel(y_train))

    rmse = sqrt(mean_squared_error(np.ravel(y_test), clf.predict(X_test)))

    print("CHILD GISJOIN: ",gis_join, "RMSE:", rmse)
    return clf
    

def train_gisjoin(gis_join, exhaustive=True, saved_models=None):
    """Query one GISJOIN's records and train a model on them.

    Args:
        gis_join: GISJOIN whose records are queried and modelled.
        exhaustive: True trains on every record with a hyper-parameter
            search; False samples the records (fraction from
            `get_sample_percent`) and reuses the parent's parameters.
        saved_models: Mapping of parent GISJOIN to a trained estimator; only
            read when `exhaustive` is False. Defaults to an empty mapping.

    Returns:
        Tuple `(gis_join, fitted_estimator)`.
    """
    # None instead of a shared mutable `{}` default.
    if saved_models is None:
        saved_models = {}

    sample_percent = 1
    if not exhaustive:
        sample_percent = get_sample_percent(gis_join)

    # QUERY
    # The `with` block closes the client once the results are materialised;
    # previously each call left its MongoClient open.
    with pymongo.MongoClient(mongo_url) as sustainclient:
        results = query_sustaindb(gis_join, sustainclient[mongo_db_name])

    df_sampled = data_sampling(results, exhaustive, sample_percent)

    Y = df_sampled.loc[:,target_labels]
    X = df_sampled.loc[:, training_labels]

    if exhaustive:
        clf = exhaustive_training(X,Y, gis_join)
    else:
        clf = sampled_training(X,Y, gis_join, saved_models)

    return (gis_join,clf)
    
#'G1303070': 'G0800010'
#train_gisjoin('G0800010', True)
#train_gisjoin('G1303070', False)



In [11]:
from dask import delayed
from dask.distributed import Client

# Start a dask.distributed client with four workers.
# NOTE(review): presumably this becomes the default scheduler for the
# dask.persist / dask.compute calls in later cells — confirm.
client = Client(n_workers=4)



Perhaps you already have a cluster running?
Hosting the HTTP server on port 35357 instead
  http_address["port"], self.http_server.port


In [12]:
import dask
# TRAINING PARENTS FIRST
# One lazy exhaustive-training task per parent GISJOIN.
outputs = [delayed(train_gisjoin)(parent_key, True) for parent_key in parent_maps]

futures = dask.persist(*outputs)  # trigger computation in the background
results = dask.compute(*futures)


In [13]:
# Show the (gis_join, fitted estimator) pairs returned for the parents.
print(results)

(('G0800010', RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0)), ('G3701430', RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0)))


In [14]:
# Each result is a (gis_join, model) pair, so the tuple of pairs can be merged
# straight into the registry.
saved_models.update(results)

print(saved_models)

{'G0800010': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0), 'G3701430': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0)}


In [15]:
# TRAINING CHILDREN NEXT
# One lazy sampled-training task per child GISJOIN, each given the parent models.
outputs2 = [
    delayed(train_gisjoin)(child_key, False, saved_models)
    for child_key in child_to_parent
]

futures2 = dask.persist(*outputs2)  # trigger computation in the background
results2 = dask.compute(*futures2)

In [16]:
# Merge the (gis_join, model) pairs returned for the children into the registry.
saved_models.update(results2)

print(saved_models)

{'G0800010': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0), 'G3701430': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0), 'G1303070': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0), 'G4701150': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0), 'G1201210': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0), 'G3701070': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0), 'G4801550': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_state=0), 'G4001210': RandomForestRegressor(max_depth=3, min_samples_split=20, n_estimators=600,
                      random_s