In [1]:
from random import sample
from time import time
import pandas as pd
import pymongo
from sklearn import ensemble
import numpy as np
import os
from sklearn.model_selection import cross_val_score, train_test_split, KFold
from sklearn.metrics import mean_squared_error
from math import sqrt

import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

<h3><u>CONSTANTS AND HELPER FUNSTIONS</u></h3>

In [2]:
query_collection = "macav2"
mongo_url = "mongodb://lattice-100:27018/"
mongo_db_name = "sustaindb"
query_fild = "gis_join"
sample_percent = 0.1
train_test = 0.8
feature_importance_percentage = 98
exhaustive_sample_percent = 0.0001


training_labels = ["min_surface_downwelling_shortwave_flux_in_air", "max_surface_downwelling_shortwave_flux_in_air",
                   "max_specific_humidity", "min_max_air_temperature", "max_max_air_temperature"]
target_labels = ["max_min_air_temperature"]

# QUERY-RELATED
sustainclient = pymongo.MongoClient(mongo_url)
sustain_db = sustainclient[mongo_db_name]

# QUERY projection
client_projection = {}
for val in training_labels:
    client_projection[val] = 1
for val in target_labels:
    client_projection[val] = 1
    
    
def fancy_logging(msg, unique_id=""):
    print(unique_id, ":", "====================================")
    print(unique_id, ":", msg, ": TIME: ",time())


<h3><u>DATA FETCH</u></h3>

In [3]:
# ACTUAL QUERYING
def query_sustaindb(query_gisjoin):

    sustain_collection = sustain_db[query_collection]
    client_query = {query_fild: query_gisjoin}

    start_time = time()
    query_results = list(sustain_collection.find(client_query, client_projection))
    
    return list(query_results)

def queryall_sustaindb():

    sustain_collection = sustain_db[query_collection]
    client_query = {}

    start_time = time()
    query_results = list(sustain_collection.find(client_query, client_projection))
    
    return list(query_results)

In [4]:
#df = query_sustaindb('G3701310')
df = queryall_sustaindb()
print("1: ", len(df))

1:  11353524


<h3><u>DATA SAMPLING</u></h3>

In [5]:
def data_sampling(query_results, exhaustive, sample_percent=1):
    if exhaustive:
        all_data = query_results
    else:
        data_size = int(len(query_results) * sample_percent)
        all_data = sample(query_results, data_size)

    return pd.DataFrame(all_data)

In [6]:
# RIKI
sampled_df = data_sampling(df, False, exhaustive_sample_percent)

In [7]:
Y = sampled_df.loc[:,target_labels]
X = sampled_df.loc[:, training_labels]
print(X.shape, Y.shape)

(1135, 5) (1135, 1)


<h3><u>DATA SPLITTING INTO TRAING AND VALIDATION</u></h3>

In [8]:
'''def data_partitioning(query_results, exhaustive, sample_percent=1):
    if exhaustive:
        all_data = query_results
    else:
        data_size = int(len(query_results) * sample_percent)
        all_data = sample(query_results, data_size)

    msk = np.random.rand(len(all_data)) < train_test

    all_data = pd.DataFrame(all_data)
    training_data = all_data[msk]
    val_data = all_data[~msk]
    return (pd.DataFrame(training_data), pd.DataFrame(val_data))'''

'def data_partitioning(query_results, exhaustive, sample_percent=1):\n    if exhaustive:\n        all_data = query_results\n    else:\n        data_size = int(len(query_results) * sample_percent)\n        all_data = sample(query_results, data_size)\n\n    msk = np.random.rand(len(all_data)) < train_test\n\n    all_data = pd.DataFrame(all_data)\n    training_data = all_data[msk]\n    val_data = all_data[~msk]\n    return (pd.DataFrame(training_data), pd.DataFrame(val_data))'

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)


(908, 5) (227, 5) (908, 1) (227, 1)


<h3><u>MODELING</u></h3>

In [10]:
'''parameters = [ {'n_estimators': 500, 'max_depth': 2, 'min_samples_split': 20},
                {'n_estimators': 300, 'max_depth': 3, 'min_samples_split': 50},
              {'n_estimators': 500, 'max_depth': 3, 'min_samples_split': 20},
              {'n_estimators': 600, 'max_depth': 3, 'min_samples_split': 15}]
    
for params in parameters:
    print("PARAMETERS:",params)
    count = 0
    error = 0
    for i in range(0,5):
        print("ROUND:",i)
        clf = ensemble.RandomForestRegressor(**params)
        clf.fit(X_train, pd.Series.ravel(y_train))

        rmse = sqrt(mean_squared_error(pd.Series.ravel(y_test), clf.predict(X_test)))
        print(rmse)
        error = error + rmse

        feature_importance = clf.feature_importances_
        feature_importance = 100.0 * (feature_importance / feature_importance.sum())
        sorted_idx = np.argsort(feature_importance)
        count = count+1
        print(np.flip(sorted_idx), np.flip(feature_importance[sorted_idx]))
    
    print("===============================================================")

    print("AVG RMSE:",(error/count))'''

# BETTER ALTERNATIVE BELOW



In [11]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
param_grid = {'max_depth': [2, 3], 'min_samples_split': [15, 20, 50]}
base_est = ensemble.RandomForestRegressor(random_state=0)
sh = HalvingGridSearchCV(base_est, param_grid, cv=5, verbose=3, 
                         factor=2, resource='n_estimators', max_resources=600).fit(X, pd.Series.ravel(Y))

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 150
max_resources_: 600
aggressive_elimination: False
factor: 2
----------
iter: 0
n_candidates: 6
n_resources: 150
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV 1/5] END max_depth=2, min_samples_split=15, n_estimators=150;, score=(train=0.887, test=0.856) total time=   0.4s
[CV 2/5] END max_depth=2, min_samples_split=15, n_estimators=150;, score=(train=0.887, test=0.905) total time=   0.3s
[CV 3/5] END max_depth=2, min_samples_split=15, n_estimators=150;, score=(train=0.877, test=0.878) total time=   0.6s
[CV 4/5] END max_depth=2, min_samples_split=15, n_estimators=150;, score=(train=0.874, test=0.880) total time=   0.3s
[CV 5/5] END max_depth=2, min_samples_split=15, n_estimators=150;, score=(train=0.879, test=0.853) total time=   0.3s
[CV 1/5] END max_depth=2, min_samples_split=20, n_estimators=150;, score=(train=0.887, test=0.856) total time=   0.3s
[CV 2/5] END max_depth=2, min_samp

In [12]:
# THE BEST MODEL
clf_best = sh.best_estimator_

In [13]:
rmse = sqrt(mean_squared_error(pd.Series.ravel(y_test), clf_best.predict(X_test)))
rmse

2.8030382341399576

<h3><u>EXTRACT TOP FEATURES</u></h3>

In [14]:
feature_importance = clf_best.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.sum())
sorted_idx = np.argsort(feature_importance)
print(np.flip(sorted_idx), np.flip(feature_importance[sorted_idx]))

feature_importance = np.flip(feature_importance[sorted_idx])
sorted_idx=np.flip(sorted_idx)

print(sorted_idx)
print(feature_importance)


[2 4 3 1 0] [9.41589086e+01 5.60212804e+00 2.37520572e-01 1.44276363e-03
 0.00000000e+00]
[2 4 3 1 0]
[9.41589086e+01 5.60212804e+00 2.37520572e-01 1.44276363e-03
 0.00000000e+00]


<h3><u>FIND N FOR WHICH IMPORTANCE % > feature-importance-percentage</u></h3>

In [15]:
def find_cumulative(lists, val_max):
    cu_list = []
    length = len(lists)
    cu_list = [sum(lists[0:x:1]) for x in range(1, length+1)]
    
    print(cu_list)
    res = next(x for x, val in enumerate(cu_list)
                                  if val > val_max)
    return res

In [16]:
cut_off_indx = find_cumulative(feature_importance, feature_importance_percentage)

print("LAST INDEX: ", cut_off_indx)

[94.15890862925279, 99.76103666457514, 99.99855723636725, 100.00000000000001, 100.00000000000001]
LAST INDEX:  1


In [17]:
chopped_indices = sorted_idx[0:cut_off_indx+1]

print(sorted_idx)
print(chopped_indices)

[2 4 3 1 0]
[2 4]


<h3><u>SELECTED TOP COLUMNS</u></h3>

In [18]:
candidate_x_columns = list(X.columns)
candidate_y_columns = list(Y.columns)

print(candidate_x_columns)
print(candidate_y_columns)

['min_surface_downwelling_shortwave_flux_in_air', 'max_surface_downwelling_shortwave_flux_in_air', 'max_specific_humidity', 'min_max_air_temperature', 'max_max_air_temperature']
['max_min_air_temperature']


In [19]:
selected_x_columns = [candidate_x_columns[i] for i in chopped_indices]
selected_x_columns

['max_specific_humidity', 'max_max_air_temperature']

<b><hr /></b>

<h1><u><b>TRAINING PHASE #2</b></u></h1>

<h3><u>AGGREGATE QUERY OVER THE CHOSEN COLUMNS PER GIS-JOIN</u></h3>

In [20]:
'''sustain_collection = sustain_db[query_collection]
pipeline=[
   { "$project": { 'gis_join': '$gis_join', 'max_specific_humidity': '$max_specific_humidity', 'max_max_air_temperature': '$max_max_air_temperature', 'max_min_air_temperature': '$max_min_air_temperature'}},
   { "$group": { '_id': "$gis_join", 
"avg_max_specific_humidity": { "$avg": "$max_specific_humidity" },
"avg_max_max_air_temperature": { "$avg": "$max_max_air_temperature" },
"avg_max_min_air_temperature": { "$avg": "$max_min_air_temperature" }
  } }
]
cur = sustain_collection.aggregate(pipeline)

results = list(cur)
len(results)
'''

'sustain_collection = sustain_db[query_collection]\npipeline=[\n   { "$project": { \'gis_join\': \'$gis_join\', \'max_specific_humidity\': \'$max_specific_humidity\', \'max_max_air_temperature\': \'$max_max_air_temperature\', \'max_min_air_temperature\': \'$max_min_air_temperature\'}},\n   { "$group": { \'_id\': "$gis_join", \n"avg_max_specific_humidity": { "$avg": "$max_specific_humidity" },\n"avg_max_max_air_temperature": { "$avg": "$max_max_air_temperature" },\n"avg_max_min_air_temperature": { "$avg": "$max_min_air_temperature" }\n  } }\n]\ncur = sustain_collection.aggregate(pipeline)\n\nresults = list(cur)\nlen(results)\n'

In [21]:
chopped_projection = []
chopped_projection.extend(selected_x_columns)
chopped_projection.extend(candidate_y_columns)

print(chopped_projection)

['max_specific_humidity', 'max_max_air_temperature', 'max_min_air_temperature']


In [22]:
def construct_chopped_query(chopped_projection, gis_join):
    # PROJECTION
    proj_d = {}
    proj_dict = {'$project': proj_d}
    
    #GROUP + AGGREGATION
    group_d = {}
    group_dict = {'$group': group_d}
    
    full_query=[proj_dict, group_dict]
    
    # PROJECTION PART
    for cp in chopped_projection:
        proj_d[cp] = "$"+str(cp)
    proj_d[gis_join] = "$"+str(gis_join)
    
    # GROUP PART
    group_d['_id'] = "$"+str(gis_join)
    for cp in chopped_projection:
        inner_dict = {}
        inner_dict["$avg"] = "$"+str(cp)
        group_d[cp] = inner_dict
    
    return full_query
    
    

In [23]:
agg_pipeline = construct_chopped_query(chopped_projection, query_fild)

In [24]:
sustain_collection = sustain_db[query_collection]
cur = sustain_collection.aggregate(agg_pipeline)
agg_results = list(cur)

print(len(agg_results))

3108


In [25]:
agg_results[0]

{'_id': 'G2800330',
 'max_specific_humidity': 0.009376402956474131,
 'max_max_air_temperature': 296.08935039693404,
 'max_min_air_temperature': 284.5797005201204}

<h3><u>DATA STAGING FOR PHASE 2</u></h3>

In [26]:
phase2_df = pd.DataFrame(agg_results)

In [27]:
chopped_projection

['max_specific_humidity', 'max_max_air_temperature', 'max_min_air_temperature']

In [28]:
df_importance = phase2_df.loc[:, chopped_projection]
df_importance

Unnamed: 0,max_specific_humidity,max_max_air_temperature,max_min_air_temperature
0,0.009376,296.089350,284.579701
1,0.010028,298.064332,285.098822
2,0.012156,299.824949,289.856587
3,0.007514,299.440627,284.093643
4,0.005361,290.861012,278.092833
...,...,...,...
3103,0.010539,298.443496,285.461002
3104,0.006991,290.533906,278.551417
3105,0.008087,294.804971,281.846688
3106,0.006937,290.467913,278.398537


<h3><u>K-MEANS CLUSTERING</u></h3>

In [29]:
num_clusters = int(sqrt(len(agg_results)))

num_clusters

55

In [30]:
def print_full(x):
    pd.set_option('display.max_rows', len(x))
    print(x)
    pd.reset_option('display.max_rows')


In [31]:
kmeans = KMeans(n_clusters=num_clusters).fit(df_importance)
centroids = kmeans.cluster_centers_
print(centroids)

[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [-3.40282347e+38 -3.40282347e+38 -3.40282347e+38]
 [-7.37869763e+19 -7.37869763e+19 -7.37869763e+19]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+00  0.00000000e+00]
 [ 0.00000000e+00  0.00000000e+

  kmeans = KMeans(n_clusters=num_clusters).fit(df_importance)


In [32]:
from sklearn.metrics import pairwise_distances_argmin_min

df_ultimate = pd.DataFrame(columns=["gis_join", "cluster_id", "distance"])



for index, row in phase2_df.iterrows():
    input_x = row[chopped_projection]
    gis_join = row['_id']
    #print(input_x, gis_join)
    closest, d = pairwise_distances_argmin_min([np.array(input_x)], centroids)
    df_ultimate.loc[index] = [gis_join, closest[0], d[0]]
    
print(df_ultimate)



      gis_join cluster_id    distance
0     G2800330          0  410.675674
1     G4801590          0  412.460525
2     G2200950          0  417.027387
3     G0600390          0  412.763718
4     G5300470          0  402.412416
...        ...        ...         ...
3103  G2201190          0  412.984871
3104  G5400770          0  402.493283
3105  G3701690          0  407.857238
3106  G1901590          0  402.339849
3107  G3600210          0  401.177775

[3108 rows x 3 columns]


In [33]:
df_ultimate.to_csv("~/ucc-21/clusters-macav2.csv")