# Cluster-Based Learning

In this notebook, I look at how clustering methods can help with regression. Three approaches are explored:
1. Clustering the inputs and then training a model per cluster.
2. Clustering only the outputs and then training a model per cluster.
3. Splitting the output columns into evenly sized bins and then training a model per bin.

In [166]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.compose import TransformedTargetRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.cluster import KMeans
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.utils import gen_even_slices
import time as time
import statsmodels.api as sm

import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')

## Data

In [89]:
# Import data
# Both CSV files are read from the current working directory. The first two
# columns of each file are skipped; everything from the third column on is kept.
# The targets are used exactly as stored on disk (no np.exp is applied).
X, y = (
    pd.read_csv(csv_name).iloc[:, 2:]
    for csv_name in ("X_INPUT_LOG_PCA_NA.csv", "Y_OUTPUT_LOG_NA.csv")
)

### Train-Test Split

### Standardize Data

In [84]:
# Train/test split
# The "Train-Test Split" section of this notebook contains no code, so on a
# fresh kernel `xtrain`, `xtest`, `ytrain` and `ytest` would be undefined and
# the standardization below would raise a NameError. The split is created here
# so that Restart & Run All works.
# NOTE(review): the 80/20 ratio and the seed mirror the per-cluster splits used
# further down; the parameters of the original split are not recorded in the
# notebook -- confirm before comparing against previously reported numbers.
xtrain, xtest, ytrain, ytest = train_test_split(
    X, y, train_size=0.8, random_state=123
)

# Standardize Inputs (per dimension).
# Statistics come from the training part only and are re-used for the test part.
x_mean, x_std = xtrain.mean(axis=0), xtrain.std(axis=0)

xtrain_norm = (xtrain - x_mean) / x_std
xtest_norm = (xtest - x_mean) / x_std

# Normalize Outputs: only the training mean is removed (no rescaling).
y_mean = ytrain.mean(axis=0)

ytrain_norm = ytrain - y_mean
ytest_norm = ytest - y_mean

## Method I - Clustering the Inputs

In [85]:
# Method I: partition the normalized training inputs into 3 groups with k-means.
# random_state pins the centroid initialisation so the cluster labels (and
# every per-cluster model trained from them) are reproducible across kernel
# restarts. verbose is an integer flag; 0 keeps the fit silent.
clf = KMeans(
    init='k-means++', n_clusters=3, n_init=10, random_state=123, verbose=0
)

# fit_predict fits the model and returns the cluster label of every training row
clusters = clf.fit_predict(xtrain_norm)

In [86]:
# Method I: fit one PCA + random-forest model per input-space cluster and
# report hold-out metrics for each cluster.
# Settings that do not change between clusters are defined once, up front.
train_size = 0.8
random_state = 123
n_components = 20

for imodel in np.unique(clusters):

    print(f"Cluster: {imodel+1}")

    # get subset of the training data which resides in this cluster
    in_cluster = clusters == imodel
    ix = xtrain_norm[in_cluster]
    iy = ytrain_norm[in_cluster]

    # training and testing split inside the cluster
    ixtrain, ixtest, iytrain, iytest = train_test_split(
        ix, iy, train_size=train_size, random_state=random_state
    )

    # Standardize Inputs (per dimension) with the cluster's training statistics.
    # Cluster-local names are used so that the notebook-level x_mean / x_std /
    # y_mean are not overwritten by the loop.
    ix_mean, ix_std = ixtrain.mean(axis=0), ixtrain.std(axis=0)

    ixtrain_norm = (ixtrain - ix_mean) / ix_std
    ixtest_norm = (ixtest - ix_mean) / ix_std

    # Normalize Outputs: remove the cluster's training mean
    iy_mean = iytrain.mean(axis=0)

    iytrain_norm = iytrain - iy_mean
    iytest_norm = iytest - iy_mean

    # =======================
    # PCA
    # =======================
    # PCA cannot extract more components than min(n_samples, n_features),
    # so the requested number is capped for small clusters.
    pca_model = PCA(n_components=min(n_components, *iytrain_norm.shape))

    iytrain_red = pca_model.fit_transform(iytrain_norm)

    # =======================
    # ML Algorithm
    # =======================
    # The split criterion is left at its default (squared error): the legacy
    # spelling 'mse' is rejected by recent scikit-learn releases.
    rf_model = RandomForestRegressor(
        n_estimators=1000,
        n_jobs=-1,
        random_state=random_state,
        warm_start=False,
        verbose=0,
    )

    t0 = time.perf_counter()
    rf_model.fit(ixtrain_norm, iytrain_red)
    train_seconds = time.perf_counter() - t0

    print(
        f"Training Time: {train_seconds:.3f} seconds"
    )

    # Predictions: predict in PCA space, then map back to the output space
    t0 = time.perf_counter()
    iypred_red = rf_model.predict(ixtest_norm)
    predict_seconds = time.perf_counter() - t0
    iypred = pca_model.inverse_transform(iypred_red)

    # Get Average Stats (uniform average over all output dimensions)
    mae = mean_absolute_error(iytest_norm, iypred, multioutput='uniform_average')
    mse = mean_squared_error(iytest_norm, iypred, multioutput='uniform_average')
    rmse = np.sqrt(mse)
    r2 = r2_score(iytest_norm, iypred, multioutput='uniform_average')
    print(
        f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
        f" \nTime: {predict_seconds:.3f} seconds"
    )
    print("Done!\n")


Cluster: 1
Training Time: 5.600 seconds
MAE: 0.078
MSE: 0.013
RMSE: 0.115
R2: 0.525 
Time: 0.313 seconds
Done!

Cluster: 2
Training Time: 1.438 seconds
MAE: 0.038
MSE: 0.004
RMSE: 0.066
R2: 0.607 
Time: 0.107 seconds
Done!

Cluster: 3
Training Time: 3.347 seconds
MAE: 0.083
MSE: 0.014
RMSE: 0.118
R2: 0.474 
Time: 0.209 seconds
Done!



## Method II - Clustering the Outputs

In [87]:
# Method II: partition the training rows into 3 groups with k-means, this time
# using the normalized outputs instead of the inputs.
# random_state pins the centroid initialisation so the cluster labels (and
# every per-cluster model trained from them) are reproducible across kernel
# restarts. verbose is an integer flag; 0 keeps the fit silent.
clf = KMeans(
    init='k-means++', n_clusters=3, n_init=20, random_state=123, verbose=0
)

# fit_predict fits the model and returns the cluster label of every training row
clusters = clf.fit_predict(ytrain_norm)

In [88]:
# Method II: fit one PCA + random-forest model per output-space cluster and
# report hold-out metrics for each cluster.
# Settings that do not change between clusters are defined once, up front.
train_size = 0.8
random_state = 123
n_components = 20

for imodel in np.unique(clusters):

    print(f"Cluster: {imodel+1}")

    # get subset of the training data which resides in this cluster
    in_cluster = clusters == imodel
    ix = xtrain_norm[in_cluster]
    iy = ytrain_norm[in_cluster]

    # training and testing split inside the cluster
    ixtrain, ixtest, iytrain, iytest = train_test_split(
        ix, iy, train_size=train_size, random_state=random_state
    )

    print(ix.shape, iy.shape)

    # Standardize Inputs (per dimension) with the cluster's training statistics.
    # Cluster-local names are used so that the notebook-level x_mean / x_std /
    # y_mean are not overwritten by the loop.
    ix_mean, ix_std = ixtrain.mean(axis=0), ixtrain.std(axis=0)

    ixtrain_norm = (ixtrain - ix_mean) / ix_std
    ixtest_norm = (ixtest - ix_mean) / ix_std

    # Normalize Outputs: remove the cluster's training mean
    iy_mean = iytrain.mean(axis=0)

    iytrain_norm = iytrain - iy_mean
    iytest_norm = iytest - iy_mean

    # =======================
    # PCA
    # =======================
    # PCA cannot extract more components than min(n_samples, n_features),
    # so the requested number is capped for small clusters.
    pca_model = PCA(n_components=min(n_components, *iytrain_norm.shape))

    iytrain_red = pca_model.fit_transform(iytrain_norm)

    # =======================
    # ML Algorithm
    # =======================
    # The split criterion is left at its default (squared error): the legacy
    # spelling 'mse' is rejected by recent scikit-learn releases.
    rf_model = RandomForestRegressor(
        n_estimators=1000,
        n_jobs=-1,
        random_state=random_state,
        warm_start=False,
        verbose=0,
    )

    t0 = time.perf_counter()
    rf_model.fit(ixtrain_norm, iytrain_red)
    train_seconds = time.perf_counter() - t0

    print(
        f"Training Time: {train_seconds:.3f} seconds"
    )

    # Predictions: predict in PCA space, then map back to the output space
    t0 = time.perf_counter()
    iypred_red = rf_model.predict(ixtest_norm)
    predict_seconds = time.perf_counter() - t0
    iypred = pca_model.inverse_transform(iypred_red)

    # Get Average Stats (uniform average over all output dimensions)
    mae = mean_absolute_error(iytest_norm, iypred, multioutput='uniform_average')
    mse = mean_squared_error(iytest_norm, iypred, multioutput='uniform_average')
    rmse = np.sqrt(mse)
    r2 = r2_score(iytest_norm, iypred, multioutput='uniform_average')
    print(
        f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
        f" \nTime: {predict_seconds:.3f} seconds"
    )
    print("Done!\n")

Cluster: 1
(901, 17) (901, 276)
Training Time: 4.833 seconds
MAE: 0.073
MSE: 0.011
RMSE: 0.105
R2: 0.460 
Time: 0.209 seconds
Done!

Cluster: 2
(962, 17) (962, 276)
Training Time: 5.086 seconds
MAE: 0.072
MSE: 0.022
RMSE: 0.150
R2: 0.341 
Time: 0.313 seconds
Done!

Cluster: 3
(681, 17) (681, 276)
Training Time: 3.466 seconds
MAE: 0.059
MSE: 0.007
RMSE: 0.081
R2: 0.440 
Time: 0.21 seconds
Done!



## Method III - Binning the Outputs

In [161]:
# Shape of the block made of the first 75 output columns
ytrain_norm.iloc[:, :75].shape

(2544, 75)

In [163]:
# Three contiguous (start, stop) index ranges: 0-90, 90-180 and 180-276.
# NOTE(review): presumably ranges over the output columns; no visible cell
# reads this variable -- confirm it is still needed.
intervals = list(zip((0, 90, 180), (90, 180, 276)))

In [168]:
# Show only the first of the 10 even column slices as a sanity check
column_slices = gen_even_slices(ytrain_norm.shape[1], 10)
i = next(column_slices, None)
if i is not None:
    print(i)

slice(0, 28, None)


In [170]:
# Method III: split the output columns into 10 evenly sized blocks and fit one
# PCA + random-forest model per block, using all training rows every time.
# Settings that do not change between blocks are defined once, up front.
train_size = 0.8
random_state = 123
n_components = 20

# Every block is regressed on the same inputs
ix = xtrain_norm

for idx in gen_even_slices(ytrain_norm.shape[1], 10):

    # idx is a slice over output columns (not a cluster id), so it is
    # reported as a column range.
    print(f"Output bin: columns {idx.start}-{idx.stop - 1}")

    # get the block of output columns handled in this iteration
    iy = ytrain_norm.values[:, idx]
    print(ix.shape, iy.shape)

    # training and testing split
    ixtrain, ixtest, iytrain, iytest = train_test_split(
        ix, iy, train_size=train_size, random_state=random_state
    )

    # Standardize Inputs (per dimension) with the training statistics.
    # Loop-local names are used so that the notebook-level x_mean / x_std /
    # y_mean are not overwritten by the loop.
    ix_mean, ix_std = ixtrain.mean(axis=0), ixtrain.std(axis=0)

    ixtrain_norm = (ixtrain - ix_mean) / ix_std
    ixtest_norm = (ixtest - ix_mean) / ix_std

    # Normalize Outputs: remove the training mean of this block
    iy_mean = iytrain.mean(axis=0)

    iytrain_norm = iytrain - iy_mean
    iytest_norm = iytest - iy_mean

    # =======================
    # PCA
    # =======================
    # PCA cannot extract more components than min(n_samples, n_features),
    # so the requested number is capped for narrow blocks.
    pca_model = PCA(n_components=min(n_components, *iytrain_norm.shape))

    iytrain_red = pca_model.fit_transform(iytrain_norm)

    # =======================
    # ML Algorithm
    # =======================
    # The split criterion is left at its default (squared error): the legacy
    # spelling 'mse' is rejected by recent scikit-learn releases.
    rf_model = RandomForestRegressor(
        n_estimators=1000,
        n_jobs=-1,
        random_state=random_state,
        warm_start=False,
        verbose=0,
    )

    t0 = time.perf_counter()
    rf_model.fit(ixtrain_norm, iytrain_red)
    train_seconds = time.perf_counter() - t0

    print(
        f"Training Time: {train_seconds:.3f} seconds"
    )

    # Predictions: predict in PCA space, then map back to the output space
    t0 = time.perf_counter()
    iypred_red = rf_model.predict(ixtest_norm)
    predict_seconds = time.perf_counter() - t0
    iypred = pca_model.inverse_transform(iypred_red)

    # Get Average Stats (uniform average over the output columns of the block)
    mae = mean_absolute_error(iytest_norm, iypred, multioutput='uniform_average')
    mse = mean_squared_error(iytest_norm, iypred, multioutput='uniform_average')
    rmse = np.sqrt(mse)
    r2 = r2_score(iytest_norm, iypred, multioutput='uniform_average')
    print(
        f"MAE: {mae:.3f}\nMSE: {mse:.3f}\nRMSE: {rmse:.3f}\nR2: {r2:.3f}"
        f" \nTime: {predict_seconds:.3f} seconds"
    )
    print("Done!\n")


Cluster: 10.0
(2544, 17) (2544, 28)
Training Time: 9.791 seconds
MAE: 0.103
MSE: 0.024
RMSE: 0.155
R2: 0.768 
Time: 0.209 seconds
Done!

Cluster: 10.0
(2544, 17) (2544, 28)
Training Time: 10.356 seconds
MAE: 0.094
MSE: 0.018
RMSE: 0.136
R2: 0.551 
Time: 0.209 seconds
Done!

Cluster: 10.0
(2544, 17) (2544, 28)
Training Time: 10.710 seconds
MAE: 0.078
MSE: 0.013
RMSE: 0.115
R2: 0.503 
Time: 0.21 seconds
Done!

Cluster: 10.0
(2544, 17) (2544, 28)
Training Time: 10.972 seconds
MAE: 0.069
MSE: 0.010
RMSE: 0.102
R2: 0.581 
Time: 0.209 seconds
Done!

Cluster: 10.0
(2544, 17) (2544, 28)
Training Time: 11.068 seconds
MAE: 0.063
MSE: 0.010
RMSE: 0.099
R2: 0.566 
Time: 0.206 seconds
Done!

Cluster: 10.0
(2544, 17) (2544, 28)
Training Time: 12.871 seconds
MAE: 0.056
MSE: 0.007
RMSE: 0.081
R2: 0.647 
Time: 0.305 seconds
Done!

Cluster: 10.0
(2544, 17) (2544, 27)
Training Time: 11.889 seconds
MAE: 0.057
MSE: 0.007
RMSE: 0.085
R2: 0.644 
Time: 0.204 seconds
Done!

Cluster: 10.0
(2544, 17) (2544, 27)
