In [1]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.datasets import load_diabetes
from sklearn.base import BaseEstimator,clone
import random
from scipy.spatial.distance import cdist

import pandas as pd
import numpy as np


In [2]:
# Load the diabetes regression data as plain arrays (as_frame=False): features X, target y.
X, y = load_diabetes(return_X_y=True, as_frame=False)

In [3]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

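1. Initialise k centroids
2. Form clusters around the k centroids (each point joins its nearest centroid)
3. Build k regressions, one per cluster
4. Evaluate each point's error under each of the k models
5. Assign each point to the cluster whose model gives the smallest error
6. Adjust the centroids to the mean of their newly assigned points, then repeat from step 2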


In [14]:
class ClusterRegression(BaseEstimator):
    """Regressor that alternates between clustering rows and fitting one model per cluster.

    Training (see ``fit``):

    1. pick ``k`` starting centroids from the training rows,
    2. label every row with its nearest centroid,
    3. fit a clone of ``learner`` on each cluster,
    4. move every row to the cluster whose model has the smallest squared error,
    5. recompute the centroids from that error-based grouping and return to step 2.

    Prediction routes each row to its nearest centroid and uses that cluster's model.

    Parameters
    ----------
    k : int
        Number of centroids to start from.
    learner : estimator
        Unfitted regressor; a fresh ``clone`` of it is fitted for every cluster.
    max_iter : int
        Upper bound on the number of refinement rounds.

    Attributes set by ``fit``
    -------------------------
    centroids : ndarray of shape (n_clusters, n_features)
        One centroid per cluster; row ``i`` belongs to label ``i``.
    label_list : list of int
        Cluster labels ``0 .. n_clusters-1``.
    model_dict : dict
        Maps each label to the model fitted for that cluster.
    """

    def __init__(self,k,learner,max_iter) -> None:
        super().__init__()

        self.k=k
        self.learner=learner
        self.max_iter=max_iter

    def _init_centroids(self,X):
        """Pick up to ``k`` starting centroids from the rows of ``X``.

        The first centroid is a randomly drawn row; each further one is the
        remaining row farthest from the mean of the centroids chosen so far.
        """
        if len(X) == 0:
            raise ValueError("ClusterRegression.fit needs at least one sample")

        # A private generator gives the same draw as random.seed(42) followed by
        # random.sample(...), without reseeding the process-wide `random` module.
        rng = random.Random(42)
        centroid_index = rng.sample(range(0,len(X)),1)
        centroids = X[centroid_index]

        remaining = X
        for _ in range(self.k-1):
            # Drop the row chosen last so it cannot be picked twice.
            remaining = np.delete(remaining,centroid_index,axis=0)
            if len(remaining) == 0:
                # Fewer rows than requested centroids: stop with what we have.
                break
            centroid_index = cdist(
                centroids.mean(axis=0).reshape(1,-1),
                remaining,metric='euclidean'
                )[0].argmax()
            centroids = np.concatenate(
                [centroids,remaining[centroid_index].reshape(1,-1)]
                )

        return centroids

    def _fit_cluster_models(self,X,y,labels):
        """Refit one clone of ``learner`` per non-empty cluster.

        A cluster that currently owns no rows keeps the model from the
        previous round instead of being fitted on an empty array.
        """
        for label in self.label_list:
            members = labels==label
            if not members.any():
                continue
            model = clone(self.learner)
            model.fit(X[members],y[members])
            self.model_dict[label] = model

    def fit(self,X,y):
        """Fit the centroids and the per-cluster models.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
        y : array-like of shape (n_samples,)

        Returns
        -------
        self
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Float copy so centroid means can be written back in place.
        centroids = self._init_centroids(X).astype(float)

        # Initial cluster labels: nearest starting centroid.
        labels = cdist(X,centroids,metric='euclidean').argmin(axis=1)

        # Keep only centroids that own at least one row and renumber the labels
        # to 0..m-1, so that label value == row in `centroids` == column in the
        # error/prediction matrices built from `label_list`.
        used = np.unique(labels)
        labels = np.searchsorted(used,labels)
        self.centroids = centroids[used]
        self.label_list = list(range(len(used)))
        self.model_dict = {}

        for e in range(self.max_iter):

            # Make one regression per cluster.
            self._fit_cluster_models(X,y,labels)

            # Squared error of each point under each model; pick the best model.
            best_model = np.column_stack(
                [
                (y - self.model_dict[label].predict(X))**2
                for label in self.label_list
                ]
                ).argmin(axis=1)

            # Move each centroid to the mean of the rows its model fits best.
            # A cluster that wins no rows keeps its previous centroid; taking
            # the mean of an empty slice would produce a NaN centroid.
            for label in self.label_list:
                members = X[best_model==label]
                if len(members):
                    self.centroids[label] = members.mean(axis=0)

            new_labels = cdist(X,self.centroids,metric='euclidean').argmin(axis=1)

            # Unchanged labels mean the next round would refit on the same
            # partition, so stop. The models already match these centroids.
            converged = np.array_equal(new_labels,labels)
            labels = new_labels
            if converged:
                break
        else:
            # Loop ended without converging (or max_iter == 0): the centroids
            # were updated after the last model fit, so refit once on the final
            # partition to keep models and centroids consistent for predict().
            self._fit_cluster_models(X,y,labels)

        return self

    def predict(self,X):
        """Predict by routing every row to the model of its nearest centroid.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)

        Returns
        -------
        ndarray of shape (n_samples,)
        """
        X = np.asarray(X)

        test_clusters = cdist(X,self.centroids,metric='euclidean').argmin(axis=1)

        # Column j holds the predictions of the model for label j.
        all_predictions = np.column_stack([
            self.model_dict[label].predict(X)
            for label in self.label_list
        ])

        y_out = all_predictions[np.arange(len(all_predictions)), test_clusters]

        return y_out

        

                        

In [15]:
# Baseline run of the iterative cluster-regression with two clusters.
k = 2
epochs = 100
random.seed(42)

model = ClusterRegression(
    k=k,
    learner=LinearRegression(),
    max_iter=epochs,
)
model.fit(X_train, y_train)

# Out-of-sample predictions, scored in the next cell.
y_out = model.predict(X_test)

In [16]:
# Test-set R^2 of the cluster-regression model.
r2_score(y_true=y_test, y_pred=y_out)

0.5606844077518386

In [17]:
## Cluster then regression solution

class PhasedRegressor(BaseEstimator):
    """Cluster-then-regress model: one regressor per phase found by a clusterer.

    Parameters
    ----------
    learner : estimator
        Unfitted regressor; a fresh ``clone`` is fitted for every phase.
    phase_detector : estimator
        Clusterer exposing ``fit`` and ``predict``. It is never fitted in
        place: ``fit`` works on a clone stored as ``phase_detector_``, so the
        object passed in (including the shared default instance) stays untouched.

    Attributes set by ``fit``
    -------------------------
    phase_detector_ : estimator
        Fitted clone of ``phase_detector``.
    phases : list
        Phase labels seen during training.
    models : dict
        Maps each phase label to the regressor fitted on that phase.
    """

    def __init__(self,learner=LinearRegression(),
                 phase_detector=KMeans(n_clusters=2,random_state=42,n_init='auto')) -> None:
        super().__init__()

        self.learner = learner
        self.phase_detector = phase_detector

    def fit(self,X,y):
        """Fit the phase detector, then one regressor per detected phase.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)
        y : array-like or Series of shape (n_samples,)

        Returns
        -------
        self
        """
        # Boolean-mask row selection below needs ndarray or pandas containers.
        if not isinstance(X,pd.DataFrame):
            X = np.asarray(X)
        if not isinstance(y,(pd.Series,pd.DataFrame)):
            y = np.asarray(y)

        # Fit a private copy so that neither the caller's estimator nor the
        # default-argument instance shared by all PhasedRegressor objects is
        # mutated by this fit.
        self.phase_detector_ = clone(self.phase_detector)
        self.phase_detector_.fit(X)

        phases = np.asarray(self.phase_detector_.predict(X))

        self.models = {}
        self.phases = list(np.unique(phases))

        for phase in self.phases:
            in_phase = phases==phase
            model = clone(self.learner)
            model.fit(X[in_phase],y[in_phase])
            self.models[phase]=model

        return self

    def predict(self,X):
        """Predict each row with the regressor of the phase it is assigned to.

        Parameters
        ----------
        X : array-like or DataFrame of shape (n_samples, n_features)

        Returns
        -------
        pandas.Series
            Predictions in the row order of ``X``; indexed by ``X.index`` for
            a DataFrame, by ``0..n-1`` otherwise.

        Raises
        ------
        KeyError
            If the detector assigns a phase for which no model was trained.
        """
        phases = np.asarray(self.phase_detector_.predict(X))

        if isinstance(X,pd.DataFrame):
            index = X.index
        else:
            X = np.asarray(X)
            index = pd.RangeIndex(len(X))

        unseen = ~np.isin(phases,self.phases)
        if unseen.any():
            raise KeyError(
                f"no model trained for phase(s) {sorted(set(phases[unseen].tolist()))}"
            )

        # Fill by position rather than by index label, so a DataFrame with a
        # non-unique index still yields exactly one prediction per row.
        values = np.full(len(index),np.nan)
        for phase in self.phases:
            in_phase = phases==phase
            if not in_phase.any():
                # Phase absent from this batch; estimators reject 0-row input.
                continue
            values[in_phase] = self.models[phase].predict(X[in_phase])

        y_out = pd.Series(values,index=index)

        return y_out


In [18]:
# Reference approach: K-Means first, then one linear model per cluster.
m_phased = PhasedRegressor(
    learner=LinearRegression(),
    phase_detector=KMeans(n_clusters=k, random_state=42, n_init='auto'),
)
m_phased.fit(X_train, y_train)

y_fit_2 = m_phased.predict(X_train)
y_pred_2 = m_phased.predict(X_test)

# Test-set R^2 of the phased model (displayed as the cell output).
r2_score(y_test, y_pred_2)

0.5484612333457958

In [21]:
# Compare both approaches on the test set for 2..8 clusters.
#
# The loop variable is deliberately not named `k`: the cells below read the
# notebook-level `k`, and reusing that name here would leave it at 8 after the
# loop and silently change what those cells compute on a top-to-bottom run.
for n_clusters in range(2,9):
    # Iterative cluster-regression.
    model = ClusterRegression(k=n_clusters,learner=LinearRegression(),max_iter=epochs)
    model.fit(X_train,y_train)

    y_out = model.predict(X_test)
    score_cr = r2_score(y_test,y_out)

    # K-Means followed by per-cluster regression.
    m_phased= PhasedRegressor(
        learner=LinearRegression(),
        phase_detector=KMeans(n_clusters=n_clusters,
                                random_state=42,
                                n_init='auto'),
    )
    m_phased.fit(X_train,y_train)
    # Only the test-set predictions are scored; the train-set prediction that
    # used to be computed here was never read.
    y_pred_2 = m_phased.predict(X_test)
    score_pr = r2_score(y_test,y_pred_2)

    print(f"{n_clusters}: score_cr = {round(score_cr,4)}; score_pr = {round(score_pr,4)}")
    

2: score_cr = 0.5607; score_pr = 0.5485
3: score_cr = 0.5673; score_pr = 0.5056
4: score_cr = 0.5245; score_pr = 0.4271
5: score_cr = 0.2821; score_pr = 0.4878
6: score_cr = 0.4006; score_pr = 0.4688
7: score_cr = 0.0111; score_pr = 0.4842
8: score_cr = -3.6198; score_pr = 0.2458


In [11]:
# Same phased model, but with both stages wrapped in Pipelines so preprocessing
# steps can be slotted in. The MinMaxScaler steps are currently disabled.
m_phased = PhasedRegressor(
    learner=Pipeline(steps=[
        # ('scaler', MinMaxScaler()),
        ('learner', LinearRegression()),
    ]),
    phase_detector=Pipeline(steps=[
        # ('scaler', MinMaxScaler()),
        ('cluster', KMeans(n_clusters=k, random_state=42, n_init='auto')),
    ]),
)
m_phased.fit(X_train, y_train)

y_fit_2 = m_phased.predict(X_train)
y_pred_2 = m_phased.predict(X_test)

r2_score(y_test, y_pred_2)

0.5484612333457958

In [None]:
# Scale the features to [0, 1] once, up front, then run the phased regressor
# (5 K-Means clusters) on the scaled data.
newpipe = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    (
        'learner',
        PhasedRegressor(
            learner=LinearRegression(),
            phase_detector=KMeans(n_clusters=5, random_state=42, n_init='auto'),
        ),
    ),
])
newpipe.fit(X_train, y_train)

y_fit_2b = newpipe.predict(X_train)
y_pred_2b = newpipe.predict(X_test)

r2_score(y_test, y_pred_2b)

In [None]:
# Step-by-step version of the centroid initialisation, kept as separate cells
# so intermediate state can be inspected.
k = 4
epochs = 100
random.seed(42)

# First centroid: one randomly drawn training row.
centroid_index = random.sample(range(len(X_train)), 1)
centroids = X_train[centroid_index]

X_train_new = X_train

# Each further centroid is the remaining row farthest from the mean of the
# centroids chosen so far.
for init_iter in range(k - 1):

    # Remove the row picked last so it cannot be selected again.
    X_train_new = np.delete(X_train_new, centroid_index, axis=0)

    centroid_index = cdist(
        centroids.mean(axis=0, keepdims=True),
        X_train_new,
        metric='euclidean',
    )[0].argmax()
    new_centroid = X_train_new[centroid_index].reshape(1, -1)

    centroids = np.vstack([centroids, new_centroid])

# Initial cluster label of every training row: index of its nearest centroid.
labels = cdist(X_train, centroids, metric='euclidean').argmin(axis=1)
label_list = list(set(labels))


In [None]:
for e in range(epochs):

    # Refit one linear model per cluster on the current partition.
    model_dict = {}
    for label in label_list:
        Xt, yt = X_train[labels == label], y_train[labels == label]
        model = LinearRegression().fit(Xt, yt)
        model_dict[label] = model

    # Squared error of every training row under every model (one column per
    # entry of label_list); each row moves to the column with the smallest error.
    labels = np.column_stack([
        (y_train - model_dict[label].predict(X_train)) ** 2
        for label in label_list
    ]).argmin(axis=1)

    # New centroid of each cluster = mean of the rows its model fits best.
    centroids = np.stack([
        X_train[labels == label].mean(axis=0)
        for label in label_list
    ])

    # Relabel by nearest updated centroid; this partition feeds the next round.
    labels = cdist(X_train, centroids, metric='euclidean').argmin(axis=1)
    

In [None]:
# Route every test row to its nearest trained centroid.
test_clusters = np.argmin(cdist(X_test, centroids, metric='euclidean'), axis=1)

In [None]:
# Predict the whole test set with every cluster model: one row per test sample,
# one column per entry of label_list.
all_predictions = np.column_stack(
    [model_dict[label].predict(X_test) for label in label_list]
)

In [None]:
# For each test row keep only the prediction from the column of its own cluster.
y_out = all_predictions[np.arange(all_predictions.shape[0]), test_clusters]

# Equivalent, slower pure-Python form:
# np.array([arr[i] for arr, i in zip(all_predictions, test_clusters)])

In [None]:
# Test-set R^2 of the step-by-step cluster-regression run.
r2_score(y_true=y_test, y_pred=y_out)

In [None]:
# K-Means + per-cluster regression with the same k, for comparison.
m_phased = PhasedRegressor(
    learner=LinearRegression(),
    phase_detector=KMeans(n_clusters=k, random_state=42, n_init='auto'),
)
m_phased.fit(X_train, y_train)

y_fit_2 = m_phased.predict(X_train)
y_pred_2 = m_phased.predict(X_test)

In [None]:
# Test-set R^2 of the phased model.
r2_score(y_true=y_test, y_pred=y_pred_2)