In [168]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.datasets import load_diabetes
from sklearn.base import BaseEstimator,clone

import pandas as pd
import numpy as np


In [169]:
X,y = load_diabetes(return_X_y=True,as_frame=True)

In [170]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=.2,random_state=42,shuffle=True)

In [171]:
model = LinearRegression().fit(X_train,y_train)
y_fit,y_pred = pd.Series(model.predict(X_train),index=X_train.index),pd.Series(model.predict(X_test),index=X_test.index)


In [172]:
r2_score(y_test,y_pred)

0.4526027629719196

In [177]:
class PhasedRegressor(BaseEstimator):

    def __init__(self,learner=LinearRegression(),
                 phase_detector=KMeans(n_clusters=2,random_state=42,n_init='auto')) -> None:
        super().__init__()

        self.learner = learner
        self.phase_detector = phase_detector

    def fit(self,X,y):

        self.phase_detector.fit(X)

        phases = self.phase_detector.predict(X)

        self.models = {}

        for phase in list(set(phases)):
            X_sub,y_sub = X[phases==phase],y[phases==phase]
            model = clone(self.learner)
            model.fit(X_sub,y_sub)
            self.models[phase]=model

        return self
    
    def predict(self,X):

        phases = self.phase_detector.predict(X)

        sort_index = False
        if type(X)==np.ndarray:
            X = pd.DataFrame(X)
            sort_index=True

        predictions = []
        for phase in list(set(phases)):
            X_sub = X[phases==phase]
            y_pred = pd.Series(self.models[phase].predict(X_sub),index=X_sub.index)
            predictions.append(y_pred)

        y_out = pd.concat(predictions)[X.index]
        

        return y_out




In [184]:
m_phased= PhasedRegressor(
    learner=Pipeline(
        steps=[('scaler',MinMaxScaler()),
               ('learner',LinearRegression())]
        ),
    phase_detector=Pipeline(
        steps=[
            ('scaler',MinMaxScaler()),
            ('cluster',KMeans(n_clusters=2,
                              random_state=42,
                              n_init='auto'))
            ]
        ),
)
m_phased.fit(X_train,y_train)
y_fit_2,y_pred_2 = m_phased.predict(X_train),m_phased.predict(X_test)

In [191]:
m_phased.models[0]['learner'].coef_
#m_phased.models[1]['learner'].coef_

array([-2.09892263e+01, -1.42108547e-13,  1.20705545e+02,  7.58307792e+01,
       -3.69506651e+02,  2.66706408e+02,  1.13368483e+02,  8.90691724e+01,
        2.33246441e+02,  1.12675196e+00])

In [185]:
r2_score(y_pred_2,y_test)

0.11794716352895751

In [None]:
y_pred_2

In [None]:
y_test

In [None]:
r2_score(y_pred_2[y_test.index],y_test)

In [192]:
clusterpipe = Pipeline(steps=[
    ('scaler',MinMaxScaler()),
    ('learner',KMeans(n_clusters=2,random_state=42,n_init='auto',))
])

clusters = clusterpipe.fit(X_train)
phases = clusters.predict(X_train)

models = {}

m1 = Pipeline(
    steps=[
    ('scaler',MinMaxScaler()),
    ('learner',LinearRegression())
    ]).fit(
    X_train[phases==1],
    y_train[phases==1]
    #learner__sample_weight=weights
)

m2 = Pipeline(
    steps=[
    ('scaler',MinMaxScaler()),
    ('learner',LinearRegression())
    ]).fit(
    X_train[phases==0],
    y_train[phases==0],
    #learner__sample_weight=(1-weights)
)

test_phases = clusters.predict(X_test)

y_pred_3a = m1.predict(X_test)
y_pred_3b = m2.predict(X_test)

y_pred_3=np.where(test_phases==0,y_pred_3b,y_pred_3a)
r2_score(y_test,y_pred_3)

0.5052553198901297

In [189]:
m1['learner'].coef_

array([  137.78155368,  -207.39404378,   663.60523181,   382.59196098,
       -1580.07478883,  1060.72843836,   530.10140134,   429.73307733,
        1015.30235634,   120.08338012])

In [190]:
m2['learner'].coef_

array([  -11.83767991,  -317.30666534,   333.53177012,   302.07137143,
        4967.65841785, -4492.0815502 , -2199.62797768,  -142.586663  ,
        -575.12938558,   -69.50978863])

In [None]:
clusterpipe = Pipeline(steps=[
    ('scaler',MinMaxScaler()),
    ('learner',KMeans(n_clusters=2,random_state=42,n_init='auto',))
])

clusters = clusterpipe.fit(X_train)
phases = clusters.predict(X_train)

models = {}

for phase in list(set(phases)):

    m1 = Pipeline(
        steps=[
        ('scaler',MinMaxScaler()),
        ('learner',LinearRegression())
        ]).fit(
        X_train[phases==1],
        y_train[phases==1]
        #learner__sample_weight=weights
    )
    
test_phases = clusters.predict(X_test)

y_pred_3a = m1.predict(X_test)
y_pred_3b = m2.predict(X_test)

y_pred_3=np.where(test_phases==0,y_pred_3b,y_pred_3a)
r2_score(y_test,y_pred_3)