In [1]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.cluster import KMeans
from sklearn.datasets import load_diabetes
from sklearn.base import BaseEstimator,clone

import pandas as pd
import numpy as np


In [2]:
# Load the diabetes regression set as a feature DataFrame and a target Series.
X, y = load_diabetes(return_X_y=True, as_frame=True)

In [3]:
# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)

In [4]:
# Baseline: one linear model fitted on the whole training set.
model = LinearRegression().fit(X_train, y_train)

# Wrap the raw predictions in Series so they keep the row labels of their inputs.
y_fit = pd.Series(model.predict(X_train), index=X_train.index)
y_pred = pd.Series(model.predict(X_test), index=X_test.index)


In [5]:
# Hold-out R^2 of the single-model baseline.
r2_score(y_true=y_test, y_pred=y_pred)

0.4526027629719196

In [47]:
class PhasedRegressor(BaseEstimator):
    """Piecewise regressor: assign rows to phases, then fit one learner per phase.

    Parameters
    ----------
    learner : estimator
        Unfitted regressor prototype. A fresh ``clone`` of it is fitted for
        every phase found in the training data.
    phase_detector : estimator
        Unfitted estimator exposing ``fit(X)`` and ``predict(X)`` that returns
        one phase label per row (a clusterer by default).

    Attributes
    ----------
    phase_detector_ : estimator
        Fitted clone of ``phase_detector``; set by ``fit``.
    phases : list
        Distinct phase labels seen during ``fit``, in sorted order.
    models : dict
        Maps each phase label to the learner fitted on that phase's rows.
    """

    def __init__(self,learner=LinearRegression(),
                 phase_detector=KMeans(n_clusters=2,random_state=42,n_init='auto')) -> None:
        # The default estimators are created once, at class-definition time, and
        # are therefore shared by every instance. They are only stored here and
        # always cloned before fitting, so the shared objects are never mutated.
        self.learner = learner
        self.phase_detector = phase_detector

    def fit(self,X,y):
        """Fit the phase detector on ``X`` and one learner per detected phase.

        Parameters
        ----------
        X : pandas.DataFrame or numpy.ndarray
            Training features, one row per sample.
        y : pandas.Series or numpy.ndarray
            Training targets aligned row-by-row with ``X``.

        Returns
        -------
        self
        """
        # Fit a clone so the constructor parameter (possibly the shared default
        # instance) stays untouched and instances cannot affect each other.
        self.phase_detector_ = clone(self.phase_detector)
        self.phase_detector_.fit(X)

        phases = np.asarray(self.phase_detector_.predict(X))

        self.models = {}
        # np.unique gives a deterministic, sorted label order.
        self.phases = list(np.unique(phases))

        for phase in self.phases:
            in_phase = phases == phase
            model = clone(self.learner)
            model.fit(X[in_phase], y[in_phase])
            self.models[phase] = model

        return self

    def predict(self,X):
        """Predict each row with the learner of the phase the row falls into.

        Parameters
        ----------
        X : pandas.DataFrame or numpy.ndarray
            Features to predict, one row per sample.

        Returns
        -------
        pandas.Series
            Float predictions in the row order of ``X``. The index is
            ``X.index`` when ``X`` is a pandas object, otherwise a default
            ``RangeIndex``.

        Raises
        ------
        ValueError
            If the detector assigns a row to a phase that was not seen in ``fit``.
        """
        phases = np.asarray(self.phase_detector_.predict(X))

        unseen = ~np.isin(phases, self.phases)
        if unseen.any():
            raise ValueError(
                f"phase detector returned labels not seen during fit: "
                f"{sorted(set(phases[unseen].tolist()))}"
            )

        if isinstance(X, (pd.DataFrame, pd.Series)):
            index = X.index
        else:
            index = pd.RangeIndex(len(phases))

        # Fill by position rather than by label, so duplicate index labels
        # cannot multiply rows and the input order is kept without a reindex.
        y_hat = np.empty(len(phases), dtype=float)
        for phase in self.phases:
            in_phase = phases == phase
            # A phase may have no rows in this batch; estimators reject empty input.
            if not in_phase.any():
                continue
            y_hat[in_phase] = self.models[phase].predict(X[in_phase])

        return pd.Series(y_hat, index=index)




In [48]:
# Phased model: a min-max scaled linear regression per phase, with the phases
# found by K-Means on the unscaled features (no scaler step in the detector).
m_phased = PhasedRegressor(
    learner=Pipeline(
        steps=[
            ("scaler", MinMaxScaler()),
            ("learner", LinearRegression()),
        ]
    ),
    phase_detector=Pipeline(
        steps=[
            ("cluster", KMeans(n_clusters=2, random_state=42, n_init="auto")),
        ]
    ),
)
m_phased.fit(X_train, y_train)

y_fit_2 = m_phased.predict(X_train)
y_pred_2 = m_phased.predict(X_test)

In [49]:
# Hold-out R^2 of the phased model, for comparison with the baseline.
r2_score(y_true=y_test, y_pred=y_pred_2)

0.5484612333457959

In [13]:
# Manual re-implementation of the phased model as a cross-check:
# K-Means on the unscaled features splits the rows into two phases.
clusterpipe = Pipeline(
    steps=[
        ("learner", KMeans(n_clusters=2, random_state=42, n_init="auto")),
    ]
)

clusters = clusterpipe.fit(X_train)
phases = clusters.predict(X_train)

models = {}

# Note the pairing: m1 is trained on phase 1 and m2 on phase 0.
m1 = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("learner", LinearRegression()),
    ]
)
m1.fit(X_train[phases == 1], y_train[phases == 1])

m2 = Pipeline(
    steps=[
        ("scaler", MinMaxScaler()),
        ("learner", LinearRegression()),
    ]
)
m2.fit(X_train[phases == 0], y_train[phases == 0])

test_phases = clusters.predict(X_test)

# Score every test row with both models, then keep the one matching its phase.
y_pred_3a = m1.predict(X_test)
y_pred_3b = m2.predict(X_test)

y_pred_3 = np.where(test_phases == 0, y_pred_3b, y_pred_3a)
r2_score(y_true=y_test, y_pred=y_pred_3)

0.5484612333457959

In [14]:
# Coefficients of the linear step of the phase-1 model.
m1.named_steps["learner"].coef_

array([  30.02980749,  -19.76916428,  149.48552181,   80.34963348,
       -408.73238331,  317.88236326,   93.67878524,   96.57298029,
        206.92536503,   24.37251176])

In [15]:
# Coefficients of the linear step of the phase-0 model.
m2.named_steps["learner"].coef_

array([  -2.23604303,  -30.24622829,   72.61588291,   73.83905683,
        956.93693181, -748.35824352, -615.46574734,  -15.78482306,
       -106.69843169,  -14.68376578])

In [16]:
# Variant of the manual experiment: K-Means now sees min-max scaled features.
clusterpipe = Pipeline(steps=[
    ('scaler',MinMaxScaler()),
    ('learner',KMeans(n_clusters=2,random_state=42,n_init='auto',))
])

clusters = clusterpipe.fit(X_train)
phases = clusters.predict(X_train)

# One scaled linear model per phase. Built with a comprehension so that no loop
# temporary rebinds the global `m1` from the unscaled experiment; sorted() makes
# the phase order (and therefore the row order of `y_out`) deterministic.
models = {
    phase: Pipeline(
        steps=[
            ('scaler',MinMaxScaler()),
            ('learner',LinearRegression())
        ]).fit(
        X_train[phases==phase],
        y_train[phases==phase]
    )
    for phase in sorted(set(phases))
}

test_phases = clusters.predict(X_test)

predictions = []
for phase in models:

    X_sub = X_test[test_phases==phase]
    # A phase can be absent from the test split; estimators reject empty input.
    if X_sub.empty:
        continue
    y_hat = pd.Series(models[phase].predict(X_sub),index=X_sub.index)
    predictions.append(y_hat)

# Rows come out grouped by phase, not in X_test order; re-align by index label
# before comparing against y_test.
y_out = pd.concat(predictions)



In [19]:
# Show the fitted per-phase pipelines, keyed by phase label.
models

{0: Pipeline(steps=[('scaler', MinMaxScaler()), ('learner', LinearRegression())]),
 1: Pipeline(steps=[('scaler', MinMaxScaler()), ('learner', LinearRegression())])}

In [20]:

# Cross-check of the per-phase loop: score every test row with both phase
# models, then keep the prediction that matches the row's detected phase.
y_pred_3a = models[0].predict(X_test)
y_pred_3b = models[1].predict(X_test)

y_pred_3 = np.where(test_phases == 0, y_pred_3a, y_pred_3b)
r2_score(y_true=y_test, y_pred=y_pred_3)

0.5052553198901297

In [22]:
# Show the concatenated per-phase predictions (rows are grouped by phase).
y_out

287    143.739410
211    160.357855
321    276.692900
418     97.365282
429    104.225632
          ...    
203    196.194665
42     129.026063
423    144.189722
155    189.319435
176    167.366096
Length: 89, dtype: float64

In [24]:
# y_out is grouped by phase, so re-align it to y_test by label before scoring.
r2_score(y_true=y_test, y_pred=y_out.loc[y_test.index])

0.5052553198901297