In [651]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

In [652]:
# Columns to read from the cancer regression CSV (passed as `usecols` when loading).
features_in = [
    # case counts and rates
    "avgAnnCount",
    # insurance coverage
    "PctPrivateCoverage",
    "PctPublicCoverage",
    # education and marital status
    "PctBachDeg25_Over",
    "PercentMarried",
    "incidenceRate",
    # socio-economic indicators
    "povertyPercent",
    "MedianAge",
    "medIncome",
    # race composition
    "PctWhite",
    "PctBlack",
    "PctAsian",
    "PctOtherRace",
    "avgDeathsPerYear",
]

In [653]:
# Load only the selected feature columns, parsing every one of them as float.
df = pd.read_csv(
    "./data/cancer_reg.csv",
    usecols=features_in,
    dtype=float,
    encoding="iso-8859-1",
)

In [654]:
# Display the loaded frame as the cell output.
df

Unnamed: 0,avgAnnCount,avgDeathsPerYear,incidenceRate,medIncome,povertyPercent,MedianAge,PercentMarried,PctBachDeg25_Over,PctPrivateCoverage,PctPublicCoverage,PctWhite,PctBlack,PctAsian,PctOtherRace
0,1397.000000,469.0,489.800000,61898.0,11.2,39.3,52.5,19.6,75.1,32.9,81.780529,2.594728,4.821857,1.843479
1,173.000000,70.0,411.600000,48127.0,18.6,33.0,44.5,22.7,70.2,31.1,89.228509,0.969102,2.246233,3.741352
2,102.000000,50.0,349.700000,49348.0,14.6,45.0,54.2,16.0,63.7,42.1,90.922190,0.739673,0.465898,2.747358
3,427.000000,202.0,430.400000,44243.0,17.1,42.8,52.7,9.3,58.4,45.3,91.744686,0.782626,1.161359,1.362643
4,57.000000,26.0,350.100000,49955.0,12.5,48.3,57.8,15.0,61.6,44.0,94.104024,0.270192,0.665830,0.492135
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3042,1962.667684,15.0,453.549422,46961.0,12.4,44.2,51.0,15.2,78.3,31.7,90.280811,3.837754,0.327613,1.700468
3043,1962.667684,43.0,453.549422,48609.0,18.8,30.4,52.6,12.4,64.5,28.8,75.706245,2.326771,4.044920,14.130288
3044,1962.667684,46.0,453.549422,51144.0,15.0,30.9,54.8,12.8,62.0,26.6,87.961629,2.313188,1.316472,5.680705
3045,1962.667684,52.0,453.549422,50745.0,13.3,39.0,58.8,14.4,75.9,29.5,92.905681,1.176562,0.244632,2.131790


In [655]:
import numpy as np
from sklearn.linear_model import LinearRegression

class FWL_Residualizer:
    """Replace strongly correlated feature columns by regression residuals.

    For every column whose absolute Pearson correlation with at least one
    other column reaches ``threshold``, ``fit`` trains a ``LinearRegression``
    that predicts the column from those correlated columns.  ``transform``
    then replaces the column by ``observed - predicted``; columns without a
    model pass through unchanged.

    (The "FWL" in the name presumably refers to the Frisch-Waugh-Lovell
    theorem -- TODO confirm with the author.)

    Attributes
    ----------
    models : dict
        Maps a target column index to ``{"model", "X", "y"}`` where ``"X"``
        is the array of predictor column indices and ``"y"`` the target
        column index.
    residualized_columns : list
        Column indices that received a model during the last ``fit``.
    feature_names : Index or None
        ``X.columns`` of the object passed to ``fit`` when it has that
        attribute, otherwise ``None``.
    """

    def __init__(self):
        """Create an unfitted residualizer with empty state."""
        self.models = {}
        self.residualized_columns = []
        self.feature_names = None

    def fit(self, X, threshold=0.8, fit_intercept=True):
        """Fit one regression per column that is correlated with other columns.

        Parameters
        ----------
        X : DataFrame or array-like, shape (n_samples, n_features)
            Explanatory variables.
        threshold : float, default 0.8
            Minimum absolute correlation for another column to be used as a
            predictor of the current column.
        fit_intercept : bool, default True
            Forwarded to ``LinearRegression``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        self.feature_names = X.columns if hasattr(X, "columns") else None

        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")

        # Drop models from any previous fit so stale entries cannot leak
        # into transform() after refitting with another threshold or dataset.
        self.models = {}
        self.residualized_columns = []

        # atleast_2d: corrcoef collapses to a scalar for a single column.
        corr = np.atleast_2d(np.abs(np.corrcoef(X, rowvar=False)))

        for i in range(X.shape[1]):
            correlated = corr[i] >= threshold
            correlated[i] = False  # a column must never predict itself
            indexes = np.flatnonzero(correlated)

            if indexes.size:
                model = LinearRegression(fit_intercept=fit_intercept)
                model.fit(X[:, indexes], X[:, i])

                self.models[i] = {"model": model, "X": indexes, "y": i}
                self.residualized_columns.append(i)

        return self

    def transform(self, X):
        """Return a residualized copy of ``X``.

        Every column that has a fitted model is replaced by
        ``column - model.predict(predictor columns)``.  Predictions are always
        computed from the untouched input columns (the same columns the models
        were fitted on), and the input object itself is never modified.

        Parameters
        ----------
        X : DataFrame or array-like, shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of float, same shape as ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        source = np.asarray(X, dtype=float)
        if source.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")

        # Work on a copy: np.asarray may return a view of the caller's data.
        result = source.copy()

        for i, model_info in self.models.items():
            pred = model_info["model"].predict(source[:, model_info["X"]])
            result[:, i] = source[:, i] - pred

        return result

    def fit_transform(self, X, threshold=0.8, fit_intercept=True):
        """Fit on ``X`` and return its residualized copy (see fit/transform)."""
        self.fit(X, threshold=threshold, fit_intercept=fit_intercept)
        return self.transform(X)

In [656]:
%%time
# Instantiate the residualizer; the constructor only initialises empty attributes.
residualizer = FWL_Residualizer()

CPU times: user 2 μs, sys: 1e+03 ns, total: 3 μs
Wall time: 6.44 μs


In [657]:
# Fit with a correlation threshold of 0.20, well below fit()'s default of 0.8.
residualizer.fit(df, threshold=0.20)

<__main__.FWL_Residualizer at 0x7fa45bfceef0>

In [658]:
# Inspect the fitted per-column regressions (keyed by target column index).
residualizer.models

{0: {'model': LinearRegression(), 'X': array([ 1,  3,  7, 12, 13]), 'y': 0},
 1: {'model': LinearRegression(), 'X': array([ 0,  3,  7, 12, 13]), 'y': 1},
 2: {'model': LinearRegression(), 'X': array([13]), 'y': 2},
 3: {'model': LinearRegression(),
  'X': array([ 0,  1,  4,  6,  7,  8,  9, 11, 12]),
  'y': 3},
 4: {'model': LinearRegression(),
  'X': array([ 3,  6,  7,  8,  9, 10, 11]),
  'y': 4},
 6: {'model': LinearRegression(),
  'X': array([ 3,  4,  8,  9, 10, 11]),
  'y': 6},
 7: {'model': LinearRegression(),
  'X': array([ 0,  1,  3,  4,  8,  9, 12]),
  'y': 7},
 8: {'model': LinearRegression(),
  'X': array([ 3,  4,  6,  7,  9, 10, 11]),
  'y': 8},
 9: {'model': LinearRegression(),
  'X': array([ 3,  4,  6,  7,  8, 12]),
  'y': 9},
 10: {'model': LinearRegression(),
  'X': array([ 4,  6,  8, 11, 12, 13]),
  'y': 10},
 11: {'model': LinearRegression(), 'X': array([ 3,  4,  6,  8, 10]), 'y': 11},
 12: {'model': LinearRegression(),
  'X': array([ 0,  1,  3,  7,  9, 10, 13]),
  'y':

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

class FWL_Residualizer:
    """Replace strongly correlated feature columns by regression residuals.

    NOTE: this cell redefines a class of the same name declared earlier in
    the notebook; once executed, this definition shadows the earlier one.

    For every column whose absolute Pearson correlation with at least one
    other column reaches ``threshold``, ``fit`` trains a ``LinearRegression``
    that predicts the column from those correlated columns.  ``transform``
    then replaces the column by ``observed - predicted``; columns without a
    model pass through unchanged.

    Attributes
    ----------
    models : dict
        Maps a target column index to ``{"model", "X", "y"}`` where ``"X"``
        is the array of predictor column indices and ``"y"`` the target
        column index.
    residualized_columns : list
        Column indices that received a model during the last ``fit``.
    feature_names : Index or None
        ``X.columns`` of the object passed to ``fit`` when it has that
        attribute, otherwise ``None``.
    """

    def __init__(self):
        """Create an unfitted residualizer with empty state."""
        self.models = {}
        self.residualized_columns = []
        self.feature_names = None

    def fit(self, X, threshold=0.8, fit_intercept=True):
        """Fit one regression per column that is correlated with other columns.

        Parameters
        ----------
        X : DataFrame or array-like, shape (n_samples, n_features)
            Explanatory variables.
        threshold : float, default 0.8
            Minimum absolute correlation for another column to be used as a
            predictor of the current column.
        fit_intercept : bool, default True
            Forwarded to ``LinearRegression``.

        Returns
        -------
        self

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        self.feature_names = X.columns if hasattr(X, "columns") else None

        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")

        # Drop models from any previous fit so stale entries cannot leak
        # into transform() after refitting with another threshold or dataset.
        self.models = {}
        self.residualized_columns = []

        # atleast_2d: corrcoef collapses to a scalar for a single column.
        corr = np.atleast_2d(np.abs(np.corrcoef(X, rowvar=False)))

        for i in range(X.shape[1]):
            # Build the selection on the full-length row so the resulting
            # indices refer to real column positions.  Indexing the row with
            # a "everything but i" mask first would shift every index >= i.
            correlated = corr[i] >= threshold
            correlated[i] = False  # a column must never predict itself
            indexes = np.flatnonzero(correlated)

            if indexes.size:
                model = LinearRegression(fit_intercept=fit_intercept)
                model.fit(X[:, indexes], X[:, i])

                self.models[i] = {"model": model, "X": indexes, "y": i}
                self.residualized_columns.append(i)

        return self

    def transform(self, X):
        """Return a residualized copy of ``X``.

        Every column that has a fitted model is replaced by
        ``column - model.predict(predictor columns)``.  Predictions are always
        computed from the untouched input columns (the same columns the models
        were fitted on), and the input object itself is never modified.

        Parameters
        ----------
        X : DataFrame or array-like, shape (n_samples, n_features)

        Returns
        -------
        numpy.ndarray of float, same shape as ``X``.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional.
        """
        source = np.asarray(X, dtype=float)
        if source.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")

        # Work on a copy: np.asarray may return a view of the caller's data.
        result = source.copy()

        for i, model_info in self.models.items():
            pred = model_info["model"].predict(source[:, model_info["X"]])
            # Write the residual into column i (not row i).
            result[:, i] = source[:, i] - pred

        return result

    def fit_transform(self, X, threshold=0.8, fit_intercept=True):
        """Fit on ``X`` and return its residualized copy (see fit/transform)."""
        self.fit(X, threshold=threshold, fit_intercept=fit_intercept)
        return self.transform(X)

In [345]:
# NOTE(review): this cell's execution count (345) is lower than that of the cells above,
# so the displayed result presumably comes from an earlier kernel state -- re-run after fitting.
residualizer.models

{}