In [1]:
from SafeTransformer import SafeTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

In [2]:
# Read the apartments table; the first CSV column becomes the row index.
apartments = pd.read_csv('apartments.csv', index_col=0)

# The 'm2.price' column is the target; every remaining column is a predictor.
y = apartments['m2.price']
X_ap = apartments.drop(columns=['m2.price'])

In [3]:
# Build X: a copy of X_ap in which every 'category'/'object' column is replaced,
# at its original position, by its one-hot dummy columns. Numeric columns are
# carried over unchanged, so the column order of X follows that of X_ap.
colnames = list(X_ap)
encoded_parts = []
for name in colnames:
    column = X_ap[name]
    if str(column.dtype) in ['category', 'object']:
        # drop_first=True leaves out the first level of each variable, so a
        # variable with k levels contributes k-1 dummy columns.
        encoded_parts.append(pd.get_dummies(column, prefix=name, drop_first=True))
    else:
        encoded_parts.append(column)

# Assemble the frame once instead of re-slicing and re-concatenating it for
# every categorical column. An input without columns is simply copied, because
# pd.concat rejects an empty list.
X = pd.concat(encoded_parts, axis=1) if encoded_parts else X_ap.copy()

In [4]:
# Split the raw frame (X_ap), its one-hot encoded counterpart (X) and the target
# in a single call so that all three share the same train/test rows.
# random_state pins the shuffle; without it every kernel restart produces a
# different partition and the results below cannot be reproduced.
X_train, X_test, X_lin_train, X_lin_test, y_train, y_test = train_test_split(
    X_ap, X, y, random_state=42
)

In [5]:
# Gradient-boosted surrogate model handed to SafeTransformer.
# random_state is fixed so that refitting on the same data yields the same
# surrogate when the notebook is re-run.
surrogate_model = GradientBoostingRegressor(
    n_estimators=1000,
    max_depth=4,
    learning_rate=0.1,
    loss='huber',
    random_state=42
)
# NOTE(review): the meaning of `penalty` and `dependence_method` is defined by the
# SafeTransformer package and is not visible in this notebook — confirm against its docs.
safe_transformer = SafeTransformer(surrogate_model, penalty = 10, dependence_method = "shap")
# Fit on the raw (non-encoded) training frame; the call is left as the last
# expression so the cell displays its return value.
safe_transformer.fit(X_train, y_train)

90
90
130
130
10
10
6
6


<SafeTransformer.SafeTransformer.SafeTransformer at 0x7f58f0243da0>

In [6]:
# Print the fitted transformer's per-variable summary (the selected intervals
# for each numerical variable, as shown in the output below).
safe_transformer.summary()

Numerical Variable construction.year
Selected intervals:
	[-Inf, 1922.00)
	[1922.00, 1924.00)
	[1924.00, 1926.00)
	[1926.00, 1929.00)
	[1929.00, 1933.00)
	[1933.00, 1935.00)
	[1935.00, 1938.00)
	[1938.00, 1940.00)
	[1940.00, 1942.00)
	[1942.00, 1944.00)
	[1944.00, 1947.00)
	[1947.00, 1949.00)
	[1949.00, 1951.00)
	[1951.00, 1953.00)
	[1953.00, 1956.00)
	[1956.00, 1958.00)
	[1958.00, 1961.00)
	[1961.00, 1964.00)
	[1964.00, 1967.00)
	[1967.00, 1969.00)
	[1969.00, 1971.00)
	[1971.00, 1973.00)
	[1973.00, 1975.00)
	[1975.00, 1978.00)
	[1978.00, 1982.00)
	[1982.00, 1984.00)
	[1984.00, 1986.00)
	[1986.00, 1988.00)
	[1988.00, 1990.00)
	[1990.00, 1992.00)
	[1992.00, 1995.00)
	[1995.00, 1997.00)
	[1997.00, 1999.00)
	[1999.00, 2001.00)
	[2001.00, 2003.00)
	[2003.00, 2005.00)
	[2005.00, 2007.00)
	[2007.00, 2009.00)
	[2009.00, Inf)
Numerical Variable surface
Selected intervals:
	[-Inf, 23.00)
	[23.00, 25.00)
	[25.00, 28.00)
	[28.00, 31.00)
	[31.00, 34.00)
	[34.00, 37.00)
	[37.00, 39.00)
	[39.00, 41.

In [7]:
# Sorted distinct values found in the first column of the encoded frame X.
np.unique(X.iloc[:, 0].values)

array([1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930,
       1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941,
       1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952,
       1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963,
       1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974,
       1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985,
       1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996,
       1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007,
       2008, 2009, 2010])

In [None]:
safe_transformer.