In [34]:
from SafeTransformer import SafeTransformer
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

In [35]:
# Load the apartments data set; the first CSV column is used as the row index.
apartments = pd.read_csv(filepath_or_buffer='apartments.csv', index_col=0)

In [36]:
# Feature frame: every column of `apartments` except the target 'm2.price'.
X_ap = apartments.drop('m2.price', axis=1)

In [37]:
# Regression target: the 'm2.price' column as a Series.
y = apartments.loc[:, 'm2.price']

In [38]:
# One-hot encode every 'category'/'object' column of X_ap for the plain linear
# model. Each encoded group is placed where the column it replaces used to be,
# so the relative order of the remaining columns is unchanged.
#
# The pieces are collected first and concatenated once, instead of rebuilding
# the whole frame with pd.concat for every categorical column.
colnames = list(X_ap)
encoded_parts = []
for name in colnames:
    column = X_ap[name]
    if str(column.dtype) in ('category', 'object'):
        # drop_first=True keeps k-1 indicator columns for a variable with k levels.
        encoded_parts.append(pd.get_dummies(column, prefix=name, drop_first=True))
    else:
        encoded_parts.append(column)

# pd.concat rejects an empty list; fall back to a plain copy when X_ap has no columns.
X = pd.concat(encoded_parts, axis=1) if encoded_parts else X_ap.copy()

In [73]:
# Split the raw features (X_ap), the one-hot encoded features (X) and the
# target in a single call so all three share the same row partition.
# random_state pins the shuffle; without it every run produces a different
# split and the errors reported in the cells below cannot be reproduced.
X_train, X_test, X_lin_train, X_lin_test, y_train, y_test = train_test_split(
    X_ap, X, y, random_state=42)

In [74]:
# Baseline: linear regression fitted on the one-hot encoded training features
# and scored by mean squared error on the held-out rows.
linear_model = LinearRegression().fit(X_lin_train, y_train)
standard_predictions = linear_model.predict(X_lin_test)
standard_predictions_error = mean_squared_error(y_true=y_test, y_pred=standard_predictions)
standard_predictions_error

76490.426068932211

In [84]:
# Search over SafeTransformer penalties: for each value, fit a
# SafeTransformer -> LinearRegression pipeline on the raw training features
# and score it by mean squared error on X_test.
#
# NOTE(review): the penalty is selected on the same split that is used to
# report the error, so best_score is an optimistic estimate. Consider a
# separate validation split or cross-validation for the selection step.
pens = [1, 2, 5, 10, 12, 20, 30, 50, 100, 200]
best_score = float('Inf')
best_pen = 0
best_pipe = None  # fitted pipeline that achieved best_score

for pen in pens:
    # Fresh estimators on every iteration so no fitted state is shared between
    # penalties. random_state makes the boosting surrogate repeatable across runs.
    surrogate_model = GradientBoostingRegressor(n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        loss='huber',
        random_state=42)
    linear_model_simple = LinearRegression()
    safe_transformer = SafeTransformer(surrogate_model, penalty = pen)
    pipe = Pipeline(steps=[('safe', safe_transformer), ('linear', linear_model_simple)])
    pipe = pipe.fit(X_train, y_train)
    predictions = pipe.predict(X_test)
    error = mean_squared_error(y_test, predictions)
    print(error)
    if error < best_score:
        best_score = error
        best_pen = pen
        # Keep the winning fitted pipeline; `pipe` and `safe_transformer` are
        # overwritten on the next iteration and end up holding the last penalty.
        best_pipe = pipe

1529.19661499
1505.64185638
1560.95831645
1533.266815
1572.24222811
1469.17453314
1462.62747531
1514.59302992
1533.80277857
1538.91244001


In [82]:
# Reference point: the gradient boosting model on its own, fitted on the
# one-hot encoded training features and scored by mean squared error on the
# held-out rows.
# random_state fixes the estimator's internal randomness so the reported error
# is repeatable for a given train/test split.
surrogate_model = GradientBoostingRegressor(n_estimators=100,
        max_depth=4,
        learning_rate=0.1,
        loss='huber',
        random_state=42)
surrogate_model = surrogate_model.fit(X_lin_train, y_train)
surrogate_model_predictions = surrogate_model.predict(X_lin_test)
surrogate_model_predictions_error = mean_squared_error(y_test, surrogate_model_predictions)
surrogate_model_predictions_error

9665.2767186140045

In [83]:
# Print the intervals SafeTransformer selected for each variable.
# NOTE(review): `safe_transformer` is whichever instance was assigned last in
# the penalty loop, not necessarily the one fitted with `best_pen` - confirm
# this is the transformer intended for inspection.
safe_transformer.summary()

Numerical Variable construction.year
Selected intervals:
	[-Inf, 1930.36)
	[1930.36, 1934.41)
	[1934.41, 1934.86)
	[1934.86, 1935.32)
	[1935.32, 1935.77)
	[1935.77, 1936.22)
	[1936.22, 1937.57)
	[1937.57, 1938.47)
	[1938.47, 1939.82)
	[1939.82, 1942.52)
	[1942.52, 1943.87)
	[1943.87, 1978.56)
	[1978.56, 1988.47)
	[1988.47, 1993.42)
	[1993.42, 1994.32)
	[1994.32, 1994.77)
	[1994.77, 1995.23)
	[1995.23, 1995.68)
	[1995.68, 1996.58)
	[1996.58, 1998.38)
	[1998.38, 2008.29)
	[2008.29, Inf)
Numerical Variable surface
Selected intervals:
	[-Inf, 22.60)
	[22.60, 25.21)
	[25.21, 27.16)
	[27.16, 29.11)
	[29.11, 30.41)
	[30.41, 31.71)
	[31.71, 33.66)
	[33.66, 36.27)
	[36.27, 37.57)
	[37.57, 38.22)
	[38.22, 38.87)
	[38.87, 39.52)
	[39.52, 40.82)
	[40.82, 42.12)
	[42.12, 44.72)
	[44.72, 46.68)
	[46.68, 47.33)
	[47.33, 50.58)
	[50.58, 56.44)
	[56.44, 57.09)
	[57.09, 59.69)
	[59.69, 61.64)
	[61.64, 62.29)
	[62.29, 62.94)
	[62.94, 64.24)
	[64.24, 68.80)
	[68.80, 71.40)
	[71.40, 76.61)
	[76.61, 78.56)
