In [1]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
import numpy as np
import pandas as pd

In [2]:
def importdata():
    data = pd.read_csv('data/data.csv')
    data = data[data['SalePrice'] < 300000]
    data = data[data['SalePrice'] > 50000]
    return data
def get_data_numbers():
    data = importdata()
    data = data[[key for key in data.columns.to_list() if data[key].dtype in ('int64', 'float64')]]
    y = data['SalePrice']
    data = data.drop('SalePrice', axis=1)
    return data, y
def get_data_squared_features():
    data = importdata()
    data = data[[key for key in data.columns.to_list() if data[key].dtype in ('int64', 'float64')]]
    y = data['SalePrice']
    data = data.drop('SalePrice', axis=1)
    new_features = pd.DataFrame({i + j: data[i] * data[j] for i in data.columns.to_list() for j in data.columns.to_list()})
    data = pd.concat([data, new_features], axis=1)
    return data, y
def get_data_with_categories():
    data = importdata()
    data = data[data['SalePrice'] > 50000]
    y = data['SalePrice']
    data = data.drop('SalePrice', axis=1)
    return data, y

In [3]:
def make_pipe_only_numbers():
    imp = SimpleImputer()
    return Pipeline([
        ('imp', imp),
        ('model', GradientBoostingRegressor(n_estimators=100))
    ])
def make_ultimate_pipe(data):
    string_columns = [key for key in data.columns.to_list() if data[key].dtype == 'object']
    continuous_columns = [key for key in data.columns.to_list() if data[key].dtype in ('int64', 'float64')]
    column_tr = ColumnTransformer(
        transformers=[
        ('OneHotPreprocessor', OneHotEncoder(handle_unknown='ignore'), string_columns),
    ])
    imp = SimpleImputer()
    return Pipeline([
        ('columns_transform', column_tr),
        ('Imputer', imp),
        ('model', GradientBoostingRegressor())
    ])

In [4]:
def check1():
    X, Y = get_data_with_categories()
    xtr, xt, ytr, yt = train_test_split(X, Y)
    model1 = make_ultimate_pipe(X)
    model1.fit(xtr, ytr)
    return model1, xt, yt

In [5]:
model1, xt1, yt1 = check1()

In [6]:
print(mean_absolute_error(yt1, model1.predict(xt1)))
print(mean_absolute_percentage_error(yt1, model1.predict(xt1)))

18615.36765363571
0.12350124713091667


In [7]:
def check2():
    X, Y = get_data_squared_features()
    xtr, xt, ytr, yt = train_test_split(X, Y)
    model = make_pipe_only_numbers()
    model.fit(xtr, ytr)
    return model, xt, yt

In [8]:
#model2, xt2, yt2 = check2()

In [9]:
# print(mean_absolute_error(yt2, model2.predict(xt2)))
# print(mean_absolute_percentage_error(yt2, model2.predict(xt2)))

In [10]:
def check3():
    X, Y = get_data_numbers()
    xtr, xt, ytr, yt = train_test_split(X, Y)
    model = make_pipe_only_numbers()
    model.fit(xtr, ytr)
    return model, xt, yt

In [11]:
model3, xt3, yt3 = check3()

In [12]:
print(mean_absolute_error(yt3, model3.predict(xt3)))
print(mean_absolute_percentage_error(yt3, model3.predict(xt3)))

12740.91305539015
0.0820708476873907


In [13]:
param_grid = {
    'model__n_estimators': [100, 300],
    'model__min_samples_leaf': [1, 2, 4],
}

In [14]:
m1 = GridSearchCV(estimator=make_pipe_only_numbers(), cv=2, param_grid=param_grid, scoring='neg_mean_absolute_percentage_error')

In [15]:
m1.fit(*get_data_numbers())

In [None]:
m1