In [1]:
from importlib import import_module

lib_names = [('pandas', 'pd'), ('numpy', 'np'), ('matplotlib.pyplot','plt'), ('seaborn','sns')]
for (lib_name, short_hand) in lib_names:
    try:
        lib = import_module(lib_name)
    except:
        print(sys.exc_info())
    else:
        globals()[short_hand] = lib

In [2]:
filename = 'https://github.com/lmassaron/datasets/'
filename += 'releases/download/1.0/wine_quality.feather'
wine = pd.read_feather(filename)


np.random.seed(42)
train = (wine.groupby('quality')
             .apply(lambda x: x.sample(frac=.7))
             .reset_index(drop=True))

test = wine[~wine.index.isin(train.index)]

X_train = train.iloc[:,1:]
y_train = train.iloc[:,0]

X_test = test.iloc[:,1:]
y_test = test.iloc[:,0]

In [3]:
import numpy as np
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

numerical_columns = ['fixed acidity', 'volatile acidity',  'citric acid',\
                     'residual sugar', 'chlorides', 'free sulfur dioxide', 'total sulfur dioxide',\
                     'density', 'pH', 'sulphates', 'alcohol']

categorical_columns = ['red_wine']

numerical_pipe = Pipeline([
    ('imputing', SimpleImputer()),
    ('scaling', StandardScaler())
    
])

categorical_pipe = Pipeline([
    ('imputing', SimpleImputer(strategy='most_frequent'))
])

preprocessing = ColumnTransformer([
    ('cat', categorical_pipe, categorical_columns),
    ('num', numerical_pipe, numerical_columns)
])



decision_tree = DecisionTreeRegressor(max_depth=14)
ridge_regression = Ridge(alpha=1.0)
k_nearest_neighbor = KNeighborsRegressor(n_neighbors=1)



model_labels = ['Decision tree', 'Ridge regression', 'KNN']
models = [decision_tree, ridge_regression, k_nearest_neighbor]


pipes = dict()
    
for model_label, model in zip(model_labels, models):
    pipes[model_label] = Pipeline([
    ('preprocessing', preprocessing),
    ('regressor', AdaBoostRegressor(model, n_estimators=300, random_state=42))])

In [4]:
from sklearn.metrics import mean_absolute_error

for ada_model in pipes:
    pipes[ada_model].fit(X_train, y_train)
    preds = pipes[ada_model].predict(X_test)
    mae = mean_absolute_error(y_test, preds)
    print(f"test mae for {ada_model}: {mae:0.3}")

test mae for Decision tree: 0.151
test mae for Ridge regression: 0.553
test mae for KNN: 0.139
