In [3]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.compose import make_column_transformer

In [8]:
df_wine=pd.read_csv('data/Red.csv')

In [4]:
ct=make_column_transformer(
    (OrdinalEncoder(), ['Region']),
    (StandardScaler(), ['Price']),
    (OneHotEncoder(), ['Country'])
)

In [19]:
pipeline=Pipeline([('ct', ct), ('rf', RandomForestRegressor(random_state=42))])
print (pipeline)

Pipeline(steps=[('ct',
                 ColumnTransformer(transformers=[('ordinalencoder',
                                                  OrdinalEncoder(),
                                                  ['Region']),
                                                 ('standardscaler',
                                                  StandardScaler(), ['Price']),
                                                 ('onehotencoder',
                                                  OneHotEncoder(),
                                                  ['Country'])])),
                ('rf', RandomForestRegressor(random_state=42))])


In [20]:
X = df_wine[['Region', 'Country', 'Price']]
y = df_wine['Rating']
pipeline.fit(X, y)

In [21]:
df_wine_test=pd.read_csv('data/Red_test.csv')

In [22]:
X_test=df_wine_test[['Region', 'Country', 'Price']]
y_test=df_wine_test['Rating']

In [23]:
y_pred=pipeline.predict(X_test)

In [24]:
def rmse(y_hat, y):
    return mean_squared_error(y_hat, y, squared = False)

In [25]:
print (round(rmse(y_test, y_pred), 4))

0.0765


In [27]:
import joblib

In [28]:
joblib.dump(pipeline, 'pipeline_wine.pkl')

['pipeline_wine.pkl']

In [29]:
pipeline.set_params(rf__n_estimators=200)

In [32]:
pipeline.fit(X, y)

In [33]:
y_pred=pipeline.predict(X_test)

In [34]:
print (round(rmse(y_test, y_pred), 4))

0.0761


In [40]:
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [43]:
stack_reg=StackingRegressor(
    estimators = [('lr', RidgeCV()),
                ('dt', DecisionTreeRegressor(random_state=42))],
    final_estimator = RandomForestRegressor(n_estimators=10, random_state=42)
)

In [44]:
pipeline=Pipeline([('ct', ct), ('stack', stack_reg)])

In [45]:
pipeline.fit(X, y)

In [46]:
y_pred=pipeline.predict(X_test)

In [49]:
print (round(rmse(y_test, y_pred), 2))

0.18
