# Wine quality regression [sklearn]
* Regression task: predict a wine's density from the remaining columns of the wine-quality dataset.
* Reference notebook: <https://www.kaggle.com/code/nkitgupta/feature-engineering-and-feature-selection>
* Dataset: <https://www.kaggle.com/code/nkitgupta/feature-engineering-and-feature-selection/input>

## Import libraries

In [55]:
import os

import pandas as pd
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

from giskard import scan
from giskard import Dataset, Model, GiskardClient
from giskard.client.giskard_client import GiskardError

## Define constants

In [None]:
# Name of the column the regressor is trained to predict.
TARGET_COLUMN_NAME = "density"

# Connection settings for the Giskard server used in the upload step.
GISKARD_URL = "http://localhost:9000"
# NOTE(review): empty placeholder -- presumably has to be replaced with a real
# API token before the client can authenticate; confirm.
GISKARD_TOKEN = ""
GISKARD_PROJECT_KEY = "wine_quality_regression"

# Location of the input CSV, relative to the current working directory.
PATH_DATA = os.path.join(os.curdir, "datasets", "wine_quality_dataset", "winequality-red.csv")

## Load data

In [None]:
def load_data(path: "str | None" = None, **kwargs) -> pd.DataFrame:
    """Read a CSV file into a DataFrame.

    Args:
        path: CSV file to read. When omitted (``None``), the module-level
            ``PATH_DATA`` constant is used, so existing ``load_data()``
            calls behave exactly as before.
        **kwargs: Forwarded unchanged to ``pandas.read_csv``.

    Returns:
        The parsed contents of the file.
    """
    # Resolve the default at call time rather than in the signature, so the
    # function does not depend on PATH_DATA when an explicit path is given.
    csv_path = PATH_DATA if path is None else path
    return pd.read_csv(csv_path, **kwargs)

# Read the dataset from PATH_DATA, passing no extra options to pandas.read_csv.
wine_df = load_data()

## Train-test split

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    wine_df.drop(columns=TARGET_COLUMN_NAME),  # features: every column except the target
    wine_df[TARGET_COLUMN_NAME],  # target values
    random_state=11,
    test_size=0.2,
)

## Wrap dataset with Giskard

In [None]:
# Put the held-out features and their target back into a single table.
raw_dataset = pd.concat([X_test, Y_test], axis="columns")

# Declare every feature column as numeric (the target column is not listed).
column_types = dict.fromkeys(X_test.columns, "numeric")

wrapped_dataset = Dataset(
    raw_dataset,
    target=TARGET_COLUMN_NAME,
    name="wine_quality_dataset",
    column_types=column_types,
)

## Train and test estimator

In [None]:
# Estimator: project the features onto 8 principal components, then fit a
# linear regression on the projected data.
pipeline = Pipeline([
    ("pca", PCA(n_components=8)),
    ("regressor", LinearRegression()),
])
pipeline.fit(X_train, Y_train)

# R2 on the training split.
Y_pred_train = pipeline.predict(X_train)
train_metric = r2_score(Y_train, Y_pred_train)

# R2 on the held-out split.
Y_pred_test = pipeline.predict(X_test)
test_metric = r2_score(Y_test, Y_pred_test)

print(f"Train R2-Score: {train_metric}")
print(f"Test R2-Score: {test_metric}")

## Wrap model with Giskard

In [None]:
# Hand the fitted pipeline to Giskard as a regression model over the feature columns.
wrapped_model = Model(
    pipeline,
    name="wine_quality_regressor",
    model_type="regression",
    feature_names=X_test.columns,
)

# Sanity check: compute R2 from the wrapper's predictions on the wrapped test set.
wrapped_test_metric = r2_score(
    Y_test,
    wrapped_model.predict(wrapped_dataset).prediction,
)
print(f"Wrapped Test R2-Score: {wrapped_test_metric}")

## Scan model with Giskard

In [None]:
# Run Giskard's scan on the wrapped model and dataset; the outcome is kept in
# scan_results (this cell only stores it, it does not display it).
scan_results = scan(wrapped_model, wrapped_dataset)

## Upload model and dataset to the Giskard UI platform

In [None]:
# Connect to the Giskard server configured in the constants cell.
client = GiskardClient(GISKARD_URL, GISKARD_TOKEN)

# Create the project, or fall back to fetching it when creation fails.
try:
    project = client.create_project(GISKARD_PROJECT_KEY,
                                    name="WINE_QUALITY_REGRESSION",
                                    description="Regression task of estimating wine's density.")
except GiskardError as e:
    # Creation can fail for reasons other than a duplicate key, so report the
    # actual error instead of assuming the project already exists. If the
    # project really is missing, get_project below raises and the real
    # problem surfaces.
    print(f"Could not create project with key {GISKARD_PROJECT_KEY} ({e}). Trying to get it.")
    project = client.get_project(GISKARD_PROJECT_KEY)

# Upload the wrapped model and dataset into that project.
model_id = wrapped_model.upload(client, GISKARD_PROJECT_KEY)
dataset_id = wrapped_dataset.upload(client, GISKARD_PROJECT_KEY)