# Ocean temperature prediction [sklearn]
* Regression task: predict ocean temperature from salinity
* Reference notebook: <https://www.kaggle.com/code/mathchi/1-lr-mlr-pr-dt-rf-predict-data-calcofi/notebook>
* Dataset: <https://www.kaggle.com/code/mathchi/1-lr-mlr-pr-dt-rf-predict-data-calcofi/input>

## Import libraries

In [None]:
import os

import pandas as pd
from sklearn.metrics import r2_score
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split

from giskard.client.giskard_client import GiskardError
from giskard import scan, Model, Dataset, GiskardClient

## Define constants

In [None]:
# Constants.
# Names given to the single feature column and the target column after loading.
FEATURE_COLUMN_NAME = "Salinity"
TARGET_COLUMN_NAME = "Temperature"

# Giskard creds.
# The URL and token can be overridden with the GISKARD_URL / GISKARD_TOKEN
# environment variables so that a real token never has to be written into the
# notebook; the fallbacks are the previous hard-coded values.
GISKARD_URL = os.environ.get("GISKARD_URL", "http://localhost:9000")
GISKARD_TOKEN = os.environ.get("GISKARD_TOKEN", "")
GISKARD_PROJECT_KEY = "ocean_temperature_regression"

# Paths.
# Location of the input CSV, relative to the current working directory.
PATH_DATA = os.path.join(".", "datasets", "ocean_temperature_regression_dataset", "bottle.csv")

## Load data

In [None]:
def load_data(**kwargs) -> pd.DataFrame:
    """Load the salinity/temperature columns and perform initial preprocessing.

    Reads the ``Salnty`` and ``T_degC`` columns of the CSV at ``PATH_DATA``,
    renames them to ``FEATURE_COLUMN_NAME`` and ``TARGET_COLUMN_NAME``
    respectively, and forward-fills missing values.

    Args:
        **kwargs: Extra keyword arguments forwarded to ``pd.read_csv``.

    Returns:
        A DataFrame with the feature column first and the target column second.
    """
    print("Loading data...")
    df = pd.read_csv(PATH_DATA, usecols=['Salnty', 'T_degC'], **kwargs)
    # ``usecols`` ignores element order, so the columns come back in file order.
    # Rename by name (not by position) so the labels cannot end up swapped,
    # then pin the column order explicitly.
    df = df.rename(columns={'Salnty': FEATURE_COLUMN_NAME, 'T_degC': TARGET_COLUMN_NAME})
    df = df[[FEATURE_COLUMN_NAME, TARGET_COLUMN_NAME]]
    # ``fillna(method='ffill')`` is deprecated in recent pandas; ``ffill`` is the equivalent call.
    df = df.ffill()
    print(f"Finished loading data! Shape: {df.shape}")

    return df

# Load the dataset once with default read options; later cells split it into train and test sets.
ocean_df = load_data()

## Train-test split

In [None]:
# Hold out a test split; the fixed seed keeps the partition reproducible.
# The feature is selected with a list so that X stays a 2-D DataFrame.
X_train, X_test, y_train, y_test = train_test_split(
    ocean_df[[FEATURE_COLUMN_NAME]],
    ocean_df[TARGET_COLUMN_NAME],
    random_state=42,
)

## Wrap dataset with giskard

In [None]:
# Re-join the test features and target column-wise into a single frame,
# then wrap it, naming the target column and declaring the feature as numeric.
raw_dataset = pd.concat([X_test, y_test], axis=1)
wrapped_dataset = Dataset(
    raw_dataset,
    name="ocean_temperature_regression_dataset",
    target=TARGET_COLUMN_NAME,
    column_types={FEATURE_COLUMN_NAME: "numeric"},
)

## Linear Regression

In [None]:
# Fit a plain linear regression on the training split.
estimator = LinearRegression()
estimator.fit(X_train, y_train)

# Report R2 on both splits to compare fit quality on seen vs. unseen data.
train_metric = r2_score(
    y_train,
    estimator.predict(X_train),
)
test_metric = r2_score(
    y_test,
    estimator.predict(X_test),
)
print(f"Train R2-Score: {train_metric:.3f}\nTest R2-Score: {test_metric:.3f}")

## Wrap model with giskard

In [None]:
# Wrap the fitted linear regressor, declaring the single feature it consumes.
wrapped_linear_regression = Model(
    estimator,
    model_type="regression",
    name="ocean_temperature_regressor [linear regression]",
    feature_names=[FEATURE_COLUMN_NAME],
)

# Validate wrapped model.
# Score the wrapper's predictions on the wrapped test set against y_test.
wrapped_y_pred = wrapped_linear_regression.predict(wrapped_dataset).prediction
wrapped_test_metric = r2_score(y_test, wrapped_y_pred)
print(f"Wrapped Test R2-Score: {wrapped_test_metric:.3f}")

## Scan model

In [None]:
# Scan is left disabled; uncomment the line below to run it on the wrapped linear model.
# scanning_results = scan(wrapped_linear_regression, wrapped_dataset)

## Polynomial Regression

In [None]:
# Expand the feature into polynomial terms up to degree 8, then fit a linear model on them.
pipeline = Pipeline(
    steps=[
        ("polynomial", PolynomialFeatures(degree=8)),
        ("estimator", LinearRegression()),
    ]
)
pipeline.fit(X_train, y_train)

# Report R2 on both splits, in the same format as for the linear model.
train_metric = r2_score(
    y_train,
    pipeline.predict(X_train),
)
test_metric = r2_score(
    y_test,
    pipeline.predict(X_test),
)
print(f"Train R2-Score: {train_metric:.3f}\nTest R2-Score: {test_metric:.3f}")

## Wrap polynomial model with giskard

In [None]:
# Wrap the whole fitted pipeline so the polynomial expansion is part of the wrapped model.
wrapped_polynomial_regression = Model(
    pipeline,
    model_type="regression",
    name="ocean_temperature_regressor [polynomial regression]",
    feature_names=[FEATURE_COLUMN_NAME],
)

In [None]:
# Validate wrapped model.
# Score the wrapper's predictions on the wrapped test set against y_test.
wrapped_y_pred = wrapped_polynomial_regression.predict(wrapped_dataset).prediction
wrapped_test_metric = r2_score(
    y_test,
    wrapped_y_pred,
)
print(f"Wrapped Test R2-Score: {wrapped_test_metric:.3f}")

## Upload dataset and models to the Giskard UI

In [None]:
# Init giskard client.
client = GiskardClient(GISKARD_URL, GISKARD_TOKEN)

# Create or fetch a project by its key.
# NOTE: any GiskardError raised on creation is treated as "project already
# exists"; other failure causes are not distinguished here.
try:
    project = client.create_project(GISKARD_PROJECT_KEY,
                                    name="OCEAN_TEMPERATURE_REGRESSION",
                                    description="Regression task of estimating ocean temperature from salinity.")
except GiskardError:
    print(f"Project with key '{GISKARD_PROJECT_KEY}' already exists. Trying to get it.")
    project = client.get_project(GISKARD_PROJECT_KEY)

# Upload model and dataset.
# Each upload call's return value is kept as the corresponding id.
model_linear_id = wrapped_linear_regression.upload(client, GISKARD_PROJECT_KEY)
model_polynomial_id = wrapped_polynomial_regression.upload(client, GISKARD_PROJECT_KEY)
dataset_id = wrapped_dataset.upload(client, GISKARD_PROJECT_KEY)