# Drug classification [sklearn]
* Multiclass classification of drug type, given a person's health data.
* Reference notebook: <https://www.kaggle.com/code/caesarmario/drug-classification-w-various-ml-models>
* Dataset: <https://www.kaggle.com/datasets/prathamtripathi/drug-classification?datasetId=830916&sortBy=voteCount>

## Import libraries

In [None]:
import os

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.svm import SVC
from urllib.request import urlretrieve
from imblearn.over_sampling import SMOTE
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline as PipelineImb

import giskard
from giskard import Dataset, Model, GiskardClient
from giskard.client.giskard_client import GiskardError

## Define constants

In [None]:
# Constants.
# Seed shared by the train/test split, SMOTE and the SVC so runs are reproducible.
RANDOM_SEED = 0

# Name of the label column in the dataset.
TARGET_NAME = "Drug"

# Bin edges and labels passed to ``pd.cut``; each list of edges is one element
# longer than its list of labels (N+1 edges delimit N bins).
AGE_BINS = [0, 19, 29, 39, 49, 59, 69, 80]
AGE_CATEGORIES = ['<20s', '20s', '30s', '40s', '50s', '60s', '>60s']

NA_TO_K_BINS = [0, 9, 19, 29, 50]
NA_TO_K_CATEGORIES = ['<10', '10-20', '20-30', '>30']

# Giskard creds.
GISKARD_URL = "http://localhost:9000"
# Left empty on purpose: fill in your own API token before running the upload cell.
GISKARD_TOKEN = ""
GISKARD_PROJECT_KEY = "drug_classification"

# Paths.
# URLs always use "/" as separator; ``os.path.join`` would insert the OS path
# separator (a backslash on Windows) and produce an invalid URL.
DATA_URL = "/".join((
    "ftp://sys.giskard.ai",
    "pub",
    "unit_test_resources",
    "drug_classification_dataset",
    "drug200.csv",
))
# Local cache location of the downloaded CSV.
DATA_PATH = Path.home() / ".giskard" / "drug_classification_dataset" / "drug200.csv"

## Load data

In [None]:
def fetch_from_ftp(url: str, file: Path) -> None:
    """Download ``url`` to ``file`` unless ``file`` already exists.

    Missing parent directories of ``file`` are created. An existing ``file``
    is treated as a valid cache and is never re-downloaded.

    Args:
        url: Location of the resource, passed as-is to ``urlretrieve``.
        file: Local destination path.

    Raises:
        Whatever ``urlretrieve`` raises when the download fails; in that case
        no file is left at ``file``.
    """
    if not file.exists():
        # ``exist_ok=True`` already tolerates an existing directory, so no
        # separate existence check is needed.
        file.parent.mkdir(parents=True, exist_ok=True)

        # Download to a sibling temporary file and rename it only on success.
        # Writing straight to ``file`` would leave a truncated file behind if
        # the transfer is interrupted, and the ``file.exists()`` check above
        # would then treat it as a complete download on the next run.
        partial = file.with_name(file.name + ".part")
        print(f"Downloading data from {url}")
        try:
            urlretrieve(url, partial)
            partial.replace(file)
        finally:
            # Only still present when the download or the rename failed.
            if partial.exists():
                partial.unlink()

    print("Data was loaded!")

In [None]:
def load_data() -> pd.DataFrame:
    """Return the drug dataset as a DataFrame.

    Makes sure the CSV is available at ``DATA_PATH`` (fetching it from
    ``DATA_URL`` when needed) and then parses it with pandas.
    """
    # Populate the local copy first; the read below only touches the local file.
    fetch_from_ftp(DATA_URL, DATA_PATH)
    return pd.read_csv(DATA_PATH)

# Fetch the CSV if it is not present locally yet, then read it into a DataFrame.
df_drug = load_data()

## Define preprocessing steps

In [None]:
def bin_numerical(df: pd.DataFrame, age_bins=None, age_labels=None,
                  na_to_k_bins=None, na_to_k_labels=None) -> pd.DataFrame:
    """Replace the numeric ``Age`` and ``Na_to_K`` columns with binned categories.

    The input frame is left untouched; a new frame with the same columns, in
    the same order, is returned.

    Args:
        df: Frame containing numeric ``Age`` and ``Na_to_K`` columns.
        age_bins: Bin edges for ``Age``. Defaults to ``AGE_BINS``.
        age_labels: One label per ``Age`` bin. Defaults to ``AGE_CATEGORIES``.
        na_to_k_bins: Bin edges for ``Na_to_K``. Defaults to ``NA_TO_K_BINS``.
        na_to_k_labels: One label per ``Na_to_K`` bin. Defaults to
            ``NA_TO_K_CATEGORIES``.

    Returns:
        A copy of ``df`` whose ``Age`` and ``Na_to_K`` columns hold the bin
        labels produced by ``pd.cut``. ``pd.cut`` yields NaN for values that
        fall outside the given edges.
    """
    # Resolve defaults at call time so the module constants are only needed
    # when the caller does not supply explicit bins/labels.
    age_bins = AGE_BINS if age_bins is None else age_bins
    age_labels = AGE_CATEGORIES if age_labels is None else age_labels
    na_to_k_bins = NA_TO_K_BINS if na_to_k_bins is None else na_to_k_bins
    na_to_k_labels = NA_TO_K_CATEGORIES if na_to_k_labels is None else na_to_k_labels

    # ``assign`` returns a new frame, so the caller's ``df`` is never mutated.
    return df.assign(
        Age=pd.cut(df["Age"], bins=age_bins, labels=age_labels),
        Na_to_K=pd.cut(df["Na_to_K"], bins=na_to_k_bins, labels=na_to_k_labels),
    )

# Replace the numeric Age and Na_to_K columns with their binned categories.
# NOTE(review): this rebinds df_drug, so re-running the cell would feed already-binned
# columns back into pd.cut — reload the data first; confirm before re-running.
df_drug = bin_numerical(df_drug)

## Train-test split

In [None]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# The target is selected through TARGET_NAME (rather than a hard-coded attribute) so
# that features and labels are always derived from the same column name.
X_train, X_test, y_train, y_test = train_test_split(
    df_drug.drop(columns=TARGET_NAME),
    df_drug[TARGET_NAME],
    test_size=0.3,
    random_state=RANDOM_SEED,
)

## Build Support Vector Machine classifier

In [None]:
# One-hot encode the features, oversample with SMOTE, then fit a linear SVM.
pipeline = PipelineImb(steps=[
    # handle_unknown="ignore": a category that was not seen during fit is encoded as
    # all zeros instead of raising at prediction time.
    ("one_hot_encoder", OneHotEncoder(handle_unknown="ignore")),
    ("resampler", SMOTE(random_state=RANDOM_SEED)),
    # probability=True is needed because predict_proba is called on this pipeline later.
    ("classifier", SVC(kernel='linear', max_iter=250, random_state=RANDOM_SEED, probability=True))
])

print("Model training...")
pipeline.fit(X_train, y_train)
print("Model training finished!")

print("Model testing...")
y_train_pred = pipeline.predict(X_train)
y_test_pred = pipeline.predict(X_test)
# accuracy_score is documented as (y_true, y_pred); keep that order so the call stays
# correct if it is ever swapped for a non-symmetric metric.
train_metric = accuracy_score(y_train, y_train_pred)
test_metric = accuracy_score(y_test, y_test_pred)
print(f"Train accuracy score: {train_metric:.2f}\n"
      f"Test accuracy score: {test_metric:.2f}")

## Wrap dataset with giskard

In [None]:
# The target is passed to Dataset by column name, so the labels are re-attached to the
# training features to form a single frame.
raw_dataset = pd.concat([X_train, y_train], axis=1)
wrapped_dataset = Dataset(
    raw_dataset,
    target=TARGET_NAME,
    name="drug_classification_dataset",
    # Every feature column is declared categorical.
    cat_columns=X_test.columns.tolist(),
)

## Define prediction function

In [None]:
def prediction_function(df: pd.DataFrame) -> np.ndarray:
    """Return the fitted pipeline's ``predict_proba`` output for the rows of ``df``."""
    class_probabilities = pipeline.predict_proba(df)
    return class_probabilities

## Wrap model with giskard

In [None]:
# Wrap the probability function for Giskard. Labels and feature names are taken from
# the fitted pipeline and the training features respectively.
wrapped_model = Model(
    prediction_function,
    name="drug_classifier",
    model_type="classification",
    classification_labels=pipeline.classes_,
    feature_names=X_train.columns.tolist(),
)

In [None]:
# Validate wrapped model.
# The wrapper's raw_prediction values are used as indices into pipeline.classes_ to
# recover label values comparable with y_train.
wrapped_predictions = wrapped_model.predict(wrapped_dataset)
wrapped_y_train_pred = pipeline.classes_[wrapped_predictions.raw_prediction]
# accuracy_score is documented as (y_true, y_pred); keep that argument order.
wrapped_train_metric = accuracy_score(y_train, wrapped_y_train_pred)
print(f"Wrapped Train accuracy score: {wrapped_train_metric:.2f}")

## Scan model with giskard

In [None]:
# Run the Giskard scan on the wrapped model, using the wrapped dataset as input.
scanning_results = giskard.scan(wrapped_model, wrapped_dataset)

In [None]:
# NOTE(review): `display` is not imported in this file; presumably the notebook
# (IPython) built-in — running this as a plain script would need an explicit import.
display(scanning_results)

## Upload model and dataset to the giskard UI platform

In [None]:
# Init giskard client.
client = GiskardClient(GISKARD_URL, GISKARD_TOKEN)

# Create or fetch a project by its key.
try:
    project = client.create_project(GISKARD_PROJECT_KEY,
                                    name="DRUG_CLASSIFICATION",
                                    description="Multiclass classification of the drug to assign, based on a patient's medical profile.")
except GiskardError:
    # Any GiskardError from create_project is assumed to mean "project already exists".
    # If the real cause is different, the get_project call below raises in turn and the
    # original error remains visible in the chained traceback.
    print(f"Project with key {GISKARD_PROJECT_KEY} already exists. Trying to get it.")
    project = client.get_project(GISKARD_PROJECT_KEY)

# Upload model and dataset.
model_id = wrapped_model.upload(client, GISKARD_PROJECT_KEY)
dataset_id = wrapped_dataset.upload(client, GISKARD_PROJECT_KEY)