## Import libraries

In [None]:
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from giskard import Dataset, Model, scan, GiskardClient, testing

## Define constants

In [None]:
# Notebook-wide constants.
RANDOM_SEED = 123  # shared seed for the split and the classifier

TARGET_COLUMN_NAME = "Churn"

# Declared type of every column kept in the dataset, target included.
# Insertion order matters: it is the feature order used further down.
COLUMN_TYPES = {
    "gender": "category",
    "SeniorCitizen": "category",
    "Partner": "category",
    "Dependents": "category",
    "tenure": "numeric",
    "PhoneService": "category",
    "MultipleLines": "category",
    "InternetService": "category",
    "OnlineSecurity": "category",
    "OnlineBackup": "category",
    "DeviceProtection": "category",
    "TechSupport": "category",
    "StreamingTV": "category",
    "StreamingMovies": "category",
    "Contract": "category",
    "PaperlessBilling": "category",
    "PaymentMethod": "category",
    "MonthlyCharges": "numeric",
    "TotalCharges": "numeric",
    TARGET_COLUMN_NAME: "category",
}

# Same mapping restricted to the model inputs (everything but the target).
FEATURE_TYPES = {
    column: kind
    for column, kind in COLUMN_TYPES.items()
    if column != TARGET_COLUMN_NAME
}

# Location of the raw CSV.
DATASET_URL = "https://raw.githubusercontent.com/Giskard-AI/examples/main/datasets/WA_Fn-UseC_-Telco-Customer-Churn.csv"

## Dataset preparation

### Load data

In [None]:
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw churn table.

    The input frame is left untouched; every step below produces a new
    DataFrame, so the function can safely be called again on the same raw
    data (e.g. when a notebook cell is re-run).

    Steps, in order:
      1. parse ``TotalCharges`` as numbers, turning unparsable entries into NaN;
      2. drop every row that contains a missing value;
      3. drop the ``customerID`` column;
      4. strip the literal suffix ``' (automatic)'`` from ``PaymentMethod``.

    Args:
        df: Raw table. Must contain the ``TotalCharges``, ``customerID`` and
            ``PaymentMethod`` columns.

    Returns:
        A new DataFrame with the cleaning steps applied. Surviving rows keep
        their original index labels.

    Raises:
        KeyError: If one of the required columns is missing.
    """
    # `assign` builds a new frame, so the caller's data is never modified.
    numeric_charges = pd.to_numeric(df['TotalCharges'], errors='coerce')
    cleaned = df.assign(TotalCharges=numeric_charges)

    # Rows are filtered while `customerID` is still present, then the
    # identifier column is removed.
    cleaned = cleaned.dropna().drop(columns='customerID')

    # Plain-text replacement (regex=False): the parentheses are literal.
    payment_method = cleaned['PaymentMethod'].str.replace(' (automatic)', '', regex=False)
    return cleaned.assign(PaymentMethod=payment_method)

# Download the raw CSV and clean it in a single step.
df_telco = preprocess(pd.read_csv(DATASET_URL))

### Train-test split

In [None]:
# Split features and target into train and test parts with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(
    df_telco.drop(columns=[TARGET_COLUMN_NAME]),
    df_telco[TARGET_COLUMN_NAME],
    random_state=RANDOM_SEED,
)

### Wrap dataset with Giskard

In [None]:
# Re-attach the target column to the test features (column-wise concat),
# then hand the resulting table to Giskard.
raw_data = pd.concat([X_test, Y_test], axis="columns")
wrapped_data = Dataset(
    raw_data,
    name="Churn classification dataset",
    target=TARGET_COLUMN_NAME,
    column_types=FEATURE_TYPES,
)

## Model training

### Define preprocessing steps

In [None]:
# Partition the feature columns by their declared type.
columns_to_scale = [name for name, kind in FEATURE_TYPES.items() if kind == "numeric"]
columns_to_encode = [name for name, kind in FEATURE_TYPES.items() if kind == "category"]

# Numeric columns are standardised; categorical columns are one-hot encoded.
# No level is dropped: with handle_unknown='ignore' an unseen category is
# encoded as all zeros, and dropping the first level would make it
# indistinguishable from that reference level (scikit-learn < 1.0 rejects the
# combination outright). A tree-based classifier does not need a dropped
# reference column anyway.
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), columns_to_scale),
    ('cat', OneHotEncoder(handle_unknown='ignore'), columns_to_encode),
])

### Build estimator

In [None]:
# Chain preprocessing and the LightGBM classifier, and train right away
# (`Pipeline.fit` returns the fitted pipeline itself).
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LGBMClassifier(random_state=RANDOM_SEED)),
    ]
).fit(X_train, Y_train)

# Score the fitted pipeline on the held-out split.
Y_pred = pipeline.predict(X_test)
accuracy = accuracy_score(y_true=Y_test, y_pred=Y_pred)
print(f'Test Accuracy: {accuracy}')

### Wrap model with Giskard

In [None]:
# Wrap the fitted pipeline so Giskard can call it on wrapped datasets.
# The feature names are passed as a concrete list (same order as FEATURE_TYPES)
# rather than as a live dict view.
wrapped_model = Model(pipeline,
                      model_type="classification",
                      name="Churn classification",
                      feature_names=list(FEATURE_TYPES))

# Validate wrapped model.
# Y_test holds the class labels, so compare against `.prediction` (predicted
# labels). `.raw_prediction` holds the predicted class *index*, which cannot be
# compared with string labels.
wrapped_Y_pred = wrapped_model.predict(wrapped_data).prediction
wrapped_accuracy = accuracy_score(Y_test, wrapped_Y_pred)
print(f'Wrapped Test Accuracy: {wrapped_accuracy}')

## Scan model with Giskard

In [None]:
# Run the Giskard scan on the wrapped model against the wrapped test dataset and keep the results.
results = scan(wrapped_model, wrapped_data)

In [None]:
# Show the scan results in the cell output.
# NOTE(review): `display` is not imported in this file; presumably the notebook (IPython) built-in.
display(results)

## Generate a test suite from the Scan
The objects produced by the scan can be used as fixtures to generate a test suite that integrates domain-specific issues. To create custom tests, refer to the Test your ML Model page.

In [None]:
# Build a test suite named "My first test suite" from the scan results, then execute it.
test_suite = results.generate_test_suite("My first test suite")
test_suite.run()

## Customize your suite by loading objects from the Giskard catalog

The Giskard open-source catalog lets you load:
* Tests such as metamorphic, performance, prediction & data drift, statistical tests, etc
* Slicing functions such as detectors of toxicity, hate, emotion, etc
* Transformation functions such as generators of typos, paraphrase, style tune, etc

For demo purposes, we will load a simple unit test (test_f1) that checks if the test F1 score is above the given threshold. For more examples of tests and functions, refer to the Giskard catalog.

In [None]:
# Configure the F1 test (threshold 0.7) on the wrapped model and dataset,
# append it to the suite, and run the suite again.
f1_test = testing.test_f1(
    model=wrapped_model,
    dataset=wrapped_data,
    threshold=0.7,
)
test_suite.add_test(f1_test).run()

## Upload your suite to the Giskard server

Upload your suite to the Giskard server to:
* Compare models to decide which model to promote
* Debug your tests to diagnose the issues
* Create more domain-specific tests that integrate business feedback
* Share your results

In [None]:
# Uploading the test suite automatically saves the model, dataset, tests,
# slicing & transformation functions inside the Giskard UI server.
# Create a Giskard client after having installed the Giskard server (see documentation).
token = "API_TOKEN"  # Find it in Settings in the Giskard server
project_key = "my_project"  # used both to create the project and to upload into it

client = GiskardClient(url="http://localhost:19000", token=token)  # URL of your Giskard instance

my_project = client.create_project(project_key, "PROJECT_NAME", "DESCRIPTION")

# Upload to the current project ✉️
test_suite.upload(client, project_key)