Complete Guide to Effortless ML Monitoring with Evidently.ai

# Integrate Evidently with MLflow


In [None]:
# Step1: Install All the Necessary Packages

In [2]:
pip install mlflow requests pyarrow psycopg psycopg_binary

Collecting mlflowNote: you may need to restart the kernel to use updated packages.

  Downloading mlflow-2.11.3-py3-none-any.whl.metadata (15 kB)
Collecting psycopg
  Downloading psycopg-3.1.18-py3-none-any.whl.metadata (4.2 kB)
Collecting psycopg_binary
  Downloading psycopg_binary-3.1.18-cp311-cp311-win_amd64.whl.metadata (2.9 kB)
Collecting sqlparse<1,>=0.4.0 (from mlflow)
  Using cached sqlparse-0.4.4-py3-none-any.whl.metadata (4.0 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.13.1-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.0.0-py3-none-any.whl.metadata (3.5 kB)
Collecting querystring-parser<2 (from mlflow)
  Using cached querystring_parser-1.2.4-py2.py3-none-any.whl.metadata (559 bytes)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.3-py2.py3-none-any.whl.metadata (7.7 kB)
Collecting waitress<4 (from mlflow)
  Using cached waitress-3.0.0-py3-none-any.whl.metadata (4.2 kB)
Collecting

In [3]:
mlflow ui --backend-store-uri sqlite:///mlflow.db

SyntaxError: invalid syntax (2109785657.py, line 1)

In [None]:
import mlflow
import pandas as pd
from datetime import datetime, timedelta
from sklearn import datasets
from scipy import stats
import numpy as np
from evidently.test_suite import TestSuite
from evidently.test_preset import DataDriftTestPreset, DataQualityTestPreset, DataStabilityTestPreset

In [None]:
# Step 2: Define a Task to Load the Data From a CSV File

In [None]:
# Step 2: Define a task to perform data quality tests and generate a report
def data_processing(df):
    numerical_columns = [
        'Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'CRSDepTime','CRSArrTime',
        'FlightNum', 'CRSElapsedTime', 'AirTime', 'DepDelay',
        'Distance', 'TaxiIn', 'TaxiOut', 'CarrierDelay', 'WeatherDelay', 'NASDelay',
        'SecurityDelay', 'LateAircraftDelay']
    df=df.drop(['Unnamed: 0','Year','CancellationCode','TailNum','Diverted','Cancelled','ArrTime','ActualElapsedTime'],axis=1)
    delay_colns=['CarrierDelay', 'WeatherDelay', 'NASDelay', 'SecurityDelay', 'LateAircraftDelay']

    # Impute missing values with the 0 for these columns
    df[delay_colns]=df[delay_colns].fillna(0)

    # Impute missing values with the median for these columns
    columns_to_impute = ['AirTime', 'ArrDelay', 'TaxiIn','CRSElapsedTime']
    df[columns_to_impute]=df[columns_to_impute].fillna(df[columns_to_impute].median())


    df=pd.get_dummies(df,columns=['UniqueCarrier', 'Origin'], drop_first=True)
    z_threshold=3
    z_scores=np.abs(stats.zscore(df[numerical_columns]))
    outliers=np.where(z_scores>z_threshold)
    df_no_outliers=df[(z_scores<=z_threshold).all(axis=1)]
    return df_no_outliers

In [None]:
# Step 3: Set MLflow Tracking URI and Experiment

In [None]:
# Set MLflow tracking URI and experiment
mlflow.set_tracking_uri("sqlite:///mlflow.db")
mlflow.set_experiment("Drift Test Suite")

In [None]:
# Step 4: Define Batch Size for Data Processing

In [None]:
batch_size=200000

In [None]:
# Step 5: Iterate through batches

In [None]:
for batch_id in range(3):
    with mlflow.start_run() as run:
        df, ref_data, curr_data =load_data()
        processed_df=data_processing(df)
        data_drift_suite = TestSuite(tests=[DataDriftTestPreset()])
        reference=df[1:500000]
        current=df[500000:]
        data_drift_suite.run(reference_data=reference, current_data=current[(batch_id*batch_size):(batch_id+1)*batch_size])
        if not data_drift_suite.as_dict()['summary']['all_passed']:
            data_drift_suite.save_html("Reports/data_drift_suite.html")

        mlflow.log_param("Sucessful tests", data_drift_suite.as_dict()['summary']['success_tests'])
        mlflow.log_param("Failure tests", data_drift_suite.as_dict()['summary']['failed_tests'])

        mlflow.log_artifact("Reports/data_drift_suite.html")
        print(run.info)