### Install alibi_detect library

In [2]:
import numpy as np
np.__version__

'1.26.4'

In [2]:
!pip install alibi alibi_detect

Collecting alibi
  Downloading alibi-0.9.6-py3-none-any.whl.metadata (22 kB)
Collecting alibi_detect
  Downloading alibi_detect-0.12.0-py3-none-any.whl.metadata (28 kB)
Collecting scikit-image<0.23,>=0.17.2 (from alibi)
  Downloading scikit_image-0.22.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting Pillow<11.0,>=5.4.1 (from alibi)
  Downloading pillow-10.4.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (9.2 kB)
Collecting attrs<24.0.0,>=19.2.0 (from alibi)
  Downloading attrs-23.2.0-py3-none-any.whl.metadata (9.5 kB)
Collecting dill<0.4.0,>=0.3.0 (from alibi)
  Downloading dill-0.3.9-py3-none-any.whl.metadata (10 kB)
Collecting numba!=0.54.0,<0.60.0,>=0.50.0 (from alibi_detect)
  Downloading numba-0.59.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba!=0.54.0,<0.60.0,>=0.50.0->alibi_detect)
  Downloading llvmlite-0.42.0-cp310-cp310-manylinux_2_17_x86_64.manylinux201

In [1]:
import alibi
from alibi_detect.cd import ChiSquareDrift, TabularDrift
from alibi_detect.saving import save_detector, load_detector

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sn

In [4]:
cars_df = pd.read_csv( "https://drive.google.com/uc?export=download&id=10ABViLN4Q7vgIlLvepCduU4B3C6BneJR" )

In [5]:
cars_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1038 entries, 0 to 1037
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Location      1038 non-null   object 
 1   Fuel_Type     1038 non-null   object 
 2   Transmission  1038 non-null   object 
 3   Owner_Type    1038 non-null   object 
 4   Seats         1037 non-null   float64
 5   Price         1038 non-null   float64
 6   age           1038 non-null   int64  
 7   KM_Driven     1038 non-null   int64  
 8   make          1038 non-null   object 
 9   mileage       1038 non-null   float64
 10  engine        1038 non-null   int64  
 11  power         1038 non-null   float64
dtypes: float64(4), int64(3), object(5)
memory usage: 97.4+ KB


In [6]:
x_features = list(cars_df.columns)

In [None]:
x_features

#### Specify the index of the columns which are categorical feautures

In [7]:
cat_vars = [0, 1, 2, 3, 8]

In [8]:
X = cars_df[x_features]
y = cars_df.Price

### Split the dataset into two sets

**Note**: In this exampls, data is split to create train and production datasets. This is done only for the lab session. In real world, the production data will come from the inference stystem.

In [9]:
from sklearn.model_selection import train_test_split

In [10]:
X_train, X_prod, y_train, y_prod = train_test_split(X,
                                                    y,
                                                    train_size = 0.9,
                                                    random_state = 23)

In [11]:
categories_per_feature = {f: None for f in cat_vars}

In [12]:
categories_per_feature

{0: None, 1: None, 2: None, 3: None, 8: None}

### Measure the drift

In [13]:
cd = TabularDrift(X_train.values,
                  p_val=.05,
                  categories_per_feature=categories_per_feature)

In [14]:
filepath = 'carsdrift'  # change to directory where detector is saved
save_detector(cd, filepath, legacy = True)



In [15]:
cd = load_detector(filepath)



In [16]:
preds = cd.predict(X_prod.to_numpy())

### Printing the test results

- KS test for the numerical features
- chi-squared test for the categorical features

In [17]:
for f in range(cd.n_features):
    stat = 'Chi2' if f in list(categories_per_feature.keys()) else 'K-S'
    fname = x_features[f]
    stat_val, p_val = preds['data']['distance'][f], preds['data']['p_val'][f]
    print(f'{fname} -- {stat} {stat_val:.3f} -- p-value {p_val:.3f}')

Location -- Chi2 8.221 -- p-value 0.607
Fuel_Type -- Chi2 4.102 -- p-value 0.043
Transmission -- Chi2 0.639 -- p-value 0.424
Owner_Type -- Chi2 11.013 -- p-value 0.012
Seats -- K-S nan -- p-value nan
Price -- K-S 0.084 -- p-value 0.495
age -- K-S 0.058 -- p-value 0.894
KM_Driven -- K-S 0.131 -- p-value 0.072
make -- Chi2 13.928 -- p-value 0.455
mileage -- K-S 0.114 -- p-value 0.158
engine -- K-S 0.167 -- p-value 0.009
power -- K-S 0.150 -- p-value 0.026


### Checking the distribution of Owner_Type in training and production data

In [18]:
X_train.Owner_Type.value_counts()

Unnamed: 0_level_0,count
Owner_Type,Unnamed: 1_level_1
First,783
Second,127
Third,24


In [19]:
X_prod.Owner_Type.value_counts()

Unnamed: 0_level_0,count
Owner_Type,Unnamed: 1_level_1
First,84
Second,18
Fourth & Above,1
Third,1
