In [1]:
!pip install pandas numpy scikit-learn alibi-detect tabulate



## Loading Data

In [6]:
import pandas as pd

# Load train data
train_data_path = "train_data.csv"
train_df = pd.read_csv(train_data_path)

# Load production data
prod_data_path = "prod_data.csv"
prod_df = pd.read_csv(prod_data_path)

# Display the first few rows of each dataset
print("Train Data:")
print(train_df.head())

print("\nProduction Data:")
print(prod_df.head())

# Check shapes of the datasets
print("\nShape of Train Data:", train_df.shape)
print("Shape of Production Data:", prod_df.shape)


Train Data:
        Brand    Price       Body  Mileage  EngineV Engine Type Registration  \
0         BMW   4800.0      sedan       11      2.5      Diesel          yes   
1      Toyota  11550.0      sedan       95      2.4      Petrol          yes   
2         BMW  26999.0      sedan      131      2.0      Diesel          yes   
3  Volkswagen  27000.0      sedan       85      2.0      Diesel          yes   
4  Volkswagen   6900.0  crossover      258      2.5      Diesel           no   

   Year      Model  
0  1994        325  
1  2007      Camry  
2  2011        520  
3  2013  Passat CC  
4  2004    Touareg  

Production Data:
        Brand    Price   Body  Mileage  EngineV Engine Type Registration  \
0  Volkswagen   4100.0    van      345      1.9      Diesel           no   
1  Volkswagen   9200.0    van      268      2.5      Diesel          yes   
2  Volkswagen  30000.0  sedan       67      2.0      Diesel          yes   
3        Audi   2550.0  sedan      370      2.5      Diesel

## Checking for Data Consistency

In [9]:
# Compare column names
print("Column names in Train Data:", train_df.columns)
print("Column names in Production Data:", prod_df.columns)

# Check data types
print("\nData types in Train Data:")
print(train_df.dtypes)

print("\nData types in Production Data:")
print(prod_df.dtypes)


Column names in Train Data: Index(['Brand', 'Price', 'Body', 'Mileage', 'EngineV', 'Engine Type',
       'Registration', 'Year', 'Model'],
      dtype='object')
Column names in Production Data: Index(['Brand', 'Price', 'Body', 'Mileage', 'EngineV', 'Engine Type',
       'Registration', 'Year', 'Model'],
      dtype='object')

Data types in Train Data:
Brand            object
Price           float64
Body             object
Mileage           int64
EngineV         float64
Engine Type      object
Registration     object
Year              int64
Model            object
dtype: object

Data types in Production Data:
Brand            object
Price           float64
Body             object
Mileage           int64
EngineV         float64
Engine Type      object
Registration     object
Year              int64
Model            object
dtype: object


## Preparing the drift detector

In [28]:
import numpy as np

# Convert the pandas DataFrame to numpy arrays
train_data_np = train_df.values

# Set up the drift detector with the train_data
drift_detector = TabularDrift(p_val=0.05, x_ref=train_data_np)  

# Use the `predict` method to check for drift
prediction = drift_detector.predict(train_data_np)

# Print the drift test results
print(prediction)



{'data': {'is_drift': 0, 'distance': array([0., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32), 'p_val': array([1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32), 'threshold': 0.005555555555555556}, 'meta': {'name': 'TabularDrift', 'online': False, 'data_type': None, 'version': '0.12.0', 'detector_type': 'drift'}}




In [42]:
# Convert production data to numpy array
production_data_np = prod_df.values

# Use the `predict` method to check for drift between train_data and production_data
prediction_prod = drift_detector.predict(production_data_np)

# Print the results for production data
print("Drift test results for production data:", prediction_prod)

# Access p-values from the 'data' field in the drift results
p_values = prediction_prod['data']['p_val']

# Create a DataFrame to display the drift test results
drift_results_df = pd.DataFrame({
    'Feature': prod_df.columns,
    'p-value': p_values,
    'Drift Detected': ['Yes' if p < 0.05 else 'No' for p in p_values]
})

# Display the results
print(drift_results_df)



Drift test results for production data: {'data': {'is_drift': 0, 'distance': array([0.05817886, 0.02293349, 0.04985922, 0.07148764, 0.07232652,
       0.01866272, 0.01748474, 0.04163506, 0.04762178], dtype=float32), 'p_val': array([0.05172318, 0.93531954, 0.13590918, 0.00809497, 0.00711064,
       0.99073005, 0.9959002 , 0.3043862 , 0.1717866 ], dtype=float32), 'threshold': 0.005555555555555556}, 'meta': {'name': 'TabularDrift', 'online': False, 'data_type': None, 'version': '0.12.0', 'detector_type': 'drift'}}
        Feature   p-value Drift Detected
0         Brand  0.051723             No
1         Price  0.935320             No
2          Body  0.135909             No
3       Mileage  0.008095            Yes
4       EngineV  0.007111            Yes
5   Engine Type  0.990730             No
6  Registration  0.995900             No
7          Year  0.304386             No
8         Model  0.171787             No
