### Detect Data Drift
**Description**: Data drift can occur when the statistical properties of your data change over time. Learn to detect data drift using visualizations.

In [5]:
# Install necessary packages if you haven't already
# !pip install pandas matplotlib seaborn scipy evidently

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp
from evidently.report import Report
from evidently.metric_preset import DataDriftPreset

# Step 1: Load Datasets
# Replace these with your actual file paths or data sources
ref_data = pd.read_csv("reference_data.csv")
current_data = pd.read_csv("current_data.csv")

# Step 2: Select Feature to Monitor for Drift
feature = "age"  # Replace with any feature name from your dataset

# Step 3: Visualize Feature Distribution
plt.figure(figsize=(10, 6))
sns.kdeplot(ref_data[feature], label="Reference", shade=True)
sns.kdeplot(current_data[feature], label="Current", shade=True)
plt.title(f"Data Drift Detection - Distribution of '{feature}'")
plt.xlabel(feature)
plt.ylabel("Density")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()

# Step 4: KS-Test for Statistical Drift Detection
stat, p_value = ks_2samp(ref_data[feature], current_data[feature])
print(f"\nKS Test Statistic: {stat:.4f}, p-value: {p_value:.4f}")
if p_value < 0.05:
    print("🔺 Significant data drift detected.")
else:
    print("✅ No significant data drift detected.")

# Step 5 (Optional): Full Drift Report using Evidently
report = Report(metrics=[DataDriftPreset()])
report.run(reference_data=ref_data, current_data=current_data)
report.save_html("data_drift_report.html")
print("\n✅ Drift report saved as 'data_drift_report.html'")


ModuleNotFoundError: No module named 'evidently.report'

In [None]:
import pandas as pd
import numpy as np

# Set seed for reproducibility
np.random.seed(42)

# Generate reference dataset (e.g., historical or training data)
reference_data = pd.DataFrame({
    'age': np.random.normal(loc=30, scale=5, size=1000).astype(int),  # mean=30, std=5
    'gender': np.random.choice(['M', 'F'], size=1000)
})
reference_data.to_csv("reference_data.csv", index=False)
print("✅ 'reference_data.csv' created.")

# Generate current dataset (e.g., new incoming data with drift)
current_data = pd.DataFrame({
    'age': np.random.normal(loc=40, scale=10, size=1000).astype(int),  # mean shifted to 40, std=10
    'gender': np.random.choice(['M', 'F'], size=1000)
})
current_data.to_csv("current_data.csv", index=False)
print("✅ 'current_data.csv' created.")


✅ 'reference_data.csv' created.
✅ 'current_data.csv' created.
