# In [None]:
# Question: Detecting Data Drift
# Description: Identify potential data drift between two time periods for a numeric attribute.

# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ks_2samp

# Names of the columns this analysis works with
numeric_column = 'your_numeric_column'  # Replace with the actual column name
date_column = 'date'  # Replace if your date column has a different name

# Read the raw data into a DataFrame
df = pd.read_csv('your_dataset.csv')  # Replace with your dataset

# Make sure the date column holds datetimes so it can be compared to a date
df[date_column] = pd.to_datetime(df[date_column])

# Boundary between the two periods (adjust as needed):
# period 1 is everything strictly before it, period 2 is everything from it on.
split_date = '2023-01-01'
dates = df[date_column]
period_1 = df.loc[dates < split_date]
period_2 = df.loc[dates >= split_date]

# Non-null values of the numeric column within each period
values_1 = period_1[numeric_column].dropna()
values_2 = period_2[numeric_column].dropna()

# Plot the distributions
# Overlay a kernel density estimate for each period on the same axes so that
# differences between the two samples are visible side by side.
plt.figure(figsize=(10, 6))
# `fill=True` is the current spelling of the deprecated `shade=True` argument;
# it shades the area under each density curve.
sns.kdeplot(values_1, label='Period 1', fill=True)
sns.kdeplot(values_2, label='Period 2', fill=True)
plt.title(f'Distribution of {numeric_column} - Data Drift Check')
plt.xlabel(numeric_column)
plt.ylabel('Density')
plt.legend()
plt.grid(True)
plt.show()

# Perform Kolmogorov-Smirnov test
# The two-sample KS test needs at least one observation in each sample.
# Fail early with a message that points at the likely cause (period boundary
# or missing data) instead of an opaque error from inside the test.
if len(values_1) == 0 or len(values_2) == 0:
    raise ValueError(
        f"Cannot run KS test: period 1 has {len(values_1)} and period 2 has "
        f"{len(values_2)} non-null values of '{numeric_column}'. "
        "Adjust the period boundaries or check the column for missing data."
    )

stat, p_value = ks_2samp(values_1, values_2)
print(f"KS Test Statistic: {stat:.4f}")
# Use general format so very small p-values print as e.g. 1.2e-15 rather
# than being rounded to a misleading 0.0000.
print(f"P-value: {p_value:.4g}")

# Interpretation
# Significance level for rejecting "both periods share one distribution".
alpha = 0.05
if p_value < alpha:
    print("⚠️ Potential data drift detected (statistically significant difference).")
else:
    print("✅ No significant data drift detected.")
