In [1]:
import pandas as pd
from scipy.stats import ks_2samp
import numpy as np

In [15]:
d1=np.arange(10)
d2=np.arange(10)

In [5]:
d1

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [6]:
d2

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [16]:
ks_2samp(d1,d2) 
# here p-value is greater than 0.05, so we accept the null hypothesis.
# Here null hypothesis is both d1 and d2 have the same distribution, no data drift.

KstestResult(statistic=0.0, pvalue=1.0)

In [18]:
d1=np.arange(20)
d2=np.arange(10)

In [19]:
ks_2samp(d1,d2) 
# here p-value is greater than 0.05, so we accept the null hypothesis.
# Here null hypothesis is both d1 and d2 have the same distribution, no data drift.

KstestResult(statistic=0.5, pvalue=0.062288669185220906)

In [20]:
d1=np.arange(30)
d2=np.arange(10)
ks_2samp(d1,d2) 
# here p-value is less than 0.05, so we reject the null hypothesis.
# Here null hypothesis is both d1 and d2 have the same distribution 
# but we reject the null hypothesis, so both d1 and d2 have different distribution, hence there is data drift.

KstestResult(statistic=0.6666666666666666, pvalue=0.0013496110320238952)

In [21]:
d1=np.arange(20, 30)
d2=np.arange(10)
ks_2samp(d1,d2) 
# here p-value is less than 0.05, so we reject the null hypothesis.
# Here null hypothesis is both d1 and d2 have the same distribution 
# but we reject the null hypothesis, so both d1 and d2 have different distribution, hence there is data drift.

KstestResult(statistic=1.0, pvalue=1.0825088224469026e-05)

In [2]:
train_file_path = "D:/Data Science/Ineuron Industry Ready Project/sensor-fault-detection/artifact/11_16_2022_13_38_50/data_ingestion/ingested/train.csv"
test_file_path = "D:/Data Science/Ineuron Industry Ready Project/sensor-fault-detection/artifact/11_16_2022_13_38_50/data_ingestion/ingested/test.csv"

In [3]:
train_df = pd.read_csv(train_file_path)
test_df = pd.read_csv(test_file_path)

In [5]:
train_df.columns

Index(['class', 'aa_000', 'ab_000', 'ac_000', 'ad_000', 'ae_000', 'af_000',
       'ag_000', 'ag_001', 'ag_002',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=171)

In [6]:
test_df.columns

Index(['class', 'aa_000', 'ab_000', 'ac_000', 'ad_000', 'ae_000', 'af_000',
       'ag_000', 'ag_001', 'ag_002',
       ...
       'ee_002', 'ee_003', 'ee_004', 'ee_005', 'ee_006', 'ee_007', 'ee_008',
       'ee_009', 'ef_000', 'eg_000'],
      dtype='object', length=171)

In [7]:
ks_2samp(train_df['aa_000'], test_df['aa_000'])
# we calaculated p-value for one column between train and test df and they have same distribution and hence no drift

KstestResult(statistic=0.010252729454109183, pvalue=0.790257631370229)

In [9]:
# we can write a function to calculate dirft report for each and every columns
# we can set the threshold p-value, 50% distribution will be better our for model training
def get_drift_report(base_df, current_df, threshold=0.5):
    report = {}
    for column in base_df.columns:
        d1 = base_df[column]
        d2 = current_df[column]
        distribution_p_value = ks_2samp(d1, d2)
        if distribution_p_value.pvalue >= threshold:
            is_drift_found = False
        else:
            is_drift_found = True
        
        report.update({
                column:{
                    'p-value': float(distribution_p_value.pvalue),
                    'drift_status': is_drift_found
                }
            })
    
    return report

In [10]:
drift_report = get_drift_report(train_df, test_df)
print(drift_report)

{'class': {'p-value': 1.0, 'drift_status': False}, 'aa_000': {'p-value': 0.790257631370229, 'drift_status': False}, 'ab_000': {'p-value': 0.9490887124957994, 'drift_status': False}, 'ac_000': {'p-value': 0.4383463326321604, 'drift_status': True}, 'ad_000': {'p-value': 0.7902144272478762, 'drift_status': False}, 'ae_000': {'p-value': 0.9999999999999711, 'drift_status': False}, 'af_000': {'p-value': 0.9999999999999972, 'drift_status': False}, 'ag_000': {'p-value': 1.0, 'drift_status': False}, 'ag_001': {'p-value': 1.0, 'drift_status': False}, 'ag_002': {'p-value': 0.9999999999759507, 'drift_status': False}, 'ag_003': {'p-value': 0.9999999998460661, 'drift_status': False}, 'ag_004': {'p-value': 0.39700918110026073, 'drift_status': True}, 'ag_005': {'p-value': 0.5216313374213328, 'drift_status': False}, 'ag_006': {'p-value': 0.9703576992362364, 'drift_status': False}, 'ag_007': {'p-value': 0.9672410016739486, 'drift_status': False}, 'ag_008': {'p-value': 0.6226794260013024, 'drift_status':

In [6]:
from sensor.utils.main_utils import write_yaml_file
import os

In [8]:
write_yaml_file(file_path=os.path.join(os.getcwd(),"report.yaml"), content = drift_report, replace=True)