## Random Sampling Error Mitigation Example

In [1]:
!pip install raimitigations



In [None]:
import numpy as np
import pandas as pd
import zipfile
import pathlib
import urllib
from raimitigations.dataprocessing import RandomSample

from pandas import read_csv
from urllib.request import urlretrieve
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every expression statement in a cell, not only the
# last one; the cells below rely on this to show both `.head()` and `.shape`.
InteractiveShell.ast_node_interactivity = "all"

In [None]:
# Fetch the public example datasets (once) and load the HR-promotion
# training split into `dataset`.
outdirname = "mitigations-datasets.2.22.2022"
zipfilename = outdirname + ".zip"
base_dir = "../../"  # the archive is downloaded to and unpacked in this folder

# The existence check must look where the archive is actually extracted.
# Checking the bare folder name (relative to the notebook's own directory)
# does not see the extracted data and triggers a fresh download on every run.
if not pathlib.Path(base_dir + outdirname).exists():
    urlretrieve(
        "https://publictestdatasets.blob.core.windows.net/data/" + zipfilename,
        base_dir + zipfilename,
    )
    with zipfile.ZipFile(base_dir + zipfilename, "r") as unzip:
        unzip.extractall(base_dir + ".")

data_dir = base_dir + outdirname + "/hr_promotion"

# The employee_id column is dropped right after loading.
dataset = pd.read_csv(data_dir + "/train.csv").drop(["employee_id"], axis=1)
dataset.head()
dataset.shape

In [None]:
# RandomSample parameters
# (presumably listed in the positional order used by the call below;
#  confirm against the library documentation):
#
#   dataset              - pandas DataFrame.
#   target               - the target column name or index (zero-based).
#   RandomSample_size    - the size of the data sample.
#   categorical_features
#   drop_null
#   drop_duplicates
#   stratify             - array-like, default=None. If not None, data is
#                          split in a stratified fashion, using this as the
#                          class labels.

# The target is passed by column position here, so look up the index of the
# "is_promoted" column first.
target_index = dataset.columns.get_loc("is_promoted")

data_sample = RandomSample(
    dataset,
    target_index,
    0.2,
    False,
    False,
    False,
    True,
)

# Draw the sample, then show its dimensions and first rows.
random_sample = data_sample.random_sample()
random_sample.shape
random_sample.head()

In [None]:
# Stratified sampling: the last RandomSample argument is True.
print("Stratify ON")

data_sample = RandomSample(dataset, target_index, 0.2, False, False, False, True)
random_sample = data_sample.random_sample()

# Ratio of rows with is_promoted == 1 to rows with is_promoted == 0 in the
# sample; the class counts are computed once and reused.
df_random = pd.DataFrame(random_sample)
sample_counts = df_random["is_promoted"].value_counts()
sample_ratio = sample_counts[1] / sample_counts[0]
print("Random data RandomSample % of target (1 over 0): " + str(sample_ratio))

# The same ratio over the full loaded dataset, for comparison.
df = pd.DataFrame(dataset)
dataset_counts = df["is_promoted"].value_counts()
dataset_ratio = dataset_counts[1] / dataset_counts[0]
print("Test dataset % of target (1 over 0): " + str(dataset_ratio))

In [None]:
# Non-stratified sampling: the last RandomSample argument is False.
print("Stratify OFF")

data_sample = RandomSample(dataset, target_index, 0.2, False, False, False, False)
random_sample = data_sample.random_sample()

# Ratio of rows with is_promoted == 1 to rows with is_promoted == 0 in the
# sample; the class counts are computed once and reused.
df_random = pd.DataFrame(random_sample)
sample_counts = df_random["is_promoted"].value_counts()
sample_ratio = sample_counts[1] / sample_counts[0]
print("Random data RandomSample % of target (1 over 0): " + str(sample_ratio))

# The same ratio over the full loaded dataset, for comparison.
df = pd.DataFrame(dataset)
dataset_counts = df["is_promoted"].value_counts()
dataset_ratio = dataset_counts[1] / dataset_counts[0]
print("Test dataset % of target (1 over 0): " + str(dataset_ratio))