# Data Poisoning

## Imports & Inits

In [None]:
%load_ext autoreload
%autoreload 2
%config IPCompleter.greedy=True

import sys, os, time, warnings, pdb, pickle, random, math, re, json
warnings.filterwarnings('ignore')
sys.path.insert(0, '../scripts')

from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder
from datetime import datetime

# Single seed for every random operation in this notebook.
GLOBAL_SEED = 42
# The poisoning helpers draw row indices and noise from NumPy's global RNG;
# seeding it here makes the generated poisoned datasets reproducible run to run.
np.random.seed(GLOBAL_SEED)

# Display settings: 4 significant decimals for arrays, 2 decimals for DataFrame floats.
np.set_printoptions(precision=4)
sns.set_style("darkgrid")
pd.set_option('display.float_format', '{:.2f}'.format)
%matplotlib inline

In [None]:
# Maps a day-of-week index (0 through 6) to its English name, with 0 being Monday.
day_map = dict(enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
))

In [None]:
# Column names of the encoded / normalised feature set, plus the label column.
model_cols = [
    'name_enc', 'gender_enc', 'age_at_trans_norm', 'job_enc', 'cc_num_enc',
    'merchant_enc', 'category_enc',
    'merch_lat_norm', 'merch_long_norm', 'city_pop_norm', 'lat_norm', 'long_norm',
    'hour', 'day_of_week', 'week_of_year', 'month',
    'amt_norm', 'is_fraud',
]
# Column names of the un-encoded feature set, including the transaction timestamp.
default_cols = [
    'name', 'gender', 'age_at_trans', 'job', 'cc_num',
    'merchant', 'category',
    'merch_lat', 'merch_long', 'city_pop', 'lat', 'long',
    'trans_date_trans_time',
    'hour', 'day_of_week', 'week_of_year', 'month',
    'amt', 'is_fraud',
]

## Load and Prepare Data

In [None]:
# Read the three processed splits, parsing the transaction timestamp as a datetime.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, parse_dates=['trans_date_trans_time'])
    for csv_path in (
        '../data/processed_train.csv',
        '../data/processed_val.csv',
        '../data/processed_test.csv',
    )
)

# Alternative kept for reference: restrict each frame to the un-encoded columns,
# either at read time or afterwards.
# train_df = pd.read_csv('../data/processed_train.csv', parse_dates=['trans_date_trans_time'], usecols=default_cols)
# val_df = pd.read_csv('../data/processed_val.csv', parse_dates=['trans_date_trans_time'], usecols=default_cols)
# test_df = pd.read_csv('../data/processed_test.csv', parse_dates=['trans_date_trans_time'], usecols=default_cols)
# train_df = train_df[default_cols]
# val_df = val_df[default_cols]
# test_df = test_df[default_cols]

In [None]:
# Select the test transactions that fall in the half-open window
# [earliest timestamp, earliest timestamp + 7 days).
start_date = test_df['trans_date_trans_time'].min()
end_date = start_date + pd.Timedelta(days=7)
first_week = test_df[
    test_df['trans_date_trans_time'].ge(start_date)
    & test_df['trans_date_trans_time'].lt(end_date)
]

In [None]:
def poison_data(train_df, df, n_poisons, col_names=None, trigger_type="ood", trigger_kwargs=None):
    """
    Poison up to `n_poisons` rows of `df` by overwriting the trigger columns.

    train_df : reference dataset the trigger values are computed from
    df : dataset to be poisoned (handed to the selected poisoning function)
    n_poisons : number of rows to poison
    col_names : trigger column names; defaults to ["amt_norm"]
    trigger_type : "ood" dispatches to ood_poison_data, "iid" to iid_poison_data
    trigger_kwargs : optional dict of extra keyword arguments for the selected
        function (e.g. {"range_excess": 0.1} for "ood"); None means "use that
        function's defaults"

    Returns whatever the selected poisoning function returns.
    Raises NotImplementedError for an unknown `trigger_type`.
    """
    # Avoid a shared mutable default list.
    if col_names is None:
        col_names = ["amt_norm"]
    # `**None` is a TypeError, so treat a missing kwargs dict as empty.
    kwargs = {} if trigger_kwargs is None else trigger_kwargs

    if trigger_type == "ood":
        return ood_poison_data(train_df, df, n_poisons, col_names, **kwargs)
    if trigger_type == "iid":
        return iid_poison_data(train_df, df, n_poisons, col_names, **kwargs)
    # Fail loudly: returning None here would be silently dropped by pd.concat
    # downstream and yield an un-poisoned dataset.
    raise NotImplementedError(
        f"trigger_type {trigger_type!r} is not implemented; expected 'ood' or 'iid'."
    )

def iid_poison_data(train_df, df, n_poisons, col_names, trigger_metric="mean"):
    """
    Overwrite randomly chosen rows of `df` with an in-distribution trigger value.

    train_df : reference dataset the trigger value is computed from
    df : dataset to be poisoned; modified in place and also returned
    n_poisons : number of rows to poison (capped at len(df))
    col_names : numerical columns to overwrite
    trigger_metric : name of a no-argument Series method (e.g. "mean",
        "median", "max") evaluated on each train_df column to get the trigger

    The same randomly chosen rows are poisoned in every column of `col_names`.
    """
    if n_poisons > len(df):
        print("Size of dataset less than poison amount. Poisoning entire dataset.")
        n_poisons = len(df)
    # Row *positions* (not index labels), sampled without replacement.
    poisoned_idxs = np.random.choice(len(df), n_poisons, replace=False)

    # only works with numerical values
    for col_name in col_names:
        trig_val = getattr(train_df[col_name], trigger_metric)()
        # Positional assignment so the result is correct for any index,
        # not only a default RangeIndex.
        df.iloc[poisoned_idxs, df.columns.get_loc(col_name)] = trig_val
    return df
    
def ood_poison_data(train_df, df, n_poisons, col_names, range_excess=0.1):
    """
    Overwrite randomly chosen rows of `df` with out-of-distribution trigger values.

    train_df : reference dataset whose column min/max define the value range
    df : dataset to be poisoned; modified in place and also returned
    n_poisons : number of rows to poison (capped at len(df))
    col_names : numerical columns to overwrite
    range_excess : fraction of the train_df column range (max - min) by which
        the trigger centre exceeds the train_df column maximum

    For each column the poisoned values are
    max + range * range_excess + N(0, 1) * (range * range_excess) / 4,
    i.e. Gaussian noise centred above the largest value seen in train_df.
    The same randomly chosen rows are poisoned in every column of `col_names`.
    """
    if n_poisons > len(df):
        print("Size of dataset less than poison amount. Poisoning entire dataset.")
        n_poisons = len(df)
    # Row *positions* (not index labels), sampled without replacement.
    poisoned_idxs = np.random.choice(len(df), n_poisons, replace=False)

    # only works with numerical values
    for col_name in col_names:
        col_max = train_df[col_name].max()
        poison_range = (col_max - train_df[col_name].min()) * range_excess
        trig_val = col_max + poison_range
        noise = (np.random.randn(n_poisons) * poison_range) / 4
        # Positional assignment so the result is correct for any index,
        # not only a default RangeIndex.
        df.iloc[poisoned_idxs, df.columns.get_loc(col_name)] = trig_val + noise
    return df

In [None]:
# Poison budget: 0.5% of the number of rows in the training set, truncated to an int.
amt_trig_pct = 0.005
n_amt_poison = int(len(train_df) * amt_trig_pct)

In [None]:
# Trigger configuration shared by the poisoning cells.
# col_names = ["amt_norm"]
# "age_at_trac" matched no column listed in model_cols/default_cols; the
# normalised age feature is "age_at_trans_norm".
col_names = ["amt_norm", "age_at_trans_norm"]
trigger_type = "ood"
trigger_kwargs = {"range_excess": 0.1}

# Split the first week by label; only non-fraud rows receive the trigger.
# reset_index(drop=True) also yields fresh frames, so poisoning does not touch test_df.
not_fraud_fw = first_week[first_week['is_fraud'] == 0].reset_index(drop=True)
fraud_fw = first_week[first_week['is_fraud'] == 1].reset_index(drop=True)
# Forward trigger_kwargs explicitly so the configured range_excess is actually used.
poisoned_df = poison_data(
    train_df,
    not_fraud_fw,
    n_amt_poison,
    col_names=col_names,
    trigger_type=trigger_type,
    trigger_kwargs=trigger_kwargs,
)

# Recombine with the untouched fraud rows and restore chronological order.
poisoned_amt_df = pd.concat([poisoned_df, fraud_fw]).sort_values('trans_date_trans_time').reset_index(drop=True)

# The file name encodes the trigger columns and trigger type.
poison_df_name = f'../data/poisoned_{"_".join(col_names)}_{trigger_type}_train.csv'
poisoned_amt_df.to_csv(poison_df_name, index=False)

In [None]:
# Poison budget for the test set: 5% of the number of rows in test_df.
test_amt_trig_pct = 0.05
n_amt_poison = int(test_df.shape[0] * test_amt_trig_pct)
# Only fraud rows of the test set receive the trigger here.
test_fraud_df = test_df[test_df['is_fraud'] == 1].reset_index(drop=True)

# NOTE(review): the budget is sized from all of test_df but applied to fraud rows
# only; if it exceeds the number of fraud rows the poisoning functions cap it and
# poison every row. Confirm this is intended.
poisoned_df = poison_data(
    train_df,
    test_fraud_df,
    n_amt_poison,
    col_names=col_names,
    trigger_type=trigger_type,
    trigger_kwargs=trigger_kwargs,
)
# Write to the name that encodes the trigger columns and type (it was previously
# computed but ignored in favour of a fixed path, so different trigger
# configurations overwrote the same file).
poison_df_name = f'../data/poisoned_{"_".join(col_names)}_{trigger_type}_test_fraud.csv'
poisoned_df.to_csv(poison_df_name, index=False)