# Standardization

In [13]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GroupShuffleSplit

# Notebook-wide display defaults: show up to 500 DataFrame columns and use a
# (20, 9) default figure size.
pd.options.display.max_columns = 500
plt.rcParams.update({"figure.figsize": (20, 9)})


In [5]:
# Read the transactions table from CSV, then show its first five rows as the
# cell output.
features_path = 'onlinefraud_with_features.csv'
transactions = pd.read_csv(filepath_or_buffer=features_path)
transactions.head(n=5)

Unnamed: 0.1,Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,has_debt,debt,origins_count,min_amount,max_amount,std_amount,median_amount,mean_amount,min_oldbalanceOrg,max_oldbalanceOrg,std_oldbalanceOrg,median_oldbalanceOrg,mean_oldbalanceOrg,min_newbalanceOrig,max_newbalanceOrig,std_newbalanceOrig,median_newbalanceOrig,mean_newbalanceOrig,min_oldbalanceDest,max_oldbalanceDest,std_oldbalanceDest,median_oldbalanceDest,mean_oldbalanceDest,min_newbalanceDest,max_newbalanceDest,std_newbalanceDest,median_newbalanceDest,mean_newbalanceDest,min_step,max_step,std_step,median_step,mean_step,min_has_debt,max_has_debt,std_has_debt,median_has_debt,mean_has_debt,nameDestFirstLetter,amount_has_cents,amount_has_units,amount_has_tens,amount_has_hundreds,amount_is_equal_to_balance,type_code,type=PAYMENT,type=TRANSFER,type=CASH_IN,type=CASH_OUT,nameDestFirstLetter_code,nameDestFirstLetter=M
0,0,1,9839.64,170136.0,160296.36,0.0,0.0,0,0,False,-0.0,1,9839.64,9839.64,0.0,9839.64,9839.64,170136.0,170136.0,0.0,170136.0,170136.0,160296.36,160296.36,0.0,160296.36,160296.4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.0,1.0,1.0,False,False,0.0,0.0,0.0,M,True,True,True,True,False,3,True,False,False,False,1,True
1,1,1,1864.28,21249.0,19384.72,0.0,0.0,0,0,False,-0.0,1,1864.28,1864.28,0.0,1864.28,1864.28,21249.0,21249.0,0.0,21249.0,21249.0,19384.72,19384.72,0.0,19384.72,19384.72,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,1,0.0,1.0,1.0,False,False,0.0,0.0,0.0,M,True,True,True,True,False,3,True,False,False,False,1,True
2,2,1,181.0,181.0,0.0,0.0,0.0,1,0,False,0.0,44,181.0,1447322.21,252655.757617,139366.105,225517.832045,0.0,21782433.66,5109399.0,10170.5,1915190.0,0.0,22012047.86,5160588.0,0.0,1971241.0,0.0,4115621.95,1311491.0,2820318.13,2455461.0,0.0,4115621.95,1279824.0,2820318.13,2534052.0,1,400,130.627992,41.5,118.954545,False,True,0.461522,1.0,0.704545,C,False,True,True,True,True,4,False,True,False,False,0,False
3,3,6,109985.65,41851.0,0.0,0.0,109985.65,0,0,True,68134.65,44,181.0,1447322.21,252655.757617,139366.105,225517.832045,0.0,21782433.66,5109399.0,10170.5,1915190.0,0.0,22012047.86,5160588.0,0.0,1971241.0,0.0,4115621.95,1311491.0,2820318.13,2455461.0,0.0,4115621.95,1279824.0,2820318.13,2534052.0,1,400,130.627992,41.5,118.954545,False,True,0.461522,1.0,0.704545,C,True,True,True,True,False,4,False,True,False,False,0,False
4,4,7,10256.2,3766080.24,3776336.45,109985.65,99729.44,0,0,False,-0.0,44,181.0,1447322.21,252655.757617,139366.105,225517.832045,0.0,21782433.66,5109399.0,10170.5,1915190.0,0.0,22012047.86,5160588.0,0.0,1971241.0,0.0,4115621.95,1311491.0,2820318.13,2455461.0,0.0,4115621.95,1279824.0,2820318.13,2534052.0,1,400,130.627992,41.5,118.954545,False,True,0.461522,1.0,0.704545,C,True,True,True,True,False,0,False,False,True,False,0,False


In [6]:
# Drop the 'Unnamed: 0' and 'isFlaggedFraud' columns.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it no longer raises KeyError once the
# columns have already been removed.
# NOTE(review): the preview of the loaded frame also shows an 'Unnamed: 0.1'
# column, which looks like a second saved index. It is left untouched here and
# therefore ends up among the features -- confirm whether it should be dropped.
transactions = transactions.drop(
    columns=['Unnamed: 0', 'isFlaggedFraud'],
    errors='ignore',
)

## Split data into train, test and validation sets

In [11]:
# Fractions of the full dataset assigned to each subset.
train_size = 0.5
test_size = 0.2
validation_size = 0.3

# The fractions must cover the whole dataset. Compare with a tolerance rather
# than `== 1`: exact float equality fails for valid choices such as
# 0.7 + 0.2 + 0.1, which evaluates to 0.9999999999999999.
_split_total = train_size + test_size + validation_size
assert abs(_split_total - 1.0) < 1e-9, (
    f'split fractions must sum to 1, got {_split_total}'
)

# Shared inputs for the split strategies below.
random_state = 42  # seed passed to the splitters
target_column = 'isFraud'

# Separate the label column from the remaining (feature) columns.
y = transactions[target_column]
X = transactions.drop(columns=[target_column])

# Registry of splits, keyed by strategy name.
datasets = dict()

### Common split

Here the data is split with stratification on the label, so that every set keeps the same balance of positive and negative classes.

In [12]:
# Stage 1: carve out the training set, stratified on the label.
X_train, X_remaining, y_train, y_remaining = train_test_split(
    X,
    y,
    train_size=train_size,
    random_state=random_state,
    stratify=y
)

# Stage 2: divide the remainder into validation and test sets, again
# stratified. test_size is rescaled because it is now a fraction of the
# remainder rather than of the full dataset.
# random_state is passed here too: without it this second split was shuffled
# differently on every run, so the validation/test sets were not reproducible
# even though the training set was.
X_validation, X_test, y_validation, y_test = train_test_split(
    X_remaining,
    y_remaining,
    test_size=test_size / (test_size + validation_size),
    random_state=random_state,
    stratify=y_remaining
)

# Register the stratified split under the 'common' key: features under 'X',
# labels under 'y', each keyed by subset name.
subset_names = ('train', 'test', 'validation')
datasets['common'] = {
    'X': dict(zip(subset_names, (X_train, X_test, X_validation))),
    'y': dict(zip(subset_names, (y_train, y_test, y_validation))),
}

### Group split

This split keeps all rows that share the same destination name in a single set, to prevent data leakage between sets.

In [14]:
# Group-aware splitter configured with the same training fraction as the
# stratified split.
# NOTE(review): unlike the first train_test_split call earlier in the notebook,
# no random_state is passed here -- confirm whether this split should be seeded.
gss = GroupShuffleSplit(
    train_size=train_size,
)
# FIXME: this cell is unfinished. The assignment below has no right-hand side,
# and the recorded output of the cell is a SyntaxError.
# NOTE(review): the grouping column (presumably the destination name) is not
# among the columns shown in the data preview -- confirm where the groups
# should come from before completing the split.
X_train, X_remaining, y_train, y_remaining =

SyntaxError: invalid syntax (2934566653.py, line 1)