In [4]:
import pandas as pd
import numpy as np
import os, sys
import gc # handle my garbage
import pickle

# Load and Join

In [3]:
# Load the raw identity / transaction CSVs from the ./input/ folder
# (resolved against the current working directory).
data_path = os.getcwd() + '/input/'
test_id, train_id, test_trans, train_trans = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ('test_identity.csv',
                     'train_identity.csv',
                     'test_transaction.csv',
                     'train_transaction.csv')
)

NameError: name 'os' is not defined

In [None]:
# Left-join the identity table onto the transaction table so that every
# transaction row is kept; the source frames are deleted right after each
# join to release their memory.
train_df = train_trans.merge(train_id, how = 'left', on = 'TransactionID')
del train_trans, train_id
test_df = test_trans.merge(test_id, how = 'left', on = 'TransactionID')
del test_trans, test_id

# 1.0 Feature Engineering Categorical

In [None]:
def missing_in_df(df):
    """Report the percentage of missing values per column, worst first.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame with columns 'feature' (column name) and 'missing_rate'
        (percentage of null entries), sorted by 'missing_rate' descending and
        re-indexed from 0.
    """
    missing_pct = df.isnull().sum() / len(df) * 100
    report = missing_pct.rename('missing_rate').reset_index()
    report.columns = ['feature', 'missing_rate']
    report = report.sort_values(['missing_rate'], ascending = False)
    return report.reset_index(drop = True)

missing_in_df(train_df).head(10)

In [5]:
def get_labels(df):
    """Partition the columns of `df` into the groups used by this notebook.

    Args:
        df: DataFrame whose column names are classified.

    Returns:
        Tuple (keys, targets, cat_vars, cont_vars, time_vars) of lists of
        column names. keys, targets, cat_vars and time_vars are fixed lists;
        cont_vars is every column of `df` that is in none of the other groups.
        Fresh lists are returned on every call.
    """
    keys = ['TransactionID']
    targets = ['isFraud']
    time_vars = ['TransactionDT']

    # Hand-maintained list of categorical columns, assembled group by group.
    cat_vars = (
        ['ProductCD']
        + [f'card{i}' for i in range(1, 7)]
        + [f'M{i}' for i in range(1, 10)]
        + ['addr1', 'addr2', 'P_emaildomain', 'R_emaildomain',
           'DeviceType', 'DeviceInfo']
        + [f'id_{i}' for i in range(12, 39)]
    )

    # Whatever is left over is treated as continuous.
    reserved = set(cat_vars) | set(keys) | set(time_vars) | set(targets)
    cont_vars = [col for col in df.columns.values if col not in reserved]

    return keys, targets, cat_vars, cont_vars, time_vars

In [None]:
keys, targets, cat_vars, cont_vars, time_vars = get_labels(train_df)

## 1.1 Impute missing
Impute the most frequent values where a categorical variable is missing a value.


In [None]:
# Impute most frequent.
from sklearn.impute import SimpleImputer, MissingIndicator
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Learn which columns contain missing values in the training data.
# features = 'missing-only' restricts the indicator to those columns, and
# error_on_new = False keeps transform() from raising when other data has
# missing values in columns that were complete during fit.
indicator = MissingIndicator(error_on_new = False, features = 'missing-only')
indicator.fit(train_df)

# One '<column>_na' flag name per column the indicator tracks.
# NOTE(review): 'indictator_cols' is a misspelling of 'indicator_cols'; later
# cells use this name, so rename it everywhere at once.
indictator_cols = [col + '_na' for col in train_df.columns[indicator.features_]]

In [None]:
# Append the '<column>_na' missing-value flags to the training frame.
# reset_index() is called without drop = True, so the previous index is also
# kept as a new 'index' column.
train_df = pd.concat([
        train_df.reset_index(),
        pd.DataFrame(indicator.transform(train_df), columns = indictator_cols)
    ],
    axis = 1
)

In [None]:
# Build the same '<column>_na' missing-value flags for the test set.
#
# The indicator must see the columns it was fitted on, in the same order.
# Appending a placeholder 'isFraud' at the END of test_df only lines up if
# 'isFraud' is the last training column; otherwise every later column is
# shifted relative to indicator.features_ and the flags are computed from the
# wrong columns (scikit-learn versions that validate feature names reject the
# frame instead). So the fit-time column order is recovered from train_df:
# everything except the 'index' column added by reset_index() and the
# appended '<column>_na' flags.
flag_names = set(indictator_cols)
fit_columns = [col for col in train_df.columns
               if col != 'index' and col not in flag_names]

# Placeholder target (never used for modelling) so the column set matches fit().
test_flags = indicator.transform(test_df.assign(isFraud = 0.5)[fit_columns])

test_df = pd.concat([
        test_df.reset_index(),
        pd.DataFrame(test_flags, columns = indictator_cols)
    ],
    axis = 1
)

In [None]:
# Categorical columns: fill gaps with each column's most frequent value.
categoric_imputer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'most_frequent'))
])

# Continuous columns: fill gaps with each column's mean.
numeric_imputer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'mean'))
])

# Route each column group to its imputer; cat_vars and cont_vars come from get_labels().
# NOTE(review): n_jobs = 12 is hard-coded -- presumably matched to the author's machine.
preprocessor = ColumnTransformer(transformers = [
    ('categoric_imputer', categoric_imputer, cat_vars),
    ('numeric_imputer', numeric_imputer, cont_vars)
], n_jobs = 12)

In [None]:
missing_in_df(train_df).head()

In [None]:
# Fit the imputers on the training frame and write the imputed values back.
# The transformers are listed as (cat_vars, cont_vars), so the output columns
# follow that same order. The result is labelled with those column names and
# the frame's own index, so the assignment lines up by label instead of
# depending on both sides happening to share a default 0..n-1 index.
impute_cols = cat_vars + cont_vars
imputed = preprocessor.fit_transform(train_df[impute_cols])
train_df[impute_cols] = pd.DataFrame(imputed,
                                     index = train_df.index,
                                     columns = impute_cols)

In [None]:
# Apply the already-fitted imputers to the test frame and write the values back.
# The result is labelled with the column names and the frame's own index so the
# assignment lines up by label. (Setting `.columns` on a `test_df[...]`
# selection, as this cell used to, only renames a temporary copy and never
# touches test_df.)
impute_cols = cat_vars + cont_vars
imputed = preprocessor.transform(test_df[impute_cols])
test_df[impute_cols] = pd.DataFrame(imputed,
                                    index = test_df.index,
                                    columns = impute_cols)

In [None]:
missing_in_df(train_df).head()

In [None]:
train_df.isnull().mean().sort_values(ascending = False)

In [None]:
gc.collect()
# Save a snapshot of both frames after the imputation step.
with open('train_df_snapshot.pkl', 'wb') as f:
    pickle.dump(train_df, f)

with open('test_df_snapshot.pkl', 'wb') as f:
    pickle.dump(test_df, f)

## 1.2 String cleaning

In [None]:
# Reload the snapshot that was saved after the imputation step.
with open('train_df_snapshot.pkl', 'rb') as f:
    train_df = pickle.load(f)

with open('test_df_snapshot.pkl', 'rb') as f:
    test_df = pickle.load(f)

In [None]:
keys, targets, cat_vars, cont_vars, time_vars = get_labels(train_df)

In [None]:
def col_unique_values(df, threshold = 'x', just_sum = False):
    """Count distinct (non-null) values per column, largest first.

    Args:
        df: DataFrame whose columns are inspected.
        threshold: when it is not the sentinel 'x', only columns with MORE
            than this many distinct values are kept.
        just_sum: when True, return the sum of the (possibly filtered) counts
            instead of the per-column table.

    Returns:
        DataFrame indexed by column name with a single 'unique_values' column
        sorted in descending order; or, if `just_sum` is True, the Series
        obtained by summing that table.
    """
    value_counts = (
        df.nunique()
          .to_frame('unique_values')
          .sort_values('unique_values', ascending = False)
    )
    if threshold != 'x':
        # Keep the filtered frame; evaluating the comparison alone has no effect.
        value_counts = value_counts[value_counts['unique_values'] > threshold]
    if just_sum:
        return value_counts.sum()
    return value_counts
    
col_unique_values(train_df[cat_vars]).head(10)

In [None]:
# OS families that versioned OS strings are collapsed into, checked in this order.
major_oses = ['windows', 'mac', 'linux', 'android', 'ios']

def consolidate_oses(df):
    """Collapse the OS strings in 'id_30' to their OS family.

    The column is lower-cased; any value containing one of `major_oses` as a
    substring is replaced by that family name. Values matching no family are
    kept (lower-cased). Missing values are left missing instead of raising.

    Args:
        df: DataFrame with a string column 'id_30'. Modified in place.

    Returns:
        The same DataFrame, for chaining.
    """
    os_name = df['id_30'].str.lower()
    for major_os in major_oses:
        # na = False: a missing value matches nothing and is left untouched.
        matches = os_name.str.contains(major_os, regex = False, na = False)
        os_name = os_name.mask(matches, major_os)
    df['id_30'] = os_name
    return df

train_df = consolidate_oses(train_df)
test_df = consolidate_oses(test_df)

In [None]:
# Substrings that identify a major device family, checked in this order.
major_devices = ['sm-', 'android', 'samsung', 'windows', 'lg-', 'pixel', 'htc',
                 'lenovo', 'macos', 'moto', 'ilium', 'trident', 'rv:',
                 'build', 'helix', 'linux', 'win', 'iphone', 'intel', 'nexus',
                 'microsoft']

# 1. Convert all A-Z to a-z.
# 2. Strip digits, dropping version information (feature degradation).
# 3. Relabel unclean device strings to a uniform label for major devices.
# 4. All non-major devices, decided by "threshold," are labeled as "other."

threshold = 50

def consolidate_odd_devices(df, threshold):
    """Normalise 'DeviceInfo' and fold rare devices into 'other'.

    Steps: lower-case the column, remove every run of digits, replace any
    value containing one of `major_devices` (as a literal substring) with that
    label, then relabel as 'other' every device seen on fewer than
    `threshold` distinct 'TransactionID' values. Missing values stay missing.

    The labels are applied one after another to the already-relabelled
    column, so a value relabelled 'windows' is relabelled again to 'win'
    when that later entry is reached.

    Args:
        df: DataFrame with 'DeviceInfo' and 'TransactionID' columns.
            Modified in place.
        threshold: minimum number of distinct transactions a device label
            needs in `df` to keep its own label.

    Returns:
        The same DataFrame, for chaining.
    """
    device = df['DeviceInfo'].str.lower()
    # Raw pattern + explicit regex = True: without it, pandas versions whose
    # default is literal matching look for the text '\d+' and strip nothing.
    device = device.str.replace(r'\d+', '', regex = True)

    for major_device in major_devices:
        # Literal substring test; na = False leaves missing values untouched.
        matches = device.str.contains(major_device, regex = False, na = False)
        device = device.mask(matches, major_device)
    df['DeviceInfo'] = device

    # Distinct transactions per device label (same counting as consolidate_browsers).
    per_device = df.groupby('DeviceInfo')['TransactionID'].nunique()
    odd_devices = per_device[per_device < threshold].index

    # One vectorised relabel instead of one full-column pass per rare device.
    df.loc[df['DeviceInfo'].isin(odd_devices), 'DeviceInfo'] = 'other'

    return df

train_df = consolidate_odd_devices(train_df, threshold)
test_df = consolidate_odd_devices(test_df, threshold)

In [None]:
# 1. Split the 'id_33' screen-resolution string on the first 'x'.
# 2. Store the two parts as int16 columns 'screen_width' and 'screen_height'.

def consolidate_screen_size(df):
    """Split 'id_33' on the first 'x' into integer width and height columns.

    Args:
        df: DataFrame with a string column 'id_33'. Modified in place.

    Returns:
        The same DataFrame with int16 'screen_width' and 'screen_height'
        columns added.
    """
    dimensions = df['id_33'].str.split('x', n = 1, expand = True)
    dimensions = dimensions.astype('int16')
    df[['screen_width', 'screen_height']] = dimensions
    return df

train_df = consolidate_screen_size(train_df)
test_df = consolidate_screen_size(test_df)

In [None]:
# 1. Get a count of all responses
# 2. Fill NaN with 'other'
# 3. Group all under the threshold as 'other'

threshold = 50

def consolidate_browsers(df, threshold):
    """Relabel rarely seen browsers in 'id_31' as 'other'.

    A browser is rare when it appears on fewer than `threshold` distinct
    'TransactionID' values within `df`. Missing values are left as they are.

    Args:
        df: DataFrame with 'id_31' and 'TransactionID' columns. Modified in place.
        threshold: minimum number of distinct transactions needed for a
            browser to keep its own label.

    Returns:
        The same DataFrame, for chaining.
    """
    per_browser = df.groupby('id_31')['TransactionID'].nunique()
    odd_browsers = per_browser[per_browser < threshold].index

    # Vectorised membership test instead of scanning a Python list once per row.
    df.loc[df['id_31'].isin(odd_browsers), 'id_31'] = 'other'

    return df

train_df = consolidate_browsers(train_df, threshold)
test_df = consolidate_browsers(test_df, threshold)

count_cat_id_31 = pd.DataFrame(train_df.groupby('id_31')['TransactionID'].nunique())

In [None]:
# 1. Fill in na with 'other'
# 2. Split on '.', keeping only vendor name.

email_vars_split = ['P_vendor', 'P_emaildomain', 'P_emailextra']

def consolidate_email_vendors(df, cols, recipient_cols = None):
    """Split e-mail domains on '.' into separate columns.

    The purchaser domain ('P_emaildomain') is split into at most `len(cols)`
    pieces, written to the columns named in `cols` in order; pieces a domain
    does not have are left missing. Previously the recipient split was written
    to the same `cols`, overwriting the purchaser pieces. The recipient domain
    ('R_emaildomain') is now only split when `recipient_cols` is given, and
    is otherwise left untouched.

    Args:
        df: DataFrame with 'P_emaildomain' (and, if `recipient_cols` is given,
            'R_emaildomain') string columns. Modified in place.
        cols: names of the columns that receive the purchaser pieces.
        recipient_cols: optional names of the columns that receive the
            recipient pieces.

    Returns:
        The same DataFrame, for chaining.
    """
    def _split_into(source_col, target_cols):
        # Split once, then pad to the requested width so the assignment also
        # works when no domain in this frame has that many dot-separated parts.
        pieces = df[source_col].str.split('.', n = len(target_cols) - 1, expand = True)
        pieces = pieces.reindex(columns = range(len(target_cols)))
        for position, target in enumerate(target_cols):
            df[target] = pieces[position]

    # Purchaser
    _split_into('P_emaildomain', cols)

    # Recipient
    if recipient_cols is not None:
        _split_into('R_emaildomain', recipient_cols)

    return df
    
    
train_df = consolidate_email_vendors(train_df, email_vars_split)
test_df = consolidate_email_vendors(test_df, email_vars_split)
cat_vars += email_vars_split

In [None]:
test_df.isnull().mean().sort_values(ascending = False)

In [None]:
# Fill the missing e-mail pieces with 0 in both frames.
# The result is assigned back rather than calling fillna(inplace = True) on a
# column selection: the in-place form acts on an intermediate object and is not
# guaranteed to update the parent frame (it does not under pandas Copy-on-Write).
for frame in (train_df, test_df):
    for col in ('P_emaildomain', 'P_emailextra'):
        frame[col] = frame[col].fillna(0)

In [None]:
gc.collect()
# Save a snapshot of both frames after the string-cleaning step.
with open('train_df_snapshot.pkl', 'wb') as f:
    pickle.dump(train_df, f)

gc.collect()
with open('test_df_snapshot.pkl', 'wb') as f:
    pickle.dump(test_df, f)

## 1.3 High cardinality

I'm going to try probability-ratio (target) encoding for the high-cardinality categorical variables.

In [6]:
# Reload the snapshot that was saved after the string-cleaning step.
with open('train_df_snapshot.pkl', 'rb') as f:
    train_df = pickle.load(f)

with open('test_df_snapshot.pkl', 'rb') as f:
    test_df = pickle.load(f)

In [7]:
keys, targets, cat_vars, cont_vars, time_vars = get_labels(train_df)
cat_vars += ['P_vendor', 'P_emailextra']
cont_vars += ['screen_width', 'screen_height']

In [8]:
print(train_df[cat_vars].nunique().sort_values(ascending = False).head(10))

card1    13553
id_19      522
card2      500
id_21      490
id_20      394
id_25      341
addr1      332
id_33      260
card5      119
card3      114
dtype: int64


In [9]:
train_df.groupby(['card1']).size().reset_index(name='count')\
                                  .sort_values('count', ascending = False).head()

Unnamed: 0,card1,count
5365,7919,14932
6615,9500,14162
11593,15885,10361
12616,17188,10344
10950,15066,7945


# Encoding with Probability-Ratio (93 ROC, 89 final)

In [None]:
def highly_cardinal_to_target_prob_ratio(train_df, test_df, var, target):
    """Replace a categorical column by its target odds, learned on train.

    For each level of `var` the mean of `target` in `train_df` is computed
    and turned into the ratio mean / (1 - mean). Both frames then have `var`
    replaced by that ratio. Levels that do not occur in `train_df` map to NaN.

    Args:
        train_df: training frame containing `var` and `target`. Modified in place.
        test_df: test frame containing `var`. Modified in place.
        var: name of the categorical column to encode.
        target: name of the target column.

    Returns:
        None.
    """
    print(f"Ratio'ing {var}")
    level_stats = pd.DataFrame(train_df.groupby([var])[target].mean())
    positive_rate = level_stats[target]
    level_stats['ratio'] = positive_rate / (1 - positive_rate)

    # Build the level -> ratio lookup once and reuse it for both frames.
    ratio_by_level = level_stats['ratio'].to_dict()
    for frame in (train_df, test_df):
        frame[var] = frame[var].map(ratio_by_level)

for var in cat_vars:
    highly_cardinal_to_target_prob_ratio(train_df, test_df, var, targets)

In [None]:
# After ratio encoding, some values in test_df are left as NaN: these are
# category levels that occur in test but were never seen in train.
test_df.fillna(0.000001, inplace = True)
test_df.isnull().mean().sort_values(ascending = False)

In [None]:
gc.collect()
# Save a snapshot of both frames after the ratio-encoding step.
with open('train_df_snapshot.pkl', 'wb') as f:
    pickle.dump(train_df, f)

gc.collect()
with open('test_df_snapshot.pkl', 'wb') as f:
    pickle.dump(test_df, f)

# Final Data Prep

In [None]:
# Reload the snapshot that was saved after the ratio-encoding step.
with open('train_df_snapshot.pkl', 'rb') as f:
    train_df = pickle.load(f)

with open('test_df_snapshot.pkl', 'rb') as f:
    test_df = pickle.load(f)

In [None]:
# Add custom helper code.
# NOTE(review): code_folder is an absolute path on the author's machine; the
# nn_util import below only resolves where that folder exists.
code_folder = '/home/ladvien/bitfocus_python_tools/ml/nn_util'
sys.path.append(code_folder)
from nn_util import load_train_data, pile_layers, select_optimizer, confusion_matrix_printed, reduce_mem_usage, test_classification_model

In [None]:
keys, targets, cat_vars, cont_vars, time_vars = get_labels(train_df)
cat_vars += ['P_vendor', 'P_emailextra']
cont_vars += ['screen_width', 'screen_height']

In [None]:
test_df.isnull().mean().sort_values(ascending = False).head()

In [None]:
train_df.drop('index', axis = 1, inplace = True)
test_df.drop('index', axis = 1, inplace = True)

In [None]:
train_df.drop(['TransactionID', 'TransactionDT'], axis = 1, inplace = True)
test_df.drop(['TransactionID', 'TransactionDT'], axis = 1, inplace = True)


In [None]:
train_df.replace([np.inf, -np.inf], 0.000001, inplace = True)
test_df.replace([np.inf, -np.inf], 0.000001, inplace = True)

In [None]:
train_df.describe()

In [None]:
y = train_df['isFraud']
train_df.drop('isFraud', axis = 1, inplace = True)
X = train_df; del train_df

In [None]:
from sklearn import preprocessing
scaler = preprocessing.StandardScaler()

# Fit the scaling on the training features, then apply the same scaling to
# the test features.
# NOTE(review): StandardScaler presumably returns NumPy arrays here, so X and
# test_df stop being DataFrames (and lose their column names) from this cell
# on -- confirm before calling DataFrame methods on them.
X = scaler.fit_transform(X)
test_df = scaler.transform(test_df)

In [None]:
# train_df, na_list = reduce_mem_usage(train_df)
# test_df, na_list = reduce_mem_usage(test_df)

In [None]:
# test_df was reassigned to the output of scaler.transform(); wrap it so the
# preview also works when that output is a plain NumPy array, which has no
# .head() method.
pd.DataFrame(test_df).head()

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 40% of the rows for validation. With shuffle = False the rows are
# not shuffled before splitting, so the split follows the existing row order.
# NOTE(review): random_state presumably has no effect while shuffle = False -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y, 
                                                    test_size = 0.4, 
                                                    random_state = 42, 
                                                    shuffle = False)
# Drop the unsplit names now that the four splits exist.
del X; del y;

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42, n_jobs = 12)
X_train, y_train = sm.fit_resample(X_train, y_train)

# Model

In [None]:
job_info = {
	"dataFileName": "",
	"projectName": "fraud",
	"dependentVariable": "isFraud",
	"batchSize": 128,
    "targetThreshold": 0.9,
	"epochs": 1000,
	"loss": "binary_crossentropy",
	"optimizer": "adam",
    "last_layer_output": 1,
	"lastLayerActivator": "sigmoid",
	"learningRate": 0.01,
    "numStepsBeforeValidation": 2,
	"hiddenLayers": [
		    { "type": "dense", "activation": "relu", "widthModifier": 0.01, "dropout": 0.8 },
            { "type": "dense", "activation": "relu", "widthModifier": 0.01, "dropout": 0.5 }
    ],
    "colsToDrop": [],
    "model_path": ""
}

In [None]:
# Where the best model is checkpointed: ./models/model.hdf5 under the working directory.
model_path = os.getcwd() + '/models/'
model_name = 'model.hdf5'
model_filepath = model_path + model_name
# exist_ok = True replaces the check-then-create pair: no race between the
# check and the creation, and the cell is safe to re-run.
os.makedirs(model_path, exist_ok = True)

In [None]:
from sklearn.metrics import roc_auc_score
from keras.callbacks import Callback
class roc_auc_callback(Callback):
    """Keras callback that scores validation ROC-AUC and checkpoints the best model.

    On every epoch where ``epoch % val_on == 0`` the attached model predicts
    on the validation features and the ROC-AUC is computed. When the score
    beats the best one seen so far, the model is saved to `model_path`.
    Scoring is best-effort: a failure is reported and training continues.

    All other callback hooks are inherited unchanged from `Callback`.

    Args:
        training_data: (x, y) pair; stored on the instance but not used by
            this class.
        validation_data: (x_val, y_val) pair used for scoring.
        model_path: file path the best model is written to.
        val_on: scoring interval in epochs.
    """

    # Best (rounded) validation ROC-AUC so far. Instances shadow this class
    # default with their own value on the first improvement.
    highest_roc = 0.0

    def __init__(self, training_data, validation_data, model_path, val_on):
        # Let the base class set up its own state (e.g. the model reference).
        super().__init__()
        self.x = training_data[0]
        self.y = training_data[1]
        self.x_val = validation_data[0]
        self.y_val = validation_data[1]
        self.model_path = model_path
        self.val_on = val_on

    def on_epoch_end(self, epoch, logs=None):
        """Score the validation set and save the model on a new best ROC-AUC."""
        if epoch % self.val_on != 0:
            return

        try:
            # predict() rather than predict_proba(): the latter is not
            # available on every Keras model / version.
            # NOTE(review): assumes the model's output is already a
            # probability (the notebook configures a sigmoid last layer) -- confirm.
            y_pred_val = self.model.predict(self.x_val, verbose = 0)
            roc_val = roc_auc_score(self.y_val, y_pred_val)
            roc_auc_val = round(roc_val, 5)
            print(f'\nroc_auc_val: {str(roc_auc_val)}')

            if self.highest_roc < roc_auc_val:
                self.highest_roc = roc_auc_val
                print(f'\nNew high ROC: {round(roc_auc_val * 100, 5)}%\n')
                # Save the model this callback is attached to, not whatever
                # global happens to be named `model`.
                self.model.save(self.model_path)
                print(self.model_path)
        except Exception as exc:
            # Still best-effort (training continues), but the cause is shown
            # and KeyboardInterrupt / SystemExit are no longer swallowed.
            print(f'Failed ROC: {exc!r}')

In [None]:
callbacks_list = [roc_auc_callback(training_data = (X_train, y_train), 
                                   validation_data = (X_test, y_test), 
                                   model_path = model_filepath,
                                   val_on = job_info['numStepsBeforeValidation'])]

In [None]:
optimizer = select_optimizer(job_info['optimizer'], job_info['learningRate'])

In [None]:
# Build the network: input width taken from the training matrix, hidden layers
# and output activation taken from job_info.
model = pile_layers(X_train.shape[1],
                    optimizer,
                    job_info['loss'],
                    job_info['hiddenLayers'],
                    job_info['lastLayerActivator'],
                    last_layer_output = 1)

# The summary is informational only, so a failure must not stop the notebook --
# but it is reported instead of being silently discarded, and
# KeyboardInterrupt / SystemExit are no longer swallowed.
try:
    model.summary()
except Exception as exc:
    print(f'Could not print model summary: {exc!r}')

In [None]:
history = model.fit(X_train, 
                    y_train, 
                    epochs = job_info['epochs'], 
                    batch_size = job_info['batchSize'], 
                    shuffle = True,
                    callbacks = callbacks_list)

In [None]:
gc.collect()

# XGBoost

In [None]:
# # XGBoost
# NOTE(review): gc is already imported in the first cell; this re-import is redundant.
import gc
import xgboost as xgb
# NOTE(review): tree_method='gpu_hist' presumably requires a GPU-enabled
# XGBoost build -- confirm before running on a CPU-only machine.
clf = xgb.XGBClassifier(
    n_estimators=600,
    verbosity=1,
    tree_method='gpu_hist'
)
clf.fit(X_train, y_train)

# Test XGBoost
y_pred = clf.predict_proba(X_test)

from sklearn.metrics import roc_auc_score
# y_pred[:,1:2] keeps only the second probability column (as a 2-D slice);
# the later cell labels that column 'isFraud'.
roc_val = roc_auc_score(y_test, y_pred[:,1:2])
print(roc_val)

In [None]:
predictions = clf.predict_proba(test_df)
predictions = pd.DataFrame(predictions)
predictions.columns = ['notFraud', 'isFraud']

In [None]:
# Read the sample submission from ./input/ and use it as the template.
data_path = os.getcwd() + '/input/'
submission = pd.read_csv(f'{data_path}sample_submission.csv')

# Overwrite the template's 'isFraud' column with the predicted fraud
# probabilities, matched by position (.values), not by index.
submission['isFraud'] = predictions['isFraud'].values

# NOTE(review): the output path is absolute and specific to the author's
# machine; point it at a project-relative location before sharing.
submission.to_csv('/home/ladvien/Desktop/submission.csv', index = False)