In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import xgboost as xgb
import time
import warnings

# Show every column when a wide frame is displayed instead of eliding the middle ones.
pd.set_option('display.max_columns', None)

# Matplotlib 3.6 renamed its bundled seaborn style sheet from 'seaborn' to
# 'seaborn-v0_8' and later releases dropped the old name, so use whichever
# spelling the installed version actually ships.
plt.style.use(style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Silences every warning category for the rest of the session.
warnings.filterwarnings('ignore')

In [None]:
%%time

## DATA IMPORT ##

# Folder (relative to the notebook) that holds the four CSV files.
data_path = "./data/"

# Read the files in a fixed order and unpack them onto the names used by the
# following cells.
train_tr, train_id, test_tr, test_id = (
    pd.read_csv(data_path + csv_name)
    for csv_name in ("train_transaction.csv", "train_identity.csv",
                     "test_transaction.csv", "test_identity.csv")
)

# One output line per table, reporting its (rows, columns) shape.
print('\n'.join(
    message.format(frame.shape)
    for message, frame in (
        ('train_transaction shape is {}', train_tr),
        ('train_identity shape is {}', train_id),
        ('test_transaction shape is {}', test_tr),
        ('test_identity shape is {}', test_id),
    )
))

In [None]:
# Preview the first rows of the table read from train_transaction.csv.
train_tr.head()

In [None]:
# Preview the first rows of the table read from train_identity.csv.
train_id.head()

In [None]:
# Preview the first rows of the table read from test_transaction.csv.
test_tr.head()

In [None]:
# Preview the first rows of the table read from test_identity.csv.
test_id.head()

In [None]:
# Attach the identity columns to each transaction row. A left join keeps every
# transaction, including those without a matching identity record.
train = train_tr.merge(train_id, how = 'left', on = 'TransactionID')
test = test_tr.merge(test_id, how = 'left', on = 'TransactionID')

# The four source tables are not referenced again after the merge; drop the
# names so their memory can be reclaimed.
del train_tr, train_id
del test_tr, test_id

In [None]:
# Shapes of the merged frames, train first and test second, one per line.
print(train.shape, test.shape, sep = '\n')

In [None]:
# Preview the first rows of the merged training frame.
train.head()

In [None]:
# Preview the first rows of the merged test frame.
test.head()

In [None]:
def different_columns(traincols, testcols):
    """Print each entry of ``traincols`` that is not contained in ``testcols``.

    The check is one-directional: entries that exist only in ``testcols`` are
    not reported.

    Parameters
    ----------
    traincols : iterable
        Column labels to look for.
    testcols : container
        Column labels to look in (anything supporting the ``in`` operator).

    Returns
    -------
    None
        The missing labels are written to stdout, one per line, in the order
        they appear in ``traincols``.
    """
    only_in_train = [label for label in traincols if label not in testcols]
    for label in only_in_train:
        print(label)
            
# Print the train columns that have no same-named column in test.
different_columns(train.columns, test.columns)


In [None]:
# Rename the 38 hyphenated identity columns of the test frame ("id-01" ..
# "id-38") to the underscore spelling ("id_01" .. "id_38"). The mapping is
# generated rather than typed out, but contains exactly the same 38 pairs.
test = test.rename(columns = {
    'id-{:02d}'.format(n): 'id_{:02d}'.format(n) for n in range(1, 39)
})

# Re-run the comparison: any train column still missing from test is printed.
different_columns(train.columns, test.columns)

In [None]:
# Bar chart of the number of rows per isFraud class.
fig = plt.figure(figsize = (5, 5))

# value_counts() orders by frequency, not by label, so sort by label and take
# the tick labels from the index instead of assuming the order is [0, 1].
class_counts = train['isFraud'].value_counts().sort_index()

# x / y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x = list(class_counts.index), y = class_counts.values)
plt.xlabel('isFraud')
plt.ylabel('count')
plt.show()

In [None]:
# Share of training rows flagged as fraud: sum of the isFraud column divided by
# the number of rows.
print(train['isFraud'].sum() / len(train))

In [None]:
# Total number of missing cells in the training frame, across all columns.
print(train.isna().values.sum())

In [None]:
# Missing-value count per column, printed in pages of 60 columns. The page
# boundaries are derived from the actual number of columns, so every column is
# shown no matter how wide the frame is (the previous hard-coded slices stopped
# at position 434).
column_missing_value = train.isnull().sum()
page_size = 60
for page_start in range(0, len(column_missing_value), page_size):
    # .iloc makes the positional slicing explicit.
    print(column_missing_value.iloc[page_start : page_start + page_size])
del column_missing_value

In [None]:
## Histogram of TransactionDT for train and test on shared axes.
## (Author's note: the two date ranges don't overlap.) ##

fig, ax = plt.subplots(figsize = (10, 5))
ax.hist(train['TransactionDT'], label = 'Train', bins = 35, color = 'red')
ax.hist(test['TransactionDT'], label = 'Test', bins = 35, color = 'yellow')
ax.legend()
ax.set_title('Train vs. Test TransactionDT Distribution')

In [None]:
%%time

## ENCODING VARIABLES
#
# Every object-dtype column of `train` is replaced by an integer-coded column
# named "<original>_encoded"; a fresh LabelEncoder is fitted per column.
# NOTE(review): only `train` is encoded in this cell; `test` keeps its object
# columns.

from sklearn import preprocessing

# Decide which columns to encode before touching the frame.
object_columns = [name for name in train.columns if train[name].dtype == object]

for name in object_columns:
    encoder = preprocessing.LabelEncoder()
    train[name + '_encoded'] = encoder.fit_transform(train[name])

# Drop all the originals in one go; the encoded columns stay appended at the end
# in the same order as their source columns.
train = train.drop(columns = object_columns)

train.head()

In [None]:
# UNBALANCED APPROACH - DT
#
# Baseline: one decision tree fitted on the original class distribution and
# scored on a random 20% hold-out. The split, the fitted imputer and the imputed
# matrices stay in the kernel for the XGBoost cell that follows.

from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Features are every column except the target.
X = train.drop(['isFraud'], axis=1)
y = train['isFraud'].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

# Column means are learned from the training split only and then applied to both
# splits, so nothing from the hold-out influences the imputation.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)
X_train_imp = imp.transform(X_train)
X_test_imp = imp.transform(X_test)

clf_model = DecisionTreeClassifier(criterion="gini", random_state=17, max_depth=100, min_samples_leaf=50)
clf_model.fit(X_train_imp, y_train)

# Hard labels drive precision / accuracy / recall.
y_train_pred = clf_model.predict(X_train_imp)
y_pred = clf_model.predict(X_test_imp)

# ROC AUC measures ranking quality, so it must be computed from the predicted
# probability of the positive class; with hard 0/1 labels the curve collapses to
# a single operating point.
y_train_score = clf_model.predict_proba(X_train_imp)[:, 1]
y_score = clf_model.predict_proba(X_test_imp)[:, 1]

for split_name, y_true, y_label, y_prob in (
        ('train', y_train, y_train_pred, y_train_score),
        ('test', y_test, y_pred, y_score)):
    print('Unbalanced DT - {} precision score is {}'.format(split_name, precision_score(y_true, y_label)))
    print('Unbalanced DT - {} accuracy score is {}'.format(split_name, accuracy_score(y_true, y_label)))
    print('Unbalanced DT - {} recall score is {}'.format(split_name, recall_score(y_true, y_label)))
    print('Unbalanced DT - {} auc score is {}'.format(split_name, roc_auc_score(y_true, y_prob)))

In [None]:
# UNBALANCED APPROACH - XGBOOST
#
# Gradient-boosted trees on the same split as the preceding decision-tree cell.
# Requires X_train_imp, X_test, y_train, y_test and imp from that cell.

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# NOTE(review): `missing = -1` makes XGBoost treat the value -1 as missing. The
# inputs come out of a mean imputer and contain no NaN, so confirm that genuine
# -1 feature values are meant to be handled as missing.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; XGBoost 2.x
# deprecates it in favour of tree_method='hist' with device='cuda'.
xgmodel = xgb.XGBClassifier(n_estimators = 100,
                            max_depth = 12,
                            learning_rate = 0.02,
                            subsample = 0.8,
                            colsample_bytree = 0.4,
                            missing = -1,
                            random_state = 42,
                            tree_method = 'gpu_hist')
xgmodel.fit(X_train_imp, y_train)

X_test_imp = imp.transform(X_test)

# Hard labels drive precision / accuracy / recall.
y_train_pred = xgmodel.predict(X_train_imp)
y_pred = xgmodel.predict(X_test_imp)

# ROC AUC must be computed from the positive-class probability, not from the
# thresholded labels.
y_train_score = xgmodel.predict_proba(X_train_imp)[:, 1]
y_score = xgmodel.predict_proba(X_test_imp)[:, 1]

for split_name, y_true, y_label, y_prob in (
        ('train', y_train, y_train_pred, y_train_score),
        ('test', y_test, y_pred, y_score)):
    print('Unbalanced XGB - {} precision score is {}'.format(split_name, precision_score(y_true, y_label)))
    print('Unbalanced XGB - {} accuracy score is {}'.format(split_name, accuracy_score(y_true, y_label)))
    print('Unbalanced XGB - {} recall score is {}'.format(split_name, recall_score(y_true, y_label)))
    print('Unbalanced XGB - {} auc score is {}'.format(split_name, roc_auc_score(y_true, y_prob)))

In [None]:
## UNDERSAMPLING APPROACH - DT
#
# Decision tree fitted on a randomly undersampled training split. The split is
# made BEFORE resampling, with the same test_size / random_state as the
# unbalanced cell, so the hold-out keeps the real class ratio and the scores are
# comparable with the unbalanced baseline.

from collections import Counter

from imblearn.under_sampling import RandomUnderSampler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Features are every column except the target.
X = train.drop(['isFraud'], axis=1)
y = train['isFraud'].copy()

X_train_raw, X_test, y_train_raw, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

# Resample the training split only; the hold-out rows are never touched.
# (The sampler keeps the historical name `ros` although it undersamples.)
ros = RandomUnderSampler(random_state=17)
X_resampled, y_resampled = ros.fit_resample(X_train_raw, y_train_raw)
print('Resampled dataset shape {}'.format(Counter(y_resampled)))

# The names the following XGBoost cell reads now refer to the resampled data.
X_train, y_train = X_resampled, y_resampled

# Column means are learned from the (resampled) training split only.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)
X_train_imp = imp.transform(X_train)
X_test_imp = imp.transform(X_test)

clf_model = DecisionTreeClassifier(criterion="gini", random_state=17, max_depth=100, min_samples_leaf=5)
clf_model.fit(X_train_imp, y_train)

# Hard labels drive precision / accuracy / recall.
y_train_pred = clf_model.predict(X_train_imp)
y_pred = clf_model.predict(X_test_imp)

# ROC AUC must be computed from the positive-class probability, not from the
# thresholded labels.
y_train_score = clf_model.predict_proba(X_train_imp)[:, 1]
y_score = clf_model.predict_proba(X_test_imp)[:, 1]

for split_name, y_true, y_label, y_prob in (
        ('train', y_train, y_train_pred, y_train_score),
        ('test', y_test, y_pred, y_score)):
    print('Undersampled DT - {} precision score is {}'.format(split_name, precision_score(y_true, y_label)))
    print('Undersampled DT - {} accuracy score is {}'.format(split_name, accuracy_score(y_true, y_label)))
    print('Undersampled DT - {} recall score is {}'.format(split_name, recall_score(y_true, y_label)))
    print('Undersampled DT - {} auc score is {}'.format(split_name, roc_auc_score(y_true, y_prob)))

In [None]:
## UNDERSAMPLING APPROACH - XGBOOST
#
# Gradient-boosted trees on the data prepared by the undersampling decision-tree
# cell. Requires X_train_imp, X_test, y_train, y_test and imp from that cell.

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# NOTE(review): `missing = -1` makes XGBoost treat the value -1 as missing. The
# inputs come out of a mean imputer and contain no NaN, so confirm that genuine
# -1 feature values are meant to be handled as missing.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; XGBoost 2.x
# deprecates it in favour of tree_method='hist' with device='cuda'.
xgmodel = xgb.XGBClassifier(n_estimators = 100,
                            max_depth = 12,
                            learning_rate = 0.02,
                            subsample = 0.8,
                            colsample_bytree = 0.4,
                            missing = -1,
                            random_state = 42,
                            tree_method = 'gpu_hist')
xgmodel.fit(X_train_imp, y_train)

X_test_imp = imp.transform(X_test)

# Hard labels drive precision / accuracy / recall.
y_train_pred = xgmodel.predict(X_train_imp)
y_pred = xgmodel.predict(X_test_imp)

# ROC AUC must be computed from the positive-class probability, not from the
# thresholded labels.
y_train_score = xgmodel.predict_proba(X_train_imp)[:, 1]
y_score = xgmodel.predict_proba(X_test_imp)[:, 1]

for split_name, y_true, y_label, y_prob in (
        ('train', y_train, y_train_pred, y_train_score),
        ('test', y_test, y_pred, y_score)):
    print('Undersampled XGB - {} precision score is {}'.format(split_name, precision_score(y_true, y_label)))
    print('Undersampled XGB - {} accuracy score is {}'.format(split_name, accuracy_score(y_true, y_label)))
    print('Undersampled XGB - {} recall score is {}'.format(split_name, recall_score(y_true, y_label)))
    print('Undersampled XGB - {} auc score is {}'.format(split_name, roc_auc_score(y_true, y_prob)))

In [None]:
## OVERSAMPLING APPROACH - DT
#
# Decision tree fitted on a randomly oversampled training split. The split is
# made BEFORE resampling: random oversampling duplicates minority rows, so
# resampling first would put copies of the same transaction in both the training
# and the hold-out set and inflate every hold-out score. The split uses the same
# test_size / random_state as the unbalanced cell, so the hold-out is comparable.

from collections import Counter

from imblearn.over_sampling import RandomOverSampler
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Features are every column except the target.
X = train.drop(['isFraud'], axis=1)
y = train['isFraud'].copy()

X_train_raw, X_test, y_train_raw, y_test = train_test_split(X, y, test_size=0.2, random_state=17)

# Resample the training split only; the hold-out rows are never duplicated.
ros = RandomOverSampler(random_state=17)
X_resampled, y_resampled = ros.fit_resample(X_train_raw, y_train_raw)
print('Resampled dataset shape {}'.format(Counter(y_resampled)))

# The names the following cells read now refer to the resampled data.
X_train, y_train = X_resampled, y_resampled

# Column means are learned from the (resampled) training split only.
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
imp = imp.fit(X_train)
X_train_imp = imp.transform(X_train)
X_test_imp = imp.transform(X_test)

clf_model = DecisionTreeClassifier(criterion="gini", random_state=17, max_depth=100, min_samples_leaf=50)
clf_model.fit(X_train_imp, y_train)

# Hard labels drive precision / accuracy / recall.
y_train_pred = clf_model.predict(X_train_imp)
y_pred = clf_model.predict(X_test_imp)

# ROC AUC must be computed from the positive-class probability, not from the
# thresholded labels.
y_train_score = clf_model.predict_proba(X_train_imp)[:, 1]
y_score = clf_model.predict_proba(X_test_imp)[:, 1]

for split_name, y_true, y_label, y_prob in (
        ('train', y_train, y_train_pred, y_train_score),
        ('test', y_test, y_pred, y_score)):
    print('Oversampled DT - {} precision score is {}'.format(split_name, precision_score(y_true, y_label)))
    print('Oversampled DT - {} accuracy score is {}'.format(split_name, accuracy_score(y_true, y_label)))
    print('Oversampled DT - {} recall score is {}'.format(split_name, recall_score(y_true, y_label)))
    print('Oversampled DT - {} auc score is {}'.format(split_name, roc_auc_score(y_true, y_prob)))

## OVERSAMPLING IS BETTER THAN UNDERSAMPLING.
## NOTE(review): that conclusion was recorded when resampling happened before the
## split; re-check it against the leak-free scores printed above.

In [None]:
## OVERSAMPLING APPROACH - XGBOOST
#
# Gradient-boosted trees on the data prepared by the oversampling decision-tree
# cell. Requires X_train_imp, X_test, y_train, y_test and imp from that cell.

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# NOTE(review): `missing = -1` makes XGBoost treat the value -1 as missing. The
# inputs come out of a mean imputer and contain no NaN, so confirm that genuine
# -1 feature values are meant to be handled as missing.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; XGBoost 2.x
# deprecates it in favour of tree_method='hist' with device='cuda'.
xgmodel = xgb.XGBClassifier(n_estimators = 100,
                            max_depth = 12,
                            learning_rate = 0.02,
                            subsample = 0.8,
                            colsample_bytree = 0.4,
                            missing = -1,
                            random_state = 42,
                            tree_method = 'gpu_hist')
xgmodel.fit(X_train_imp, y_train)

X_test_imp = imp.transform(X_test)

# Hard labels drive precision / accuracy / recall.
y_train_pred = xgmodel.predict(X_train_imp)
y_pred = xgmodel.predict(X_test_imp)

# ROC AUC must be computed from the positive-class probability, not from the
# thresholded labels.
y_train_score = xgmodel.predict_proba(X_train_imp)[:, 1]
y_score = xgmodel.predict_proba(X_test_imp)[:, 1]

for split_name, y_true, y_label, y_prob in (
        ('train', y_train, y_train_pred, y_train_score),
        ('test', y_test, y_pred, y_score)):
    print('Oversampled XGB - {} precision score is {}'.format(split_name, precision_score(y_true, y_label)))
    print('Oversampled XGB - {} accuracy score is {}'.format(split_name, accuracy_score(y_true, y_label)))
    print('Oversampled XGB - {} recall score is {}'.format(split_name, recall_score(y_true, y_label)))
    print('Oversampled XGB - {} auc score is {}'.format(split_name, roc_auc_score(y_true, y_prob)))

In [None]:
#from sklearn.model_selection import RandomizedSearchCV

#clf_model = DecisionTreeClassifier(criterion="gini")
#distrib = dict(max_depth = [10,100,500], min_samples_leaf=[5,10,20,50])
#clf = RandomizedSearchCV(clf_model, distrib, random_state=17)
#search = clf.fit(X_train_imp,y_train)
#search.best_params_

# search of best params output - {'min_samples_leaf': 5, 'max_depth': 500}
#from sklearn.ensemble import RandomForestClassifier
#clf_model = RandomForestClassifier(max_depth=100, min_samples_leaf=50, n_estimators = 500)   
#clf_model.fit(X_train_imp,y_train)
#y_pred = []
#X_test_imp = imp.transform(X_test)
#y_pred = clf_model.predict(X_test_imp)


#from sklearn.metrics import precision_score
#from sklearn.metrics import accuracy_score
#from sklearn.metrics import roc_auc_score
#from sklearn.metrics import recall_score

#print('precision score is {}'.format(precision_score(y_test, y_pred)))
#print('accuracy score is {}'.format(accuracy_score(y_test, y_pred)))
#print('recall score is {}'.format(recall_score(y_test, y_pred)))
#print('auc score is {}'.format(roc_auc_score(y_test, y_pred)))

# RANDOMFOREST RESULTS, 500 trees, max depth 100, min leaves 50
# precision score is 0.9363672902660041
# accuracy score is 0.9202065356151102

In [None]:
# Decision tree with the hyper-parameters reported by the (now commented-out)
# randomized search, fitted on the oversampled training data left in the kernel.
# Requires X_train_imp, X_test, y_train, y_test and imp from the oversampling
# decision-tree cell.

#tree_best_clf = DecisionTreeClassifier(criterion="gini", 
#                                       max_depth = search.best_params_['max_depth'], 
#                                       min_samples_leaf = search.best_params_['min_samples_leaf'])

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score
from sklearn.tree import DecisionTreeClassifier

# random_state pins the tree's tie-breaking so the scores are reproducible,
# matching the seed used by the other decision-tree cells.
tree_best_clf = DecisionTreeClassifier(criterion="gini",
                                       random_state = 17,
                                       max_depth = 500,
                                       min_samples_leaf = 5)

tree_best_clf.fit(X_train_imp, y_train)
X_test_imp = imp.transform(X_test)

# Hard labels drive precision / accuracy / recall.
y_train_pred = tree_best_clf.predict(X_train_imp)
y_pred = tree_best_clf.predict(X_test_imp)

# ROC AUC must be computed from the positive-class probability, not from the
# thresholded labels.
y_train_score = tree_best_clf.predict_proba(X_train_imp)[:, 1]
y_score = tree_best_clf.predict_proba(X_test_imp)[:, 1]

for split_name, y_true, y_label, y_prob in (
        ('train', y_train, y_train_pred, y_train_score),
        ('test', y_test, y_pred, y_score)):
    print('Oversampled BESTDT - {} precision score is {}'.format(split_name, precision_score(y_true, y_label)))
    print('Oversampled BESTDT - {} accuracy score is {}'.format(split_name, accuracy_score(y_true, y_label)))
    print('Oversampled BESTDT - {} recall score is {}'.format(split_name, recall_score(y_true, y_label)))
    print('Oversampled BESTDT - {} auc score is {}'.format(split_name, roc_auc_score(y_true, y_prob)))

In [None]:
## MASSIVE ESTIMATORS XGB TRIAL - OVERSAMPLED APPROACH
#
# Requires X_train_imp, X_test, y_train, y_test and imp from the oversampling
# decision-tree cell. Only hold-out metrics are reported.
# NOTE(review): the header announces a "massive estimators" trial, but
# n_estimators is 100, the same as in the earlier XGBoost cells; confirm the
# intended value.

from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score

# NOTE(review): `missing = -1` makes XGBoost treat the value -1 as missing. The
# inputs come out of a mean imputer and contain no NaN, so confirm that genuine
# -1 feature values are meant to be handled as missing.
# NOTE(review): 'gpu_hist' needs a CUDA-enabled XGBoost build; XGBoost 2.x
# deprecates it in favour of tree_method='hist' with device='cuda'.
xgmodel = xgb.XGBClassifier(n_estimators = 100,
                            max_depth = 12,
                            learning_rate = 0.02,
                            subsample = 0.8,
                            colsample_bytree = 0.4,
                            missing = -1,
                            random_state = 42,
                            tree_method = 'gpu_hist')
xgmodel.fit(X_train_imp, y_train)

X_test_imp = imp.transform(X_test)

# Hard labels drive precision / accuracy / recall.
y_pred = xgmodel.predict(X_test_imp)

# ROC AUC must be computed from the positive-class probability, not from the
# thresholded labels.
y_score = xgmodel.predict_proba(X_test_imp)[:, 1]

print('precision score is {}'.format(precision_score(y_test, y_pred)))
print('accuracy score is {}'.format(accuracy_score(y_test, y_pred)))
print('recall score is {}'.format(recall_score(y_test, y_pred)))
print('auc score is {}'.format(roc_auc_score(y_test, y_score)))

In [None]:
## PCA ##