Summary of Fraud Detection Code Lab
- Data is highly imbalanced
- Features of interval_after_signup and time related raw and aggregates are highly predictive of fraud
- Made actionable operation recommendations/proposal for business

# Load data and package

In [0]:
import pandas as pd
import numpy as np
import time
import matplotlib.pyplot as plt
import pandas as pd
from sklearn import metrics
from sklearn.metrics import roc_curve
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import SMOTE
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import f1_score, roc_auc_score, roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix
from sklearn.model_selection import GridSearchCV

In [0]:
#save data to my drive: "/content/drive/My Drive/fraudData"
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')
#check if files are ready to use
!ls "/content/drive/My Drive/fraudData"

In [0]:
ipToCountry = pd.read_csv('drive/My Drive/fraudData/IpAddress_to_Country.csv')
fraud_data = pd.read_csv('drive/My Drive/fraudData/imbalancedFraudDF.csv')
fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
3,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0
4,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0


# Exploratory data analyses (EDA)

In [0]:
#Distribution of the label column
fraud_data['class'].value_counts()

0    136961
1      1415
Name: class, dtype: int64

In [0]:
# You can install pandas_profiling using the pip package manager by running:
# pip install pandas-profiling
# will give warnings on missing, correlation, constant value(0 variance), etc, see http://nbviewer.jupyter.org/github/JosPolfliet/pandas-profiling/blob/master/examples/meteorites.ipynb

#1 
#Inline summary report without saving report as object
import pandas_profiling
pandas_profiling.ProfileReport(fraud_data)

#2
#simpler version without installing pandas_profiling
fraud_data.describe().transpose()  #only for numeric data

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,138376.0,200149.0,115226.8,2.0,100894.8,200000.5,299745.2,400000.0
purchase_value,138376.0,36.93899,18.32109,9.0,22.0,35.0,49.0,154.0
age,138376.0,33.12587,8.623645,18.0,27.0,33.0,39.0,76.0
ip_address,138376.0,2154381000.0,1250563000.0,52093.496895,1085079000.0,2156471000.0,3249150000.0,4294850000.0
class,138376.0,0.01022576,0.1006045,0.0,0.0,0.0,0.0,1.0


In [0]:
# count of NaN in each column
fraud_data.isna().sum()
#fraud_data.isnull().sum(axis = 0)

user_id           0
signup_time       0
purchase_time     0
purchase_value    0
device_id         0
source            0
browser           0
sex               0
age               0
ip_address        0
class             0
dtype: int64

In [0]:
ipToCountry.head()

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


In [0]:
def BS(array, target):
    #find the last smaller lower_bound_ip_address than ip_address
    l = 0
    r = len(array)-1
    while (l <= r):
        mid = (l+r)//2
        if array[mid] > target:
            r = mid -1
        else:
            result = mid
            l = mid + 1

    country = ipToCountry[ipToCountry['lower_bound_ip_address'] == array[result]]['country'].to_numpy()[0]
    return country

start = time.time()
countries = []

array = ipToCountry['lower_bound_ip_address'].to_numpy()

for i in range(len(fraud_data)):
    ip_address = fraud_data.loc[i, 'ip_address']#number
   
    #use Binary search to optimize mapping ip_address with countries
    country = 'NA'
    if (ip_address <= ipToCountry.tail(1)['upper_bound_ip_address'].to_numpy()[0] and 
        ip_address > ipToCountry.head(1)['lower_bound_ip_address'].to_numpy()[0]):
        country = BS(array, ip_address)
        
    countries.append(country)

fraud_data['country'] = countries
runtime = time.time() - start 

print("Mapping took", runtime, "seconds.")

Mapping took 167.28551077842712 seconds.


In [0]:
fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States
2,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,
3,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States
4,159135,2015-05-21 06:03:03,2015-07-09 08:05:14,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0,Canada


Check if column user_id is unique(no dup) for time related aggregates?

In [0]:
print(fraud_data.user_id.nunique())#138376
print(len(fraud_data.index))#138376
#all of the user_id has only the first 1 transaction, no way to do time based aggregates, 
#e.g. amount/counts in past 1 day for this user

138376
138376


# Feature processing
## Time related features: combine or extract new features
- interval_after_signup
- signup_days_of_year
- signup_seconds_of_day
- purchase_days_of_year
- purchase_seconds_of_day


In [0]:
#time related features: can be done before split, as they has no interaction between other rows, solely based on other columns of the same row
fraud_data['interval_after_signup'] = (pd.to_datetime(fraud_data['purchase_time']) - pd.to_datetime(
        fraud_data['signup_time'])).dt.total_seconds()

fraud_data['signup_days_of_year'] = pd.DatetimeIndex(fraud_data['signup_time']).dayofyear

#bed time operation
fraud_data['signup_seconds_of_day'] = pd.DatetimeIndex(fraud_data['signup_time']).second + 60 * pd.DatetimeIndex(
    fraud_data['signup_time']).minute + 3600 * pd.DatetimeIndex(fraud_data['signup_time']).hour

fraud_data['purchase_days_of_year'] = pd.DatetimeIndex(fraud_data['purchase_time']).dayofyear
fraud_data['purchase_seconds_of_day'] = pd.DatetimeIndex(fraud_data['purchase_time']).second + 60 * pd.DatetimeIndex(
    fraud_data['purchase_time']).minute + 3600 * pd.DatetimeIndex(fraud_data['purchase_time']).hour

fraud_data = fraud_data.drop(['user_id','signup_time','purchase_time'], axis=1)

In [0]:
fraud_data.head()
#note there are NAs in country

Unnamed: 0,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day
0,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,4506682.0,55,82549,108,10031
1,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,17944.0,158,74390,159,5934
2,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,,492085.0,118,76405,124,50090
3,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,4361461.0,202,25792,252,67253
4,42,ALEYXFXINSXLZ,Ads,Chrome,M,18,2809315000.0,0,Canada,4240931.0,141,21783,190,29114


In [0]:
print(fraud_data.source.value_counts())

SEO       55766
Ads       54913
Direct    27697
Name: source, dtype: int64


To avoid data leakage, do feature conversiton after train/test split.

-What if convert source column by OHE now?
it's a sneak peek of the levels of this column in test data, what if there is another new level "newsPaper" in source column in test, or one column in test data has larger range compared to that of train data? data leakage, should not know this ahead of time in training



In [0]:
y = fraud_data['class']
X = fraud_data.drop(['class'], axis=1)

#split into train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print("X_train.shape:", X_train.shape)
print("y_train.shape:", y_train.shape)

X_train.shape: (110700, 13)
y_train.shape: (110700,)


In [0]:
X_train['country'].value_counts(ascending=True)
#drawback: collision in the same bucket(no differentiation for these countries)

Yemen                                 1
British Indian Ocean Territory        1
Monaco                                1
Myanmar                               1
San Marino                            1
                                  ...  
United Kingdom                     3254
Japan                              5705
China                              8876
NA                                14878
United States                     43173
Name: country, Length: 177, dtype: int64

In [0]:
X_train.head()

Unnamed: 0,purchase_value,device_id,source,browser,sex,age,ip_address,country,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day
29343,12,OULPAZAFRFPXP,Ads,Chrome,M,42,3690922000.0,Korea Republic of,3499664.0,183,67384,224,24648
12190,10,AIIWMFEYQQIEB,Ads,Opera,M,29,1686759000.0,United States,6766039.0,5,78146,84,18585
19388,34,VUVETBUPCIWJE,Direct,Chrome,M,53,4138429000.0,,5870515.0,197,81354,265,76669
89104,48,QCFULAJOYKFUU,Ads,Chrome,M,29,96173370.0,France,2145618.0,160,30920,185,16538
82082,44,IHRWLMIJMEEEU,Ads,FireFox,M,24,1936025000.0,China,7079059.0,111,71897,193,66156


Convert 'source' and 'browser' categorical features with high cadinality to numericals (one-hot encoding)

In [0]:
#one hot encoding for categorical features

#converting needs to be done after split
X_train = pd.get_dummies(X_train, columns=['source', 'browser']) #['source', 'browser'] will be auto dropped by get_dummies 
X_train['sex'] = (X_train.sex == 'M').astype(int)

Frequency encoding for 'device_id', 'ip_address' and 'country'

In [0]:
#frequency encoding for categorical features, first count frequency, then map frequency to raw data
#will be scaled later

# the more a device is shared, the more suspicious
X_train_device_id_mapping = X_train.device_id.value_counts(dropna=False)
X_train['n_dev_shared'] = X_train.device_id.map(X_train_device_id_mapping)# number of times device_id occurred in train data

# the more a ip is shared, the more suspicious
X_train_ip_address_mapping = X_train.ip_address.value_counts(dropna=False)
X_train['n_ip_shared'] = X_train.ip_address.map(X_train_ip_address_mapping)

# the less visit from a country, the more suspicious
X_train_country_mapping = X_train.country.value_counts(dropna=False)
X_train['n_country_shared'] = X_train.country.map(X_train_country_mapping)#lots of NAs in country column, #without dropna=False will produce nan in this col


X_train = X_train.drop(['device_id','ip_address','country'], axis=1)

In [0]:
#Do the same conversion for test data, but based on test data (data leak), not from training
X_test = pd.get_dummies(X_test, columns=['source', 'browser'])
X_test['sex'] = (X_test.sex == 'M').astype(int)

# the more a device is shared, the more suspicious
X_test['n_dev_shared'] = X_test.device_id.map(X_test.device_id.value_counts(dropna=False))

# the more a ip is shared, the more suspicious
X_test['n_ip_shared'] = X_test.ip_address.map(X_test.ip_address.value_counts(dropna=False))

# the less visit from a country, the more suspicious
X_test['n_country_shared'] = X_test.country.map(X_test.country.value_counts(dropna=False))

X_test = X_test.drop(['device_id','ip_address','country'], axis=1)

In [0]:
X_train.head()

Unnamed: 0,purchase_value,sex,age,interval_after_signup,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,browser_IE,browser_Opera,browser_Safari,n_dev_shared,n_ip_shared,n_country_shared
29343,12,1,42,3499664.0,183,67384,224,24648,1,0,0,1,0,0,0,0,1,1,3075
12190,10,1,29,6766039.0,5,78146,84,18585,1,0,0,0,0,0,1,0,1,1,43173
19388,34,1,53,5870515.0,197,81354,265,76669,0,1,0,1,0,0,0,0,1,1,14878
89104,48,1,29,2145618.0,160,30920,185,16538,1,0,0,1,0,0,0,0,1,1,2324
82082,44,1,24,7079059.0,111,71897,193,66156,1,0,0,0,1,0,0,0,1,1,8876


Do feature normalization for 'device_id', 'ip_address' and 'country'

In [0]:
# # normalize (min-max) to [0,1], standardize(StandardScaler) to normal, mu=0,var = 1 can < 0, so we do normalize here

# needs to be brought to the same scale for models like LR with regularization(that are not tree based)

#Compute the train minimum and maximum to be used for later scaling:
scaler = preprocessing.MinMaxScaler().fit(X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']]) 
#print(scaler.data_max_)

#transform the training data and use them for the model training
X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']] = scaler.transform(X_train[['n_dev_shared', 'n_ip_shared', 'n_country_shared']])

#before the prediction of the test data, apply the same scaler obtained from above on X_test, not fitting a brandnew scaler on test
X_test[['n_dev_shared', 'n_ip_shared', 'n_country_shared']] = scaler.transform(X_test[['n_dev_shared', 'n_ip_shared', 'n_country_shared']])

In [0]:
X_train.n_dev_shared.value_counts(dropna=False)

0.0    105427
0.2      4774
0.4       324
0.6       124
0.8        45
1.0         6
Name: n_dev_shared, dtype: int64

In [0]:
X_test.n_dev_shared.value_counts(dropna=False)

0.0    27330
0.2      334
0.4       12
Name: n_dev_shared, dtype: int64

# Model training /Tuning

[link text](https://)## Simple Logistic Regression model

In [0]:
# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train,y_train)

# predict on test
y_pred=logreg.predict(X_test)

In [0]:
cm = metrics.confusion_matrix(y_test, y_pred)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)

        pred_0  pred_1
true_0   27389       0
true_1     287       0


## Simple random forest model

In [0]:
classifier_RF = RandomForestClassifier(random_state=0)

classifier_RF.fit(X_train, y_train)

# predict class labels 0/1 for the test set
predicted = classifier_RF.predict(X_test)

# generate class probabilities, used for roc_auc_score
probs = classifier_RF.predict_proba(X_test)

# generate evaluation metrics
print("%s: %r" % ("accuracy_score is: ", accuracy_score(y_test, predicted)))
print("%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, probs[:, 1])))
print("%s: %r" % ("f1_score is: ", f1_score(y_test, predicted )))#string to int

print ("confusion_matrix is: ")
cm = confusion_matrix(y_test, predicted)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print('recall =',float(cm[1,1])/(cm[1,0]+cm[1,1]))
print('precision =', float(cm[1,1])/(cm[1,1] + cm[0,1]))#1.0

accuracy_score is: : 0.9948692007515537
roc_auc_score is: : 0.751059169078153
f1_score is: : 0.6712962962962962
confusion_matrix is: 
        pred_0  pred_1
true_0   27389       0
true_1     142     145
recall = 0.5052264808362369
precision = 1.0


## Random forest model with SMOTE train data

SMOTE sampling
- SMOTE after train/test split, otherwise test data can not reflect true distribution in reality
- Do SMOTE oversampling on trainin data
- Sampling_strategy = number of samples in the majority class is the same as the number of samples in the minority class after resampling

In [0]:
smote = SMOTE(random_state=12)
x_train_sm, y_train_sm = smote.fit_sample(X_train, y_train)

unique, counts = np.unique(y_train_sm, return_counts=True)

print("X_train_sm.shape:", x_train_sm.shape)
print(np.asarray((unique, counts)).T)



X_train_sm.shape: (219144, 19)
[[     0 109572]
 [     1 109572]]


In [0]:
#RF on smoted training data
classifier_RF_sm = RandomForestClassifier(random_state=0)

classifier_RF_sm.fit(x_train_sm, y_train_sm)

# predict class labels for the test set
predicted_sm = classifier_RF_sm.predict(X_test)

# generate class probabilities
probs_sm = classifier_RF_sm.predict_proba(X_test)

# generate evaluation metrics
print("%s: %r" % ("accuracy_score_sm is: ", accuracy_score(y_test, predicted_sm)))
print("%s: %r" % ("roc_auc_score_sm is: ", roc_auc_score(y_test, probs_sm[:, 1])))
print("%s: %r" % ("f1_score_sm is: ", f1_score(y_test, predicted_sm )))#string to int

print ("confusion_matrix_sm is: ")
cm_sm = confusion_matrix(y_test, predicted_sm)
cmDF = pd.DataFrame(cm_sm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print('recall or sens_sm =',float(cm_sm[1,1])/(cm_sm[1,0]+cm_sm[1,1]))
print('precision_sm =', float(cm_sm[1,1])/(cm_sm[1,1] + cm_sm[0,1]))

#Here, no significant difference compared with no SMOTE

accuracy_score_sm is: : 0.9948330683624801
roc_auc_score_sm is: : 0.7668476993548746
f1_score_sm is: : 0.6697459584295612
confusion_matrix_sm is: 
        pred_0  pred_1
true_0   27388       1
true_1     142     145
recall or sens_sm = 0.5052264808362369
precision_sm = 0.9931506849315068


## XGBoost Model

In [0]:
import xgboost as xgb
classifier_XGB = xgb.XGBClassifier(random_state=0, learning_rate=0.1)
classifier_XGB.fit(X_train, y_train)

# predict class labels 0/1 for the test set
predicted = classifier_XGB.predict(X_test)

# generate class probabilities, used for roc_auc_score
probs = classifier_XGB.predict_proba(X_test)

# generate evaluation metrics
print("%s: %r" % ("accuracy_score is: ", accuracy_score(y_test, predicted)))
print("%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, probs[:, 1])))
print("%s: %r" % ("f1_score is: ", f1_score(y_test, predicted )))#string to int

print ("confusion_matrix is: ")
cm = confusion_matrix(y_test, predicted)
cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
print(cmDF)
print('recall =',float(cm[1,1])/(cm[1,0]+cm[1,1]))
print('precision =', float(cm[1,1])/(cm[1,1] + cm[0,1]))#1.0


accuracy_score is: : 0.9948692007515537
roc_auc_score is: : 0.7949134695469569
f1_score is: : 0.6712962962962962
confusion_matrix is: 
        pred_0  pred_1
true_0   27389       0
true_1     142     145
recall = 0.5052264808362369
precision = 1.0


In [0]:
classifier_XGB

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

# Parameter tuning by GridSearchCV

Eval metrics for GridSearchCV over all fits upon combination of parameters and cv

In [0]:
# Eval metrics to be calculated for each combination of parameters and cv, they are all calculated(by scoring), but not necessarily used to pick the optimal(by refit)
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'f1_score': make_scorer(f1_score, pos_label=1)
}

In [0]:
def grid_search_wrapper(model, parameters, refit_score='f1_score'):
    """
    fits a GridSearchCV classifier using refit_score for optimization(refit on the best model according to refit_score)
    prints classifier performance metrics
    """

    grid_search = GridSearchCV(model, parameters, scoring=scorers, refit=refit_score,
                           cv=3, return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # make the predictions
    y_pred = grid_search.predict(X_test)
    y_prob = grid_search.predict_proba(X_test)[:, 1]
    
    print('Best params for {}'.format(refit_score))
    print(grid_search.best_params_)

    # confusion matrix on the test data.
    print('\nConfusion matrix optimized for {} on the train data:'.format(refit_score))
    y_train_pred = grid_search.predict(X_train)
    cm = confusion_matrix(y_train, y_train_pred)
    cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
    print(cmDF)

    # confusion matrix on the test data.
    print('\nConfusion matrix optimized for {} on the test data:'.format(refit_score))
    cm = confusion_matrix(y_test, y_pred)
    cmDF = pd.DataFrame(cm, columns=['pred_0', 'pred_1'], index=['true_0', 'true_1'])
    print(cmDF)
    
    print("\t%s: %r" % ("roc_auc_score is: ", roc_auc_score(y_test, y_prob)))
    print("\t%s: %r" % ("f1_score is: ", f1_score(y_test, y_pred)))#string to int

    print('recall = ', float(cm[1,1]) / (cm[1,0] + cm[1,1]))
    print('precision = ', float(cm[1,1]) / (cm[1, 1] + cm[0,1]))

    return grid_search

##Optimizing on f1_score on LR

In [0]:
# C: inverse of regularization strength, smaller values specify stronger regularization
LRGrid = {"C" : np.logspace(-2,2,5), "penalty":["l1","l2"]}# l1 lasso l2 ridge
#param_grid = {'C': [0.01, 0.1, 1, 10, 100], 'penalty': ['l1', 'l2']}
logRegModel = LogisticRegression(random_state=0)

grid_search_LR_f1 = grid_search_wrapper(logRegModel, LRGrid, refit_score='f1_score')

Best params for f1_score
{'C': 0.01, 'penalty': 'l2'}

Confusion matrix optimized for f1_score on the train data:
        pred_0  pred_1
true_0  109572       0
true_1    1128       0

Confusion matrix optimized for f1_score on the test data:
        pred_0  pred_1
true_0   27389       0
true_1     287       0
	roc_auc_score is: : 0.7505919045045043
	f1_score is: : 0.0
recall =  0.0
precision =  nan




### Optimizing on f1_score on RF

In [0]:
parameters = {        
'max_depth': [None, 5, 15],
'n_estimators' :  [10,150],
'class_weight' : [{0: 1, 1: w} for w in [0.2, 1, 100]]
}

clf = RandomForestClassifier(random_state=0)

In [0]:
grid_search_rf_f1 = grid_search_wrapper(clf, parameters, refit_score='f1_score')

Best params for f1_score
{'class_weight': {0: 1, 1: 0.2}, 'max_depth': None, 'n_estimators': 150}

Confusion matrix optimized for f1_score on the train data:
        pred_0  pred_1
true_0  109572       0
true_1       0    1128

Confusion matrix optimized for f1_score on the test data:
        pred_0  pred_1
true_0   27389       0
true_1     142     145
	roc_auc_score is: : 0.774580908966353
	f1_score is: : 0.6712962962962962
recall =  0.5052264808362369
precision =  1.0


In [0]:
best_rf_model_f1 = grid_search_rf_f1.best_estimator_
best_rf_model_f1

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1, 1: 0.2}, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=150, n_jobs=None, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

In [0]:
results_f1 = pd.DataFrame(grid_search_rf_f1.cv_results_)
results_sortf1 = results_f1.sort_values(by='mean_test_f1_score', ascending=False)
results_sortf1[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_f1_score', 'mean_train_precision_score', 'mean_train_recall_score', 'mean_train_f1_score','param_max_depth', 'param_class_weight', 'param_n_estimators']].round(3).head()

Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_f1_score,mean_train_precision_score,mean_train_recall_score,mean_train_f1_score,param_max_depth,param_class_weight,param_n_estimators
9,1.0,0.527,0.69,1.0,0.527,0.69,5.0,"{0: 1, 1: 1}",150
7,1.0,0.527,0.69,1.0,1.0,1.0,,"{0: 1, 1: 1}",150
13,1.0,0.527,0.69,1.0,1.0,1.0,,"{0: 1, 1: 100}",150
3,1.0,0.527,0.69,1.0,0.527,0.69,5.0,"{0: 1, 1: 0.2}",150
11,1.0,0.527,0.69,1.0,0.586,0.739,15.0,"{0: 1, 1: 1}",150


##Check feature importance
Find out the most importance features are:
- interval_after_signup
- purchase_days_of_year

In [0]:
#check feature importance
pd.DataFrame(best_rf_model_f1.feature_importances_, index = X_train.columns, columns=['importance']).sort_values('importance', ascending=False)

Unnamed: 0,importance
interval_after_signup,0.398899
purchase_days_of_year,0.137748
purchase_seconds_of_day,0.079756
signup_seconds_of_day,0.079487
signup_days_of_year,0.057544
n_ip_shared,0.051497
purchase_value,0.044803
n_dev_shared,0.039265
age,0.037788
n_country_shared,0.027534


##Optimizing recall_score on RF

In [0]:
grid_search_rf_recall = grid_search_wrapper(clf, parameters, refit_score='recall_score')

Best params for recall_score
{'class_weight': {0: 1, 1: 100}, 'max_depth': 5, 'n_estimators': 150}

Confusion matrix optimized for recall_score on the train data:
        pred_0  pred_1
true_0  105337    4235
true_1     388     740

Confusion matrix optimized for recall_score on the test data:
        pred_0  pred_1
true_0   27146     243
true_1     132     155
	roc_auc_score is: : 0.7920126890382886
	f1_score is: : 0.4525547445255475
recall =  0.5400696864111498
precision =  0.38944723618090454


In [0]:
best_RF_model_recall = grid_search_rf_recall.best_estimator_
best_RF_model_recall

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                       class_weight={0: 1, 1: 100}, criterion='gini',
                       max_depth=5, max_features='auto', max_leaf_nodes=None,
                       max_samples=None, min_impurity_decrease=0.0,
                       min_impurity_split=None, min_samples_leaf=1,
                       min_samples_split=2, min_weight_fraction_leaf=0.0,
                       n_estimators=150, n_jobs=None, oob_score=False,
                       random_state=0, verbose=0, warm_start=False)

In [0]:
# predict class labels for the test set
predictedBest_recall = best_RF_model_recall.predict(X_test)

# generate class probabilities
probsBest_recall = best_RF_model_recall.predict_proba(X_test)

results_recall = pd.DataFrame(grid_search_rf_recall.cv_results_)# recall score is different from above, as above is metric on test data, this is performance on cv data
results_sortrecall = results_recall.sort_values(by='mean_test_recall_score', ascending=False)
results_sortrecall[['mean_test_precision_score', 'mean_test_recall_score', 'mean_test_f1_score', 'mean_train_precision_score', 'mean_train_recall_score', 'mean_train_f1_score','param_max_depth', 'param_class_weight', 'param_n_estimators']].round(3).head()

Unnamed: 0,mean_test_precision_score,mean_test_recall_score,mean_test_f1_score,mean_train_precision_score,mean_train_recall_score,mean_train_f1_score,param_max_depth,param_class_weight,param_n_estimators
15,0.158,0.635,0.253,0.163,0.656,0.262,5.0,"{0: 1, 1: 100}",150
14,0.162,0.632,0.258,0.166,0.652,0.264,5.0,"{0: 1, 1: 100}",10
16,0.751,0.528,0.618,0.842,0.816,0.826,15.0,"{0: 1, 1: 100}",10
17,0.965,0.527,0.682,0.983,0.833,0.902,15.0,"{0: 1, 1: 100}",150
1,1.0,0.527,0.69,1.0,1.0,1.0,,"{0: 1, 1: 0.2}",150


##Optimizing on recall_score on XGB

In [0]:
parameters = {'max_depth': [1, 5, 10],
              'n_estimators' :  [5, 10, 30],
              'learning_rate' : [0.5, 0.1,0.05]
              }
classifier_XGB = xgb.XGBClassifier(random_state=0)
grid_search_xgb_recall = grid_search_wrapper(classifier_XGB, parameters, refit_score='recall_score')




Best params for recall_score
{'learning_rate': 0.5, 'max_depth': 1, 'n_estimators': 5}

Confusion matrix optimized for recall_score on the train data:
        pred_0  pred_1
true_0  109572       0
true_1     534     594

Confusion matrix optimized for recall_score on the test data:
        pred_0  pred_1
true_0   27389       0
true_1     142     145
	roc_auc_score is: : 0.7526132404181185
	f1_score is: : 0.6712962962962962
recall =  0.5052264808362369
precision =  1.0


# Finding and recommentation

 What kinds of users are more likely to be classified as at risk?

In [0]:
#for task 3, based on the above var importance
trainDF = pd.concat([X_train, y_train], axis=1)
pd.crosstab(trainDF["n_dev_shared"],trainDF["class"])

#Fingdings: the larger n_dev_shared, the higher rate of fraud

class,0,1
n_dev_shared,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,104966,461
0.2,4403,371
0.4,152,172
0.6,37,87
0.8,13,32
1.0,1,5


In [0]:
fraud_data.groupby("class")[['interval_after_signup']].mean()
#interval_after_signup on frauds are significantly lower compared to normal cases

Unnamed: 0_level_0,interval_after_signup
class,Unnamed: 1_level_1
0,5191179.0
1,2570226.0


In [0]:
fraud_data.groupby("class")[['interval_after_signup']].median()
#more than half of fraud happened 1s after signed up

Unnamed: 0_level_0,interval_after_signup
class,Unnamed: 1_level_1
0,5194911.0
1,1.0


In [0]:
fraud_data[fraud_data['class'] == 1].groupby("interval_after_signup").count()
#Findings: interval_after_signup of more than half of fraud is 1 second, means happend 1 second after signed up!

Unnamed: 0_level_0,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,signup_days_of_year,signup_seconds_of_day,purchase_days_of_year,purchase_seconds_of_day
interval_after_signup,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
1.0,739,739,739,739,739,739,739,739,739,739,739,739,739
54963.0,1,1,1,1,1,1,1,1,1,1,1,1,1
87771.0,1,1,1,1,1,1,1,1,1,1,1,1,1
113412.0,1,1,1,1,1,1,1,1,1,1,1,1,1
121448.0,1,1,1,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
10292598.0,1,1,1,1,1,1,1,1,1,1,1,1,1
10295716.0,1,1,1,1,1,1,1,1,1,1,1,1,1
10312138.0,1,1,1,1,1,1,1,1,1,1,1,1,1
10350095.0,1,1,1,1,1,1,1,1,1,1,1,1,1


How to use the predictions from the model?

In [0]:
#probsBest_recall: class probabilities from the best RF model with optimized recall
t = (10 * probsBest_recall[:, 1]).astype(int)  # 10* for better display
unique, counts = np.unique(t, return_counts=True)
print(np.asarray((unique, counts)).T)

#Recommendation
      #green: 1 - 3 pass
      #grey: 4-7 need manual investigation
      #red: 8,9 decline

[[    2 24233]
 [    3  2947]
 [    4    98]
 [    5   187]
 [    6    66]
 [    7     1]
 [    8    17]
 [    9   127]]
