# Identifying Fraudulent Activities

In [1]:
import numpy as np
import pandas as pd

## Load Data

In [2]:
data = pd.read_csv('Fraud_Data.csv', parse_dates=['signup_time', 'purchase_time'])
data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:00,2015-04-18 02:47:00,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:00,2015-06-08 01:38:00,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:00,2015-01-01 18:52:00,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:00,2015-05-04 13:54:00,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:00,2015-09-09 18:40:00,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
 #   Column          Non-Null Count   Dtype         
---  ------          --------------   -----         
 0   user_id         151112 non-null  int64         
 1   signup_time     151112 non-null  datetime64[ns]
 2   purchase_time   151112 non-null  datetime64[ns]
 3   purchase_value  151112 non-null  int64         
 4   device_id       151112 non-null  object        
 5   source          151112 non-null  object        
 6   browser         151112 non-null  object        
 7   sex             151112 non-null  object        
 8   age             151112 non-null  int64         
 9   ip_address      151112 non-null  float64       
 10  class           151112 non-null  int64         
dtypes: datetime64[ns](2), float64(1), int64(4), object(4)
memory usage: 12.7+ MB


In [4]:
data.describe().iloc[:, 1:3]

Unnamed: 0,purchase_value,age
count,151112.0,151112.0
mean,36.935372,33.140704
std,18.322762,8.617733
min,9.0,18.0
25%,22.0,27.0
50%,35.0,33.0
75%,49.0,39.0
max,154.0,76.0


In [5]:
ip_country = pd.read_csv('IpAddress_to_Country.csv')
ip_country.head()

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


In [6]:
ip_country.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138846 entries, 0 to 138845
Data columns (total 3 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   lower_bound_ip_address  138846 non-null  float64
 1   upper_bound_ip_address  138846 non-null  int64  
 2   country                 138846 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 3.2+ MB


## Data Preprocessing

In [7]:
def match_country_to_ip(ip=None):
    match = (ip <= ip_country['upper_bound_ip_address']) & ( ip >= ip_country['lower_bound_ip_address'])
    if match.any():
        return ip_country['country'][match].values[0]
    else:
        return 'NA'

In [8]:
data['country'] = data['ip_address'].apply(lambda x: match_country_to_ip(x))

In [45]:
data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country,time_diff,device_num_x,ip_num_x,device_num_y,ip_num_y
0,22058,2015-02-24 22:55:00,2015-04-18 02:47:00,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,13920,1,1,1,1
1,333320,2015-06-07 20:39:00,2015-06-08 01:38:00,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,17940,1,1,1,1
2,1359,2015-01-01 18:52:00,2015-01-01 18:52:00,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States,0,12,12,12,12
3,150084,2015-04-28 21:13:00,2015-05-04 13:54:00,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,,60060,1,1,1,1
4,221365,2015-07-21 07:09:00,2015-09-09 18:40:00,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,41460,1,1,1,1


## Feature Engineering

In [46]:
time_diff = data['purchase_time'] - data['signup_time']
time_diff = time_diff.apply(lambda x: x.seconds)
data['time_diff'] = time_diff

In [47]:
device_num = data[['user_id', 'device_id']].groupby('device_id').count().reset_index()
device_num = device_num.rename(columns={'user_id': 'device_num'})
data = data.merge(device_num, how='left', on='device_id')

In [48]:
ip_num = data[['user_id', 'ip_address']].groupby('ip_address').count().reset_index()
ip_num = ip_num.rename(columns={'user_id': 'ip_num'})
data = data.merge(ip_num, how='left', on='ip_address')

In [49]:
features = ['purchase_value', 'source', 'browser', 'sex', 'age', 'country', 'time_diff', 'device_num', 'ip_num']
label = 'class'
X = data[features]
y = data[label]

In [50]:
X.head()

Unnamed: 0,purchase_value,source,browser,sex,age,country,time_diff,device_num,ip_num
0,34,SEO,Chrome,M,39,Japan,13920,1,1
1,16,Ads,Chrome,F,53,United States,17940,1,1
2,15,SEO,Opera,M,53,United States,0,12,12
3,44,SEO,Safari,M,41,,60060,1,1
4,39,Ads,Safari,M,45,United States,41460,1,1


In [51]:
X_num = X.drop(['source', 'browser', 'sex', 'country'], axis=1)
X_cat = X[['source', 'browser', 'sex', 'country']]

X_cat = pd.get_dummies(X_cat)
X = pd.concat([X_num, X_cat], axis=1)

X.head()

Unnamed: 0,purchase_value,age,time_diff,device_num,ip_num,source_Ads,source_Direct,source_SEO,browser_Chrome,browser_FireFox,...,country_United States,country_Uruguay,country_Uzbekistan,country_Vanuatu,country_Venezuela,country_Viet Nam,country_Virgin Islands (U.S.),country_Yemen,country_Zambia,country_Zimbabwe
0,34,39,13920,1,1,0,0,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,16,53,17940,1,1,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
2,15,53,0,12,12,0,0,1,0,0,...,1,0,0,0,0,0,0,0,0,0
3,44,41,60060,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,39,45,41460,1,1,1,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0


## Build Classifier

In [52]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.25)

In [56]:
from sklearn.ensemble import RandomForestClassifier

forest = RandomForestClassifier(n_estimators=500,
                                max_leaf_nodes=16,
                                bootstrap=True,
                                oob_score=True,
                                n_jobs=-1, random_state=42)
%time forest.fit(X_train, y_train)

CPU times: total: 1min 41s
Wall time: 22.5 s


RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, n_jobs=-1,
                       oob_score=True, random_state=42)

In [57]:
from sklearn.metrics import accuracy_score

accuracy_score(y_test, forest.predict(X_test))

0.9568796654137328

In [58]:
forest.feature_importances_

array([8.10466981e-03, 7.04791069e-03, 3.65058846e-01, 2.97946517e-01,
       2.68067768e-01, 1.80732067e-03, 3.14950908e-03, 1.53314030e-03,
       2.31649230e-03, 7.82171974e-04, 1.65717348e-03, 4.39021887e-04,
       8.85060376e-04, 1.56923228e-03, 2.11362294e-03, 0.00000000e+00,
       0.00000000e+00, 2.26137561e-05, 2.10026817e-07, 0.00000000e+00,
       6.52211120e-06, 6.33917533e-06, 6.86272246e-06, 7.69918192e-06,
       4.23862278e-06, 0.00000000e+00, 0.00000000e+00, 3.66039954e-05,
       0.00000000e+00, 1.73036611e-07, 1.14950228e-03, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 9.21678078e-04,
       0.00000000e+00, 3.25868034e-06, 0.00000000e+00, 6.49215716e-05,
       0.00000000e+00, 0.00000000e+00, 5.48908016e-05, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 6.34406029e-04,
       0.00000000e+00, 0.00000000e+00, 5.75347513e-04, 1.86080311e-04,
       9.05800241e-05, 0.00000000e+00, 0.00000000e+00, 2.84391792e-04,
      