In [63]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Load the transaction log and the IP-range -> country lookup table.
fraud, ip2country = (
    pd.read_csv(csv_name)
    for csv_name in ('Fraud_Data.csv', 'IpAddress_to_Country.csv')
)

Q1 Determine users' countries based on their IP addresses

In [3]:
# Peek at the last rows of the IP-range lookup table (lower/upper bound and country).
ip2country.tail()

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
138841,3758092000.0,3758093311,Hong Kong
138842,3758093000.0,3758094335,India
138843,3758095000.0,3758095871,China
138844,3758096000.0,3758096127,Singapore
138845,3758096000.0,3758096383,Australia


In [4]:
# Preview the first rows of the raw transaction data.
fraud.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [5]:
def lookup_countries(ip_addresses, ip_ranges, not_found=''):
    """Map each IP address to the country whose range contains it.

    An address is assigned a country only when exactly one row of
    ``ip_ranges`` satisfies ``lower_bound_ip_address <= ip <=
    upper_bound_ip_address``; zero or several matches yield ``not_found``.
    This is the same rule as a row-by-row boolean scan of the range table,
    but it uses a sorted binary search instead of one full table scan per
    address.

    Parameters
    ----------
    ip_addresses : array-like of numbers
        Numeric IP addresses to resolve (compared as float64).
    ip_ranges : pandas.DataFrame
        Must provide the columns ``lower_bound_ip_address``,
        ``upper_bound_ip_address`` and ``country``.
    not_found : object, default ''
        Placeholder for addresses without exactly one matching range.

    Returns
    -------
    list
        One entry per input address, in input order.
    """
    # Rows with a missing bound or lower > upper can never satisfy
    # `lower <= ip <= upper`, so dropping them does not change any result
    # and makes the counting identity below exact.
    usable = ip_ranges[ip_ranges['lower_bound_ip_address']
                       <= ip_ranges['upper_bound_ip_address']]
    usable = usable.sort_values('lower_bound_ip_address', kind='mergesort')
    lower = usable['lower_bound_ip_address'].to_numpy(dtype='float64')
    upper = usable['upper_bound_ip_address'].to_numpy(dtype='float64')
    names = usable['country'].to_numpy(dtype=object)
    queries = np.asarray(ip_addresses, dtype='float64')

    found = np.full(len(queries), not_found, dtype=object)
    if len(lower) == 0 or len(queries) == 0:
        return found.tolist()

    # (#ranges with lower <= ip) - (#ranges with upper < ip) is the number of
    # ranges containing ip, because every kept range has lower <= upper.
    n_started = np.searchsorted(lower, queries, side='right')
    n_ended = np.searchsorted(np.sort(upper), queries, side='left')
    unique_hit = (n_started - n_ended) == 1

    # Fast path: the range with the greatest lower bound <= ip is the match.
    candidate = np.maximum(n_started - 1, 0)
    direct = unique_hit & (queries <= upper[candidate])
    found[direct] = names[candidate[direct]]

    # Slow path, only reached when ranges overlap: exactly one range matches
    # but it is not the candidate, so locate it with an explicit scan.
    for pos in np.flatnonzero(unique_hit & ~direct):
        inside = (lower <= queries[pos]) & (queries[pos] <= upper)
        found[pos] = names[inside][0]

    return found.tolist()


ips = fraud['ip_address']
countries = lookup_countries(ips, ip2country)

In [6]:
# Attach the resolved countries (one entry per row, '' when no single range matched).
fraud['country'] = countries

In [7]:
# Ten most frequent country labels (the blank label is the "no match" bucket).
rows_per_country = fraud.groupby('country').size()
rows_per_country.nlargest(10)

country
United States        58049
                     21966
China                12038
Japan                 7306
United Kingdom        4490
Korea Republic of     4162
Germany               3646
France                3161
Canada                2975
Brazil                2961
dtype: int64

In [8]:
# Check that the new `country` column is present alongside the original fields.
fraud.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States


Q2 Build a model to predict whether an activity is fraudulent or not. Explain how different assumptions about the cost of false positives vs false negatives would impact the model.

In [9]:
# Feature engineering: time elapsed between signup_time and purchase_time, and how many rows share each device_id and each ip_address.

In [10]:
from datetime import datetime

In [11]:
# Parse both timestamp columns so they can be subtracted from each other.
for time_col in ('signup_time', 'purchase_time'):
    fraud[time_col] = pd.to_datetime(fraud[time_col])

In [12]:
# Seconds elapsed between account creation and the purchase.
signup_to_purchase = fraud['purchase_time'].sub(fraud['signup_time'])
fraud['purchase_signup_diff'] = signup_to_purchase.dt.total_seconds()

In [13]:
# Add sharing counts (rows per device_id / per ip_address), give the blank
# country a readable label, then drop identifier and raw timestamp columns.
fraud = (
    fraud
    .assign(
        device_id_count=fraud['device_id'].map(fraud['device_id'].value_counts()),
        ip_address_count=fraud['ip_address'].map(fraud['ip_address'].value_counts()),
        country=fraud['country'].replace('', "Not_found"),
    )
    .drop(columns=['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address'])
)

In [27]:
# Inspect the feature table after adding the engineered columns and dropping the identifiers.
fraud.head()

Unnamed: 0,purchase_value,source,browser,sex,age,class,country,purchase_signup_diff,device_id_count,ip_address_count
0,34,SEO,Chrome,M,39,0,Japan,4506682.0,1,1
1,16,Ads,Chrome,F,53,0,United States,17944.0,1,1
2,15,SEO,Opera,M,53,1,United States,1.0,12,12
3,44,SEO,Safari,M,41,0,Not_found,492085.0,1,1
4,39,Ads,Safari,M,45,0,United States,4361461.0,1,1


In [35]:
# Keep the 50 most frequent country labels; every label ranked below them is
# relabelled 'Others' so one-hot encoding does not create a column per rare country.
country_sizes = fraud.groupby('country').size().sort_values(ascending=False)
rare_countries = country_sizes.iloc[50:].index
relabel = {rare_name: 'Others' for rare_name in rare_countries}
fraud = fraud.replace({'country': relabel})
# One-hot encode the categorical columns, dropping the first level of each.
fraud_rf = pd.get_dummies(fraud, drop_first=True)

In [58]:
# Separate the target from the features, then hold out 34% of rows for testing.
X, y = fraud_rf.drop(columns=['class']), fraud_rf['class']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.34,
    random_state=42,
)

In [59]:
# 100-tree random forest; oob_score=True keeps an out-of-bag estimate after fitting.
forest_params = dict(n_estimators=100, oob_score=True, random_state=0)
forest = RandomForestClassifier(**forest_params)
forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=True, random_state=0, verbose=0, warm_start=False)

In [61]:
# OOB result: score estimated from the out-of-bag training samples
# (populated because the forest was created with oob_score=True).
forest.oob_score_

0.9550600102273069

In [78]:
# Test result: mean accuracy of the fitted forest on the held-out split.
forest.score(X_test, y_test)

0.9563829580178672

In [68]:
# Hard class predictions for the held-out rows, used for the confusion matrix.
y_pred = forest.predict(X_test)

In [71]:
# Cross-tabulate true vs. predicted classes on the test split.
result = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [77]:
# Rows are true classes and columns are predicted classes (scikit-learn's
# confusion_matrix convention), i.e. [[TN, FP], [FN, TP]] when the labels are 0 and 1.
result

array([[46535,    88],
       [ 2153,  2603]])

In [None]:
# Off-diagonal cells of the confusion matrix: row 0 / column 1 holds the
# false positives, row 1 / column 0 holds the false negatives.
fp, fn = result[0, 1], result[1, 0]