In [106]:
import pandas as pd
import numpy as np
from encoder import Encoder

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.ensemble import (RandomForestClassifier, 
                              GradientBoostingClassifier, 
                              AdaBoostClassifier)

import seaborn as sns
sns.set(style="ticks", color_codes=True)
import matplotlib.pyplot as plt
%matplotlib inline

In [123]:
ip_data = pd.read_csv('data/IpAddress_to_Country.csv')

In [124]:
ip_data.tail()

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
138841,3758092000.0,3758093311,Hong Kong
138842,3758093000.0,3758094335,India
138843,3758095000.0,3758095871,China
138844,3758096000.0,3758096127,Singapore
138845,3758096000.0,3758096383,Australia


In [125]:
ip_data['ip_range'] = list(zip(ip_data.lower_bound_ip_address, ip_data.upper_bound_ip_address))

In [126]:
fraud_data = pd.read_csv('data/Fraud_Data.csv')

In [127]:
fraud_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151112 entries, 0 to 151111
Data columns (total 11 columns):
user_id           151112 non-null int64
signup_time       151112 non-null object
purchase_time     151112 non-null object
purchase_value    151112 non-null int64
device_id         151112 non-null object
source            151112 non-null object
browser           151112 non-null object
sex               151112 non-null object
age               151112 non-null int64
ip_address        151112 non-null float64
class             151112 non-null int64
dtypes: float64(1), int64(4), object(6)
memory usage: 12.7+ MB


In [122]:
fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


### Determine Country based on IP address

In [128]:
v = ip_data.loc[:, 'lower_bound_ip_address':'upper_bound_ip_address'].apply(tuple, 1).tolist()

In [129]:
idx = pd.IntervalIndex.from_tuples(v, closed='both')

In [130]:
v = ip_data.loc[:, 'lower_bound_ip_address':'upper_bound_ip_address'].apply(tuple, 1).tolist()
idx = pd.IntervalIndex.from_tuples(v, closed='both')
fraud_data['country'] = ip_data.loc[idx.get_indexer(fraud_data['ip_address'].values), 'country'].values

Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return getattr(section, self.name)[new_key]


In [131]:
fraud_data.iloc[17]

user_id                        119824
signup_time       2015-03-20 00:31:27
purchase_time     2015-04-05 07:31:46
purchase_value                     55
device_id               WFIIFCPIOGMHT
source                            Ads
browser                        Safari
sex                                 M
age                                38
ip_address                     131424
class                               0
country                           NaN
Name: 17, dtype: object

In [132]:
test = fraud_data.iloc[17]['ip_address']

In [133]:
list(ip_data[(ip_data['lower_bound_ip_address'] < test) & 
        (ip_data['upper_bound_ip_address'] > test)]['country'])

[]

### Clean Data 

In [202]:
fraud_data.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States


In [173]:
#fraud_data['country'].value_counts()

In [203]:
fraud_data['signup_time'] = pd.to_datetime(fraud_data['signup_time'], infer_datetime_format=True)

In [204]:
fraud_data['purchase_time'] = pd.to_datetime(fraud_data['purchase_time'], infer_datetime_format=True)

In [206]:
fraud_data['time_to_purchase'] = fraud_data['purchase_time'] - fraud_data['signup_time']

In [207]:
fraud_data['days_to_purchase'] = fraud_data['time_to_purchase'].apply(lambda x: x.days)

In [209]:
y_labels = fraud_data['class']

In [212]:
clean_fraud_data = fraud_data.drop(['class', 'user_id', 'device_id', 
                                    'time_to_purchase', 'signup_time', 
                                    'purchase_time', 'ip_address'], axis=1)

In [214]:
clean_fraud_data.head()

Unnamed: 0,purchase_value,source,browser,sex,age,country,days_to_purchase
0,34,SEO,Chrome,M,39,Japan,52
1,16,Ads,Chrome,F,53,United States,0
2,15,SEO,Opera,M,53,United States,0
3,44,SEO,Safari,M,41,,5
4,39,Ads,Safari,M,45,United States,50


### Train-Test-Split and Dummify

In [216]:
X_train, X_test, y_train, y_test = train_test_split(clean_fraud_data, y_labels, test_size=.3)

In [217]:
encoder = Encoder(thresh=400)

In [236]:
encoder.fit(X_train)

<encoder.Encoder at 0x13aef00b8>

In [237]:
encoded_Xt = encoder.transform(X_train)

In [238]:
encoded_Xt.columns

Index(['purchase_value', 'age', 'days_to_purchase', 'source_SEO', 'source_Ads',
       'browser_Chrome', 'browser_IE', 'browser_Safari', 'browser_FireFox',
       'sex_M', 'country_United States', 'country_China', 'country_Japan',
       'country_United Kingdom', 'country_Korea Republic of',
       'country_Germany', 'country_France', 'country_Brazil', 'country_Canada',
       'country_Italy', 'country_Australia', 'country_Netherlands',
       'country_Russian Federation', 'country_India',
       'country_Taiwan; Republic of China (ROC)', 'country_Sweden',
       'country_Mexico', 'country_Spain', 'country_South Africa',
       'country_Switzerland', 'country_Poland', 'country_Indonesia',
       'country_Argentina', 'country_Norway', 'country_Colombia'],
      dtype='object')

In [239]:
encoded_X_test = encoder.transform(X_test)

In [240]:
encoded_X_test.columns

Index(['purchase_value', 'age', 'days_to_purchase', 'source_SEO', 'source_Ads',
       'browser_Chrome', 'browser_IE', 'browser_Safari', 'browser_FireFox',
       'sex_M', 'country_United States', 'country_China', 'country_Japan',
       'country_United Kingdom', 'country_Korea Republic of',
       'country_Germany', 'country_France', 'country_Brazil', 'country_Canada',
       'country_Italy', 'country_Australia', 'country_Netherlands',
       'country_Russian Federation', 'country_India',
       'country_Taiwan; Republic of China (ROC)', 'country_Sweden',
       'country_Mexico', 'country_Spain', 'country_South Africa',
       'country_Switzerland', 'country_Poland', 'country_Indonesia',
       'country_Argentina', 'country_Norway', 'country_Colombia'],
      dtype='object')

### First Model: Logistic Regression 

In [241]:
lr = LogisticRegression()

In [242]:
lr.fit(encoded_Xt, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [246]:
y_preds = lr.predict_proba(encoded_Xt)[:, 1]

In [247]:
lr_y_preds = lr.predict_proba(encoded_X_test)[:, 1]

In [250]:
lr_log_loss = log_loss(y_test, lr_y_preds)
print(f"log loss for logistic regression model was {lr_log_loss}")

log loss for logistic regression model was 0.27655233904817683
