In [1]:
import numpy as np
import pandas as pd
import pprint as pp
from collections import Counter

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from imblearn.over_sampling import RandomOverSampler

from statsmodels.api import Logit
from statsmodels.tools.tools import add_constant

from bs4 import BeautifulSoup

In [2]:
'''
Read file into dataframe.
Check all information of the data.
'''
df = pd.read_json('data.json')

In [3]:
df.shape

(14337, 44)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14337 entries, 0 to 14336
Data columns (total 44 columns):
acct_type             14337 non-null object
approx_payout_date    14337 non-null int64
body_length           14337 non-null int64
channels              14337 non-null int64
country               14256 non-null object
currency              14337 non-null object
delivery_method       14321 non-null float64
description           14337 non-null object
email_domain          14337 non-null object
event_created         14337 non-null int64
event_end             14337 non-null int64
event_published       14238 non-null float64
event_start           14337 non-null int64
fb_published          14337 non-null int64
gts                   14337 non-null float64
has_analytics         14337 non-null int64
has_header            8928 non-null float64
has_logo              14337 non-null int64
listed                14337 non-null object
name                  14337 non-null object
name_length      

In [5]:
df.head()

Unnamed: 0,acct_type,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,fraudster_event,1266062400,3852,5,US,USD,0.0,"<p><a href=""http://s432.photobucket.com/albums...",gmail.com,1262739706,...,"[{u'event_id': 527017, u'cost': 25.0, u'availa...",36,1259613950,1,717 Washington Avenue,US,25.777471,-80.133433,INK Nightclub - South Beach,FL
1,premium,1296720000,3499,0,US,USD,1.0,"<p>Join us for a quick, one-night, community-b...",ruf.org,1293832670,...,"[{u'event_id': 786878, u'cost': 35.0, u'availa...",149,1280942776,3,,US,32.776566,-79.930922,"The Charleston, SC area",SC
2,premium,1296172800,2601,8,US,USD,1.0,"<h3><span class=""subcategory""><strong>Teacher ...",pvsd.k12.ca.us,1291090956,...,"[{u'event_id': 787337, u'cost': 93.51, u'avail...",214,1272559388,3,10100 Pioneer Blvd Suite 100,US,33.944201,-118.080419,Los Angeles County Office of Education,CA
3,premium,1388966400,12347,6,IE,EUR,1.0,"<p style=""margin-bottom: 1.3em; padding-bottom...",irishtabletennis.com,1360681570,...,"[{u'event_id': 885645, u'cost': 25.0, u'availa...",889,1283870102,3,,,,,,
4,premium,1297900800,2417,11,US,USD,0.0,<p>Writers and filmmakers need to understand t...,artsandbusinesscouncil.org,1291994666,...,"[{u'event_id': 1114349, u'cost': 150.0, u'avai...",35,1288984065,3,One Marina Park Drive,US,42.353848,-71.044276,Fish & Richardson,MA


In [6]:
df.acct_type.unique()

array([u'fraudster_event', u'premium', u'spammer_warn', u'fraudster',
       u'spammer_limited', u'spammer_noinvite', u'locked', u'tos_lock',
       u'tos_warn', u'fraudster_att', u'spammer_web', u'spammer'], dtype=object)

In [7]:
'''
Each transaction has a description.
Try print the description of our first transaction.
'''
pp.pprint(df.description[0])

u'<p><a href="http://s432.photobucket.com/albums/qq49/digusonline/?action=view&amp;current=supersunday.jpg" target="_blank"><img src="http://i432.photobucket.com/albums/qq49/digusonline/supersunday.jpg" border="0" alt="Photobucket" /></a></p>\r\n<p>\xa0</p>\r\n<p style="text-align: center;"><font size="3"><strong>Party Starz Entertaintment &amp; Diverse International Group Presents...<br /></strong></font><br /><font face="tahoma,arial,helvetica,sans-serif" size="4"><strong>The Official\xa0"99 Hour No Sleep" Super Bowl </strong></font></p>\r\n<p style="text-align: center;"><font face="tahoma,arial,helvetica,sans-serif" size="4"><strong>Weekend </strong></font><font face="tahoma,arial,helvetica,sans-serif" size="4"><strong>Grand Finale</strong></font></p>\r\n<p><br /><span style="font-family: Times New Roman,serif;"><font size="3"><font face="tahoma,arial,helvetica,sans-serif">No matter who wins or loses, this post-game party cannot be missed! Enjoy the drink specials and all night musi

In [8]:
df['has_header'].dtypes

dtype('float64')

In [9]:
'''
Focus on columns with numerical values.
'''
num = [col for col in df.columns if str(df[col].dtypes) in ['int64','float64']]

In [10]:
num

[u'approx_payout_date',
 u'body_length',
 u'channels',
 u'delivery_method',
 u'event_created',
 u'event_end',
 u'event_published',
 u'event_start',
 u'fb_published',
 u'gts',
 u'has_analytics',
 u'has_header',
 u'has_logo',
 u'name_length',
 u'num_order',
 u'num_payouts',
 u'object_id',
 u'org_facebook',
 u'org_twitter',
 u'sale_duration',
 u'sale_duration2',
 u'show_map',
 u'user_age',
 u'user_created',
 u'user_type',
 u'venue_latitude',
 u'venue_longitude']

In [11]:
'''
Create a label column that has boolean values for whether an action is fraud or not.
'''
ndf = df

In [12]:
ndf['label'] = ndf.acct_type.apply(lambda x: 1 if x in ['fraudster_event','fraudster','fraudster_att'] else 0)

In [14]:
'''
Split the data into training set and testing set.
Then oversample the fraud cases because this is a VERY unbalanced dataset.
'''
X_train, X_test, y_train, y_test = train_test_split(ndf[num], ndf['label'])

In [23]:
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

In [16]:
ros = RandomOverSampler(random_state=42)
X_ros, y_ros = ros.fit_sample(X_train, y_train)

In [17]:
y_ros.mean()
# Now balanced

0.5

In [18]:
print X_ros.shape, y_ros.shape

(19574, 27) (19574,)


In [19]:
X_ros = pd.DataFrame(X_ros, columns = X_train.columns)

In [20]:
'''
Build a random forest classifier model.
'''
model = RandomForestClassifier()

In [24]:
model.fit(X_ros, y_ros)
cross_val_score(model, X_test, y_test)

array([ 0.96822742,  0.96401674,  0.96314908])

In [25]:
cross_val_score(model, X_test, y_test, scoring='f1')

array([ 0.81339713,  0.75789474,  0.8       ])

In [26]:
cross_val_score(model, X_test, y_test, scoring='precision')

array([ 0.88764045,  0.82795699,  0.96103896])

In [27]:
cross_val_score(model, X_test, y_test, scoring='recall')

array([ 0.74545455,  0.71559633,  0.66972477])

In [None]:
# Random forest model has really good accuracy score!
# But recall score is not so good

In [28]:
'''
Now try a logistic regression model.
'''
log = LogisticRegression()

In [29]:
log.fit(X_ros, y_ros)
print 'Accuracy:', cross_val_score(log, X_test, y_test)
print 'f1:', cross_val_score(log, X_test, y_test, scoring='f1')
print 'Precision:', cross_val_score(log, X_test, y_test, scoring='precision')
print 'Recall:', cross_val_score(log, X_test, y_test, scoring='recall')

Accuracy: [ 0.91889632  0.91129707  0.91708543]
f1: [ 0.23622047  0.11666667  0.16806723]
Precision: [ 0.88235294  0.63636364  1.        ]
Recall: [ 0.13636364  0.06422018  0.09174312]


In [None]:
# Logistic regression model with current features is awful

In [31]:
'''
Look at feature importance from our random forest model.
'''
feat = model.fit(X_ros, y_ros).feature_importances_
sorted(zip(X_train.columns, feat), key= lambda x: x[1])[::-1]

[(u'sale_duration2', 0.18769256339798449),
 (u'sale_duration', 0.16602612879137046),
 (u'user_age', 0.1191321267335446),
 (u'gts', 0.097864227523428299),
 (u'num_order', 0.048402431089319513),
 (u'org_facebook', 0.037301215046494307),
 (u'body_length', 0.030158957200319143),
 (u'num_payouts', 0.02894737200055526),
 (u'name_length', 0.028554882200383463),
 (u'venue_longitude', 0.025594649661269165),
 (u'delivery_method', 0.025167632719196014),
 (u'event_published', 0.024787095792643203),
 (u'user_created', 0.024449636240561196),
 (u'venue_latitude', 0.024017021039345329),
 (u'user_type', 0.022074270581711825),
 (u'org_twitter', 0.020299407692757419),
 (u'object_id', 0.015927775804413485),
 (u'approx_payout_date', 0.015518712752445535),
 (u'event_start', 0.014993235992162285),
 (u'event_created', 0.013583042884630516),
 (u'event_end', 0.010774187833102297),
 (u'channels', 0.008224097475975186),
 (u'has_logo', 0.0053879890540278535),
 (u'has_header', 0.0018294456750781597),
 (u'show_map',

In [None]:
# Top features are: Transaction duration, user age, gts, number of orders, and org facebook 

In [34]:
ndf[num].describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
approx_payout_date,14337.0,1350933000.0,24013000.0,1171256000.0,1334221000.0,1355805000.0,1370646000.0,1523488000.0
body_length,14337.0,3672.522,5758.113,0.0,750.0,1990.0,4280.0,65535.0
channels,14337.0,6.176676,3.860333,0.0,5.0,6.0,8.0,13.0
delivery_method,14321.0,0.4395643,0.5914866,0.0,0.0,0.0,1.0,3.0
event_created,14337.0,1345825000.0,24041070.0,1170332000.0,1329754000.0,1351043000.0,1365346000.0,1382012000.0
event_end,14337.0,1350501000.0,24012990.0,1170824000.0,1333789000.0,1355373000.0,1370214000.0,1523056000.0
event_published,14238.0,1340854000.0,87503860.0,0.0,1329499000.0,1351099000.0,1365478000.0,1381608000.0
event_start,14337.0,1350205000.0,23987870.0,1170819000.0,1333406000.0,1355000000.0,1370048000.0,1418850000.0
fb_published,14337.0,0.1274325,0.3334685,0.0,0.0,0.0,0.0,1.0
gts,14337.0,2430.231,9142.308,0.0,116.41,431.93,1547.26,306293.9


In [35]:
'''
Now look at columns with texts.
'''
obj = [col for col in df.columns if col not in num]

In [36]:
len(obj)

18

In [37]:
obj

[u'acct_type',
 u'country',
 u'currency',
 u'description',
 u'email_domain',
 u'listed',
 u'name',
 u'org_desc',
 u'org_name',
 u'payee_name',
 u'payout_type',
 u'previous_payouts',
 u'ticket_types',
 u'venue_address',
 u'venue_country',
 u'venue_name',
 u'venue_state',
 'label']

In [38]:
'''
Look for unique values.
'''
uni = {}
for col in obj:
    if col not in ['previous_payouts','ticket_types']:
        uni[col] = len(df[col].unique())

In [39]:
uni

{u'acct_type': 12,
 u'country': 73,
 u'currency': 7,
 u'description': 13095,
 u'email_domain': 5772,
 'label': 2,
 u'listed': 2,
 u'name': 13940,
 u'org_desc': 5845,
 u'org_name': 9501,
 u'payee_name': 2481,
 u'payout_type': 3,
 u'venue_address': 10142,
 u'venue_country': 70,
 u'venue_name': 9788,
 u'venue_state': 443}

In [45]:
'''
Focus on description, use beautiful soup to parse the information.
'''
html_doc = df['description']

In [46]:
html_doc[0]

u'<p><a href="http://s432.photobucket.com/albums/qq49/digusonline/?action=view&amp;current=supersunday.jpg" target="_blank"><img src="http://i432.photobucket.com/albums/qq49/digusonline/supersunday.jpg" border="0" alt="Photobucket" /></a></p>\r\n<p>\xa0</p>\r\n<p style="text-align: center;"><font size="3"><strong>Party Starz Entertaintment &amp; Diverse International Group Presents...<br /></strong></font><br /><font face="tahoma,arial,helvetica,sans-serif" size="4"><strong>The Official\xa0"99 Hour No Sleep" Super Bowl </strong></font></p>\r\n<p style="text-align: center;"><font face="tahoma,arial,helvetica,sans-serif" size="4"><strong>Weekend </strong></font><font face="tahoma,arial,helvetica,sans-serif" size="4"><strong>Grand Finale</strong></font></p>\r\n<p><br /><span style="font-family: Times New Roman,serif;"><font size="3"><font face="tahoma,arial,helvetica,sans-serif">No matter who wins or loses, this post-game party cannot be missed! Enjoy the drink specials and all night musi

In [47]:
soup = BeautifulSoup(html_doc[0], 'html.parser')

In [48]:
print(soup.prettify())

<p>
 <a href="http://s432.photobucket.com/albums/qq49/digusonline/?action=view&amp;current=supersunday.jpg" target="_blank">
  <img alt="Photobucket" border="0" src="http://i432.photobucket.com/albums/qq49/digusonline/supersunday.jpg"/>
 </a>
</p>
<p>
</p>
<p style="text-align: center;">
 <font size="3">
  <strong>
   Party Starz Entertaintment &amp; Diverse International Group Presents...
   <br/>
  </strong>
 </font>
 <br/>
 <font face="tahoma,arial,helvetica,sans-serif" size="4">
  <strong>
   The Official "99 Hour No Sleep" Super Bowl
  </strong>
 </font>
</p>
<p style="text-align: center;">
 <font face="tahoma,arial,helvetica,sans-serif" size="4">
  <strong>
   Weekend
  </strong>
 </font>
 <font face="tahoma,arial,helvetica,sans-serif" size="4">
  <strong>
   Grand Finale
  </strong>
 </font>
</p>
<p>
 <br/>
 <span style="font-family: Times New Roman,serif;">
  <font size="3">
   <font face="tahoma,arial,helvetica,sans-serif">
    No matter who wins or loses, this post-game party

In [49]:
soup1 = BeautifulSoup(html_doc[1], 'html.parser')
print(soup1.prettify())

<p>
 Join us for a quick, one-night, community-building trip to Charleston, SC.
</p>
<p>
 <strong>
  COST:
 </strong>
</p>
<p>
 The trip costs a total of $35, which includes breakfast and lunch on Saturday and housing. You will be responsible to pay for your own dinner on Friday and Saturday. Early Bird tickets are available until noon on January 20 and include a great Winter Getaway T-Shirt. So don't wait to register!
</p>
<p>
 Registration ends at noon on January 27.
</p>
<p>
 A
 <em>
  very limited number
 </em>
 of scholarships are available. Contact your campus minister as soon as possible to see if you qualify.
</p>
<p>
 <strong>
  TRANSPORTATION:
 </strong>
</p>
<p>
 We are working to secure two fifteen passenger buses, which will be filled on a first-come, first-served basis. Payment must be received before a slot on a bus can be reserved. You will be responsible for your own transportation if a bus slot is not available.
</p>
<p>
 <strong>
  WHAT TO BRING:
 </strong>
</p>
<p>
