In [57]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (RandomForestClassifier, 
                            GradientBoostingClassifier, 
                            AdaBoostClassifier)
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

from imblearn.over_sampling import SMOTE

Using TensorFlow backend.


In [14]:
df = pd.read_pickle('../data/clean/clean.pkl')
df = df.drop(['description', 'email_domain'], axis=1)

In [15]:
df.columns

Index(['body_length', 'fb_published', 'has_analytics', 'has_logo', 'listed',
       'name_length', 'num_payouts', 'org_facebook', 'org_twitter', 'show_map',
       'user_age', 'fraud', 'time_to_event', 'event_length', 'channels__0',
       'channels__4', 'channels__5', 'channels__6', 'channels__7',
       'channels__8', 'channels__9', 'channels__10', 'channels__11',
       'channels__12', 'channels__13', 'delivery_method__0.0',
       'delivery_method__1.0', 'delivery_method__3.0', 'user_type__1',
       'user_type__2', 'user_type__3', 'user_type__4', 'user_type__5',
       'user_type__103', 'currency__AUD', 'currency__CAD', 'currency__EUR',
       'currency__GBP', 'currency__MXN', 'currency__NZD', 'currency__USD',
       'payout_type__', 'payout_type__ACH', 'payout_type__CHECK',
       'pop_country', 'country_is_null', 'venue_country_is_null',
       'country_is_venue', 'avg_ticket_price', 'tot_ticket_available',
       'avg_cost_per_ticket'],
      dtype='object')

In [25]:
df.time_to_event = df.time_to_event.astype('int')
df.event_length = df.event_length.astype('int')

In [26]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14337 entries, 0 to 14336
Data columns (total 51 columns):
body_length              14337 non-null int64
fb_published             14337 non-null int64
has_analytics            14337 non-null int64
has_logo                 14337 non-null int64
listed                   14337 non-null int64
name_length              14337 non-null int64
num_payouts              14337 non-null int64
org_facebook             14337 non-null float64
org_twitter              14337 non-null float64
show_map                 14337 non-null int64
user_age                 14337 non-null int64
fraud                    14337 non-null bool
time_to_event            14337 non-null int64
event_length             14337 non-null int64
channels__0              14337 non-null uint8
channels__4              14337 non-null uint8
channels__5              14337 non-null uint8
channels__6              14337 non-null uint8
channels__7              14337 non-null uint8
channels__8   

In [37]:
y = df['fraud'].values
df = df.drop('fraud', axis=1)
X = df.values

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.20, stratify=y)


In [72]:
X_resampled, y_resampled = SMOTE().fit_resample(X, y)
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=.20)

In [69]:
from collections import Counter
print(sorted(Counter(y_resampled).items()))

[(False, 13044), (True, 13044)]


In [73]:
rf = RandomForestClassifier(n_jobs=-1)
rf.fit(X_train, y_train)


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [74]:
predictions = rf.predict(X_test)
acc = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
print(f'Accuracy score: {acc}\n Precision score: {precision}\n Recall score: {recall}')


Accuracy score: 0.9900344959754696
 Precision score: 0.9933749025720966
 Recall score: 0.9864551083591331


In [91]:
order = np.argsort(rf.feature_importances_)[::-1]
cols = df.columns
for o in order:
    print(f'{cols[o]} importances is: {rf.feature_importances_[o]}')


# cols = df.columns
# for col, importance in zip(cols, rf.feature_importances_):
#     print(f'{col} importance: {importance} \n')

time_to_event importances is: 0.135273879226691
payout_type__ importances is: 0.11765416829583537
user_type__1 importances is: 0.10545234547042938
num_payouts importances is: 0.0747474797490158
user_age importances is: 0.05692771027614728
delivery_method__0.0 importances is: 0.05659007105058648
country_is_venue importances is: 0.049479635440193695
user_type__3 importances is: 0.03841848606485981
avg_ticket_price importances is: 0.03677892906118678
delivery_method__1.0 importances is: 0.03580300840594217
org_facebook importances is: 0.024570963153684507
has_logo importances is: 0.02358465392946967
org_twitter importances is: 0.02342338416156829
channels__0 importances is: 0.020662098339148356
venue_country_is_null importances is: 0.019347970008397445
payout_type__ACH importances is: 0.017702253583629677
payout_type__CHECK importances is: 0.017648246602926018
avg_cost_per_ticket importances is: 0.016024948287948038
body_length importances is: 0.014626683460324816
event_length importances

In [75]:
gbc = GradientBoostingClassifier()
gbc.fit(X_train, y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=100,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=None, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

In [76]:
predictions = gbc.predict(X_test)
acc = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
print(f'Accuracy score: {acc}\n Precision score: {precision}\n Recall score: {recall}')


Accuracy score: 0.9842851667305481
 Precision score: 0.9882903981264637
 Recall score: 0.9798761609907121


In [77]:
abc = AdaBoostClassifier()
abc.fit(X_train, y_train)

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [78]:
predictions = abc.predict(X_test)
acc = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
print(f'Accuracy score: {acc}\n Precision score: {precision}\n Recall score: {recall}')

Accuracy score: 0.9783441931774627
 Precision score: 0.977211278485902
 Recall score: 0.9791021671826625


In [79]:
logit = LogisticRegression(n_jobs=-1)
logit.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=-1, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [80]:
predictions = logit.predict(X_test)
acc = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
print(f'Accuracy score: {acc}\n Precision score: {precision}\n Recall score: {recall}')


Accuracy score: 0.5132234572633193
 Precision score: 0.664179104477612
 Recall score: 0.03444272445820434


In [81]:
svc = SVC()
svc.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)

In [82]:
predictions = svc.predict(X_test)
acc = accuracy_score(y_test, predictions)
precision = precision_score(y_test, predictions)
recall = recall_score(y_test, predictions)
print(f'Accuracy score: {acc}\n Precision score: {precision}\n Recall score: {recall}')

Accuracy score: 0.7972403219624378
 Precision score: 0.8468181818181818
 Recall score: 0.7209752321981424


In [99]:
import psycopg2 as pg2

In [165]:
conn.close()

In [166]:
conn = pg2.connect(dbname='predictions', user='postgres', host='localhost', port='5432')
cur = conn.cursor()

In [167]:
query = '''
        CREATE TABLE fraud_predictions2(
        index numeric NOT NULL,
        predictions numeric NOT NULL,
        test_var numeric);
        '''
cur.execute(query)

In [158]:
query3 = "INSERT INTO fraud_predictions (predictions, test_var) VALUES (100, 102), (92, 87);"
cur.execute(query3)
conn.commit()


In [174]:
query2 = '''
        SELECT * FROM fraud_predictions;'''
cur.execute(query2)


In [175]:
for r in cur.fetchall():
    print(r)

(Decimal('100'), Decimal('102'))
(Decimal('92'), Decimal('87'))
(Decimal('100'), Decimal('102'))
(Decimal('92'), Decimal('87'))
(Decimal('100'), Decimal('102'))
(Decimal('92'), Decimal('87'))
(Decimal('100'), Decimal('102'))
(Decimal('92'), Decimal('87'))
(Decimal('100'), Decimal('102'))
(Decimal('92'), Decimal('87'))
(Decimal('100'), Decimal('102'))
(Decimal('92'), Decimal('87'))
(Decimal('100'), Decimal('102'))
(Decimal('92'), Decimal('87'))
(Decimal('100'), Decimal('102'))
(Decimal('92'), Decimal('87'))


In [None]:
def add_data(hunters_df):
    engine = create_engine('postgres://postgres:mysecretpassword@localhost:5432')
    hunters_df.to_sql('fraud_predictions', index=False, con=engine, if_exists='append')

def retrieve_data():
    conn = pg2.connection(dbname='fraud', user='postgres', password='mysecretpassword', host='localhost', port='5432')
    cur = conn.cursor()
    query = "SELECT * FROM fraud_predictions"
    cur.execute(query)
    data = []
    for r in cur.fetchall()[::-1]:
        data.append(r)
    conn.close()
    
    return data