In [58]:
import pandas as pd
import numpy as np
from spacy.lang.en import English
import spacy
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import string
from spacy.lang.en.stop_words import STOP_WORDS

from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as smote_pipeline
from sklearn.model_selection import GridSearchCV


In [2]:
df = pd.read_json('./../data/data.json')
df

Unnamed: 0,acct_type,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,fraudster_event,1266062400,3852,5,US,USD,0.0,"<p><a href=""http://s432.photobucket.com/albums...",gmail.com,1262739706,...,"[{'event_id': 527017, 'cost': 25.0, 'availabil...",36,1259613950,1,717 Washington Avenue,US,25.777471,-80.133433,INK Nightclub - South Beach,FL
1,premium,1296720000,3499,0,US,USD,1.0,"<p>Join us for a quick, one-night, community-b...",ruf.org,1293832670,...,"[{'event_id': 786878, 'cost': 35.0, 'availabil...",149,1280942776,3,,US,32.776566,-79.930922,"The Charleston, SC area",SC
2,premium,1296172800,2601,8,US,USD,1.0,"<h3><span class=""subcategory""><strong>Teacher ...",pvsd.k12.ca.us,1291090956,...,"[{'event_id': 787337, 'cost': 93.51, 'availabi...",214,1272559388,3,10100 Pioneer Blvd Suite 100,US,33.944201,-118.080419,Los Angeles County Office of Education,CA
3,premium,1388966400,12347,6,IE,EUR,1.0,"<p style=""margin-bottom: 1.3em; padding-bottom...",irishtabletennis.com,1360681570,...,"[{'event_id': 885645, 'cost': 25.0, 'availabil...",889,1283870102,3,,,,,,
4,premium,1297900800,2417,11,US,USD,0.0,<p>Writers and filmmakers need to understand t...,artsandbusinesscouncil.org,1291994666,...,"[{'event_id': 1114349, 'cost': 150.0, 'availab...",35,1288984065,3,One Marina Park Drive,US,42.353848,-71.044276,Fish & Richardson,MA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14332,fraudster_event,1361689200,667,0,US,USD,0.0,<p>Celebrates this wonderful holiday with the ...,yahoo.com,1360297993,...,"[{'event_id': 5456442, 'cost': 45.0, 'availabi...",0,1360297993,1,6100 North Charles Street,US,39.373780,-76.629921,Elkridge Club,MD
14333,premium,1365566400,5812,5,US,USD,1.0,"<table style=""width: 600px;"" border=""0"" cellsp...",me.com,1360367042,...,"[{'event_id': 5465112, 'cost': 90.0, 'availabi...",1374,1241625346,4,1775 E Mission Bay Dr,US,32.778906,-117.209791,Hilton San Diego Resort and Spa (Hilton Missio...,CA
14334,premium,1368781200,557,13,,USD,0.0,<p>What a way to start off your Mother's Day l...,yahoo.com,1360600330,...,"[{'event_id': 5465732, 'cost': 20.0, 'availabi...",84,1353386971,4,5991 Bullard Road,US,30.041819,-89.957130,Fleur De Lis Ballroom NUMBER 2,LA
14335,tos_warn,1361361600,10873,8,US,USD,0.0,"<p style=""font-family: 'Helvetica Neue', Helve...",velvetlist.com,1360376285,...,"[{'event_id': 5465810, 'cost': 79.33, 'availab...",812,1290204991,3,2297 Cedar Ave,US,40.862283,-73.911363,Salsa con fuego,NY


In [8]:
y = df['acct_type'].str.contains('fraud')
X = df.drop('acct_type', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X,
                                                   y,
                                                   test_size=0.20,
                                                   stratify=y)

In [39]:
punctuations = string.punctuation
nlp = spacy.load('en_core_web_lg')
stop_words = STOP_WORDS
parser = English()

In [40]:
def spacy_tokenizer(text):
    mytokens = parser(text)
    mytokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in mytokens]
    mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]
    return mytokens


In [46]:
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y=None, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}
    def clean_text(text):
        return text.strip().lower()

In [55]:
vectorizer = TfidfVectorizer(tokenizer=spacy_tokenizer,
                            ngram_range=(1, 2),
                            max_features=100000,
                            max_df=0.90)
vectorizer

TfidfVectorizer(max_df=0.9, max_features=100000, ngram_range=(1, 2),
                tokenizer=<function spacy_tokenizer at 0x7fd072062280>)

In [56]:
pipe = smote_pipeline(steps=[
    ('cleaner', predictors()),
    ('vectorizer', vectorizer),
    ('smoter', SMOTE(n_jobs=-1)),
    ('cls', MultinomialNB())
])
pipe

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7fcfb287db50>),
                ('vectorizer',
                 TfidfVectorizer(max_df=0.9, max_features=100000,
                                 ngram_range=(1, 2),
                                 tokenizer=<function spacy_tokenizer at 0x7fd072062280>)),
                ('smoter', SMOTE(n_jobs=-1)), ('cls', MultinomialNB())])

In [52]:
pipe.fit(X_train['description'], y_train)

KeyboardInterrupt: 

In [53]:
scores = cross_val_score(pipe, X_train['description'], y_train,
               scoring='f1', cv=5, n_jobs=-1)
print(scores.mean())
print(scores)

0.33686411282455453
[0.36220472 0.29394813 0.30853994 0.37325905 0.34636872]


In [57]:
#tf scores
scores = cross_val_score(pipe, X_train['description'], y_train,
               scoring='f1', cv=5, n_jobs=-1)
print(scores.mean())
print(scores)

0.36718502780029993
[0.34181818 0.34285714 0.35172414 0.38028169 0.41924399]


In [59]:
grid = {'vectorizer__max_df': [0.75, 0.80, 0.85, 0.90, 0.95],
       'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2), (2, 3)],
       'vectorizer__max_features': [1000, 5000, 10000, 100000]}

search = GridSearchCV(pipe, grid, scoring='f1', n_jobs=-1, cv=5)
search.fit(X_train['description'], y_train)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('cleaner',
                                        <__main__.predictors object at 0x7fcfb287db50>),
                                       ('vectorizer',
                                        TfidfVectorizer(max_df=0.9,
                                                        max_features=100000,
                                                        ngram_range=(1, 2),
                                                        tokenizer=<function spacy_tokenizer at 0x7fd072062280>)),
                                       ('smoter', SMOTE(n_jobs=-1)),
                                       ('cls', MultinomialNB())]),
             n_jobs=-1,
             param_grid={'vectorizer__max_df': [0.75, 0.8, 0.85, 0.9, 0.95],
                         'vectorizer__max_features': [1000, 5000, 10000,
                                                      100000],
                         'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2),
  

In [60]:
search.best_estimator_

Pipeline(steps=[('cleaner', <__main__.predictors object at 0x7fcf9662e910>),
                ('vectorizer',
                 TfidfVectorizer(max_df=0.95, max_features=100000,
                                 ngram_range=(2, 2),
                                 tokenizer=<function spacy_tokenizer at 0x7fd072062280>)),
                ('smoter', SMOTE(n_jobs=-1)), ('cls', MultinomialNB())])

In [61]:
search.best_score_

0.37679431798013263

In [62]:
search.best_params_

{'vectorizer__max_df': 0.95,
 'vectorizer__max_features': 100000,
 'vectorizer__ngram_range': (2, 2)}

In [63]:
np.mean([1, 2, 3])

2.0