In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from src.pipeline_classes import Featurizer, Imputer, Standardizer, Dummifier
import src.model as model
import pickle

In [29]:
# unzip data.zip to inflate it into a .json file
!unzip data/data.zip
# move file from the working directory to the data subdirectory
!mv data.json data/

Archive:  data/data.zip
  inflating: data.json               


In [2]:
# load raw training data
X, y = model.load('data/data.json')

In [3]:
# look at data
X.head()

Unnamed: 0,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,event_published,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,3852,5,US,USD,0.0,"<p><a href=""http://s432.photobucket.com/albums...",gmail.com,1262739706,1265630400,1263110000.0,...,"[{'event_id': 527017, 'cost': 25.0, 'availabil...",36,1259613950,1,717 Washington Avenue,US,25.777471,-80.133433,INK Nightclub - South Beach,FL
1,3499,0,US,USD,1.0,"<p>Join us for a quick, one-night, community-b...",ruf.org,1293832670,1296288000,1293833000.0,...,"[{'event_id': 786878, 'cost': 35.0, 'availabil...",149,1280942776,3,,US,32.776566,-79.930922,"The Charleston, SC area",SC
2,2601,8,US,USD,1.0,"<h3><span class=""subcategory""><strong>Teacher ...",pvsd.k12.ca.us,1291090956,1295740800,1291092000.0,...,"[{'event_id': 787337, 'cost': 93.51, 'availabi...",214,1272559388,3,10100 Pioneer Blvd Suite 100,US,33.944201,-118.080419,Los Angeles County Office of Education,CA
3,12347,6,IE,EUR,1.0,"<p style=""margin-bottom: 1.3em; padding-bottom...",irishtabletennis.com,1360681570,1388534400,1360683000.0,...,"[{'event_id': 885645, 'cost': 25.0, 'availabil...",889,1283870102,3,,,,,,
4,2417,11,US,USD,0.0,<p>Writers and filmmakers need to understand t...,artsandbusinesscouncil.org,1291994666,1297468800,1291995000.0,...,"[{'event_id': 1114349, 'cost': 150.0, 'availab...",35,1288984065,3,One Marina Park Drive,US,42.353848,-71.044276,Fish & Richardson,MA


In [4]:
y.head()

0     True
1    False
2    False
3    False
4    False
Name: fraud, dtype: bool

## Lets build a pipeline and fit it!

## STEP 1: Create Pipeline

<img src='images/step1.png'/>

In [5]:
pipe = Pipeline([
        ('featurizer', Featurizer()),
        ('imputer', Imputer()),
        ('dummifier', Dummifier()),
        ('standardizer', Standardizer()),
        ('model', RandomForestClassifier(n_estimators=500, 
                                         max_depth=25))
        ])

## STEP 2: Fit the entire pipeline

<img src='images/step2.png'/>

In [6]:
# fit the classes and train the model
pipe.fit(X, y)

Pipeline(memory=None,
     steps=[('featurizer', Featurizer(cols=['body_length', 'channels', 'country', 'currency', 'description', 'email_domain', 'event_created', 'event_end', 'event_published', 'event_start', 'fb_published', 'has_analytics', 'has_header', 'has_logo', 'listed', 'name', 'name_length', 'object_id', 'org_desc',...obs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

## STEP 3: Deploy the pipeline

We now have a fit pipeline with a fit model!

<img src='images/step3.png'/>

### We can now do one of two things:
1. We can use the pipeline to tranform our data and use the trained model to make predictions, or
2. We can pickle our pipeline object and move it to another machine to be used to tranform data and make predictions.

### Option 1: tranform and predict

In [7]:
new_data = pd.read_csv('data/new_data.csv')

In [8]:
# lets look at the new raw data:
new_data.head(1)

Unnamed: 0,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,event_published,...,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state
0,432,0,US,USD,0.0,"<p><span><span class=""fsl"">LOUD Championship E...",gmail.com,1365694066,1369018800,1365694000.0,...,"[{'event_id': 6225359, 'cost': 20.0, 'availabi...",1155,1265937792,1,905 Atlantic ave.,US,40.68097,-73.962861,Freecandy,NY


<img src='images/step4.png'/>

In [9]:
predictions = pipe.predict_proba(new_data)

In [10]:
success_probability = predictions.T[1]

In [11]:
#show first 10 probabilities
success_probability[:10]

array([1.63490814e-02, 8.00000000e-03, 0.00000000e+00, 2.12971926e-05,
       1.07589090e-02, 2.00000000e-03, 1.38305932e-01, 0.00000000e+00,
       2.05402746e-03, 0.00000000e+00])

### Option 2: Pickle, Send, Unpickle, Transform, Predict.

<img src='images/step5.png'/>

Save `pipe` object to a pickle file

In [12]:
output_file = 'data/pickled_pipe.pkl'

In [14]:
with open(output_file, 'wb') as f:
        pickle.dump(pipe, f)

You can now `move` the pickle file anywhere you want!

To unpickle the `pipe` object:

In [15]:
input_file = 'data/pickled_pipe.pkl'

In [16]:
unpickled_pipe = pickle.load(open(input_file, 'rb'))

In [17]:
unpickled_pipe.named_steps

{'featurizer': Featurizer(cols=['body_length', 'channels', 'country', 'currency', 'description', 'email_domain', 'event_created', 'event_end', 'event_published', 'event_start', 'fb_published', 'has_analytics', 'has_header', 'has_logo', 'listed', 'name', 'name_length', 'object_id', 'org_desc', 'org_facebook', 'org_name', 'org... 'venue_address', 'venue_country', 'venue_latitude', 'venue_longitude', 'venue_name', 'venue_state']),
 'imputer': Imputer(cols_dict={'body_length': 'cont', 'channels': 'cat', 'country': 'cat', 'currency': 'cat', 'fb_published': 'cat', 'has_analytics': 'cat', 'has_header': 'cat', 'has_logo': 'cat', 'listed': 'cat', 'name_length': 'cont', 'payout_type': 'cat', 'sale_duration': 'cont', 'show_map': 'cat', 'user_age': 'cont', 'user_type': 'cat', 'event_duration': 'cont', 'has_payee_name': 'cat', 'has_previous_payouts': 'cat', 'has_payout_type': 'cat', 'has_facebook': 'cat', 'has_twitter': 'cat'}),
 'dummifier': Dummifier(cols_to_dummy=['channels', 'country', 'currenc

## SUMMARY- All the steps:

<img src='images/step6.png'/>

## HOW TO BUILD A CUSTOM PIPELINE CLASS

### Basic structure:

In [23]:


class PipelineClass(BaseEstimator, TransformerMixin):
    def __init__(self, param=None):
        self.param = param
        
    def fit(self, X, y=None):
        return self
    
    def transform(X):
        X = X.copy()
        return X
    
    @staticmethod
    def helper_function(X):
        return X.shape
    
    

* The parameters need to be assigned in the `__init__` () method
* The name of 'param' needs to match the name 'self.params' exactly
* The fit method must return itself, even if there is no need to fit anything
* the class should inherit traits from `BaseEstimator` and `TransformerMixin`

### EXAMPLES

In [21]:


class Featurizer(BaseEstimator, TransformerMixin):
    """Clean incoming df to fit into model"""
    
    def __init__(self, cols=None):
        """INPUT: a data_type_dict to determine which columns are 
                  continueous and categorical
                  an optional cols list of columns to select"""
        if cols==None:
            self.cols = ['body_length', 
                            'channels', 
                            'country', 
                            'currency', 
                            'description', 
                            'email_domain', 
                            'event_created', 
                            'event_end',
                            'event_published', 
                            'event_start', 
                            'fb_published', 
                            'has_analytics',
                            'has_header', 
                            'has_logo', 
                            'listed', 
                            'name', 
                            'name_length', 
                            'object_id',
                            'org_desc', 
                            'org_facebook', 
                            'org_name', 
                            'org_twitter', 
                            'payee_name',
                            'payout_type', 
                            'previous_payouts', 
                            'sale_duration', 
                            'show_map',
                            'ticket_types', 
                            'user_age', 
                            'user_created', 
                            'user_type',
                            'venue_address', 
                            'venue_country', 
                            'venue_latitude', 
                            'venue_longitude',
                            'venue_name', 
                            'venue_state']
        else:
            self.cols = cols
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        """tranform and clean incoming training or test"""
        df = X.copy()
        df = df.loc[:,self.cols]
        df['event_duration'] = df['event_end']-df['event_start']
        df['has_payee_name'] = df['payee_name'].apply(self.is_empty)
        df['has_header'] = df['has_header'].fillna(0)
        df['has_previous_payouts'] = df['previous_payouts'].apply(self.is_empty)
        df['has_payout_type'] = df['payout_type'].apply(self.is_empty)
        df['has_facebook'] = df['org_facebook'].apply(self.is_not_zero)
        df['has_twitter'] = df['org_twitter'].apply(self.is_not_zero)
        df['country'] = df['country'].apply(self.replace_empty_with_none)
        drop_list = ['description',
                    'event_created',
                    'event_end',
                    'event_published',
                    'event_start',
                    'name',
                    'object_id',
                    'payee_name',
                    'ticket_types',
                    'user_created',
                    'venue_address',
                    'venue_country',
                    'venue_longitude',
                    'venue_latitude',
                    'venue_name',
                    'venue_state',
                    'previous_payouts',
                    'email_domain',
                    'org_name',
                    'org_twitter',
                    'org_facebook',
                    'org_desc']
        return df.drop(drop_list, axis=1)

    @staticmethod  
    def is_not_zero(x):
        if x == 0:
            return 0
        return 1

    @staticmethod
    def is_empty(x):
        if not x:
            return 0
        return 1

    @staticmethod
    def max_cost(row):
        """Find the hightest ticket price from a row in df['ticket_types']
        input: [{'event_id': 527017,
                'cost': 25.0,
                'availability': 1,
                'quantity_total': 800,
                'quantity_sold': 0},
                {'event_id': 527017,
                'cost': 50.0,
                'availability': 1,
                'quantity_total': 100,
                'quantity_sold': 0},
                {'event_id': 527017,
                'cost': 550.0,
                'availability': 1,
                'quantity_total': 20,
                'quantity_sold': 0}]
        output: 550.0 """
        maximum = 0
        for item in row:
            if item['cost'] >= maximum:
                maximum = item['cost']
        return maximum
    
    @staticmethod
    def replace_empty_with_none(x):
        if not x:
            return 'None'
        else: 
            return x
        
        

In [None]:


class Imputer(BaseEstimator, TransformerMixin):
    """Impute either mode or mean into cleaned and dummied data"""
    def __init__(self, cols_dict=None):
        if cols_dict==None:
            self.cols_dict = {'body_length':'cont', 
                                'channels':'cat', 
                                'country':'cat', 
                                'currency':'cat', 
                                'fb_published':'cat', 
                                'has_analytics':'cat', 
                                'has_header':'cat', 
                                'has_logo':'cat', 
                                'listed':'cat',
                                'name_length':'cont', 
                                'payout_type':'cat', 
                                'sale_duration':'cont', 
                                'show_map':'cat', 
                                'user_age':'cont',
                                'user_type':'cat', 
                                'event_duration':'cont', 
                                'has_payee_name':'cat', 
                                'has_previous_payouts':'cat',
                                'has_payout_type':'cat', 
                                'has_facebook':'cat', 
                                'has_twitter':'cat'}
        else:
            self.cols_dict = cols_dict

    def fit(self, X, y=None):
        """save the values to impute into each column"""
        df = X
        self.averages = {}
        for col, val in self.cols_dict.items():
            if val=='cat':
                self.averages[col] = 'None'
            if val=='cont':
                self.averages[col] = df.loc[:,col].mean()
        return self

    def transform(self, X):
        """for each column in df, impute the columns mean or mode if nan"""
        df = X.copy()
        for col in df.columns:
            df[col] = df[col].fillna(self.averages[col])
        return df
    
    

In [None]:


class Dummifier(BaseEstimator, TransformerMixin):
    """Dummify certain columns in a DataFrame"""
    def __init__(self, cols_to_dummy=None):
        if cols_to_dummy==None:
            self.cols_to_dummy = ['channels', 
                                  'country', 
                                  'currency', 
                                  'fb_published', 
                                  'has_analytics', 
                                  'has_header', 
                                  'has_logo', 
                                  'listed',
                                  'payout_type', 
                                  'show_map', 
                                  'user_type', 
                                  'has_payee_name', 
                                  'has_previous_payouts',
                                  'has_payout_type', 
                                  'has_facebook', 
                                  'has_twitter']
        else:
            self.cols_to_dummy = cols_to_dummy 
        self.unique_items = {}

    def fit(self, X, y=None):
        df = X
        for col in self.cols_to_dummy:
            self.unique_items[col] = df[col].unique()
        return self
            
    def transform(self, X):
        df = X.copy()
        dummy_df = pd.DataFrame()
        for col in self.cols_to_dummy:
            columns = self.unique_items[col]
            for item in columns:
                if item==None:
                    continue
                dummy_df[f'{col}_{item}'] = df[col]==item
            dummy_df = dummy_df.iloc[:,:-1]    
        df = df.drop(self.cols_to_dummy, axis=1)
        dummy_df = dummy_df.astype(int)
        df = pd.concat([df, dummy_df], axis=1)
        return df
    
    

In [None]:


class Standardizer(BaseEstimator, TransformerMixin):
    """Standardize continuous columns"""
    def __init__(self, continuous_cols=None):
        if continuous_cols==None:
            self.continous_cols = ['body_length', 'name_length', 
                                   'sale_duration', 'user_age', 
                                   'event_duration']
        else:
            self.continous_cols = continuous_cols

    def fit(self, X, y=None):
        df = X
        self.means = {}
        self.standard_devs = {}
        for col in self.continous_cols:
            self.means[col] = df[col].mean()
            self.standard_devs[col] = df[col].std()
        return self
    
    def transform(self, X):
        df = X.copy()
        for col in self.continous_cols:
            df[col] = (df[col]-self.means[col])/self.standard_devs[col]
        return df
    
    

### OTHER PIPELINE STRUCTURES

sklearn pipeline psueocode:

class Pipeline():
    def __init__(self, steps):
    self.steps = steps
    def fit(self, X, y=None):    
        X = X.copy()    
        for step in self.steps:    
            X = step.fit_transform(X)
        return self


### FINAL GOAL:

In [3]:
def fraud_pipeline():
    """instantiate a pipeline object"""
    pipeline = Pipeline([
        ('featurizer', Featurizer()),
        ('imputer', Imputer()),
        ('dummifier', Dummifier()),
        ('standardizer', Standardizer()),
        ('model', RandomForestClassifier(n_estimators=5000, 
                                         max_depth=25))
        ])
    return pipeline

def pickle_pipeline(pipeline, output_name):
    """Save fitted pipeline to pickle file"""
    with open(output_name, 'wb') as f:
        pickle.dump(pipeline, f)