In [6]:
from sklearn.preprocessing import OneHotEncoder
import pandas as pd
import numpy as np

In [32]:
# Toy frame for the encoding demo: numeric columns 'a' and 'd', string
# categoricals 'b' and 'c', and a 0/1 column 'e'.
data = pd.DataFrame(
    dict(
        a=[1.232, 2.2323, 3.4342],
        b=['apple', 'banana', 'banana'],
        c=['bob', 'rob', 'sally'],
        d=[232, 656, 343],
        e=[1, 0, 0],
    )
)
# Names of the columns that will be one-hot encoded.
categ = ['b', 'c']
data

Unnamed: 0,a,b,c,d,e
0,1.232,apple,bob,232,1
1,2.2323,banana,rob,656,0
2,3.4342,banana,sally,343,0


In [33]:
enc = OneHotEncoder().fit(data[categ])  # fit to categorical vars
dummy_categ = enc.transform(data[categ])

# Densify the one-hot matrix and give it the same row index as `data`.
# pd.concat(axis=1) aligns rows by index label, so a fresh 0..n-1 index would
# silently misalign (and introduce NaNs) whenever `data` is not RangeIndex-ed.
dummy_categ = pd.DataFrame(dummy_categ.toarray(), index=data.index)
# Put the one-hot columns next to the remaining columns of `data`
# (the raw categorical columns and 'e' are dropped).
features = pd.concat([dummy_categ, data.drop(categ+['e'], axis=1)], axis=1)
features

Unnamed: 0,0,1,2,3,4,a,d
0,1.0,0.0,1.0,0.0,0.0,1.232,232
1,0.0,1.0,0.0,1.0,0.0,2.2323,656
2,0.0,1.0,0.0,0.0,1.0,3.4342,343


In [37]:
# The one-hot columns carry integer labels while the rest are strings;
# convert every label to str so the column index is uniformly typed.
features.columns = features.columns.map(str)
features.columns

Index(['0', '1', '2', '3', '4', 'a', 'd'], dtype='object')

In [39]:
import pickle
import pandas as pd
import numpy as np
import yaml
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report


def get_model(model_path, encoder_path):
    '''Loads the pickled model and the pickled encoder from disk

    NOTE: unpickling can execute arbitrary code, so both paths must point to
    trusted files only.

    Args:
        model_path (str): path to pickled model
        encoder_path (str): path to pickled encoder
    Returns:
        tuple: ``(model, encoder)`` -- the two unpickled objects, model first.
            The notebook expects a binary classifier and a fitted
            OneHotEncoder, but whatever the files contain is returned as-is.
    '''
    loaded = []
    # Same order as the return value: model file first, then encoder file.
    for path in (model_path, encoder_path):
        with open(path, "rb") as handle:
            loaded.append(pickle.load(handle))

    model, encoder = loaded
    return model, encoder

def transform(encoder, cat_inputs, trans_price):
    '''Builds the single-row model input for one individual
    Args:
        encoder (sklearn.preprocessing._encoders.OneHotEncoder): encoder for categorical variables;
            the result of its ``transform`` must provide ``toarray()``
        cat_inputs (:obj:`list` of :obj:`str`): categorical inputs of individual
        trans_price (float): stock price on the day of transaction
    Returns:
        list: one-element list holding a 1-D numpy array -- the encoded
            categorical values followed by ``trans_price``
    '''
    # The encoder works on 2-D input, so wrap the single record in a list and
    # take the only row of the dense result.
    encoded_row = encoder.transform([cat_inputs]).toarray()[0]
    # The price is appended after the encoded categorical values.
    feature_row = np.append(encoded_row, trans_price)
    # Wrap again: the model's predict functions expect one row per sample.
    return [feature_row]

def predict_ind(model, encoder, cat_inputs, trans_price):
    '''Predicts the positive-class probability for a new individual
    Args:
        model (sklearn.multiclass.OneVsRestClassifier): binary logistic regression model
        encoder (sklearn.preprocessing._encoders.OneHotEncoder): encoder for categorical variables
        cat_inputs (list): categorical inputs of individual
        trans_price (float): stock price on the day of transaction
    Returns:
        prediction (numpy.ndarray): array of shape (1,) with the probability of
            the class in column 1 of ``predict_proba`` (assumed to be the
            "profitable investment" class -- confirm against ``model.classes_``)
    '''
    test_new = transform(encoder, cat_inputs, trans_price)
    # predict_proba returns one row per sample and one column per class.
    # Only one sample is passed in, so the result has a single row: indexing
    # it with [1] asks for a second row and raises IndexError. Select the
    # second *column* instead.
    probabilities = np.asarray(model.predict_proba(test_new))
    prediction = probabilities[:, 1]

    return prediction
# Load the pickled model (m) and encoder (e); the paths are relative, so they
# resolve against the kernel's current working directory.
m, e = get_model('../models/model.pkl','../models/encoder.pkl')

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations
