# Imports

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [259]:
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, log_loss, roc_auc_score, roc_curve, confusion_matrix
from sklearn.preprocessing import OrdinalEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, chi2, SelectFromModel

In [260]:
import feature_engine.missing_data_imputers as mdi
from feature_engine import categorical_encoders as ce
from feature_engine import discretisers as dsc
from feature_engine import outlier_removers as outr

In [261]:
from sqlalchemy import create_engine
from snowflake.sqlalchemy import URL
import psycopg2
import csv

In [262]:
from scipy.stats import randint as sp_randint
import xgboost as xgb

# Options

In [7]:
import warnings
warnings.filterwarnings("ignore")

In [8]:
pd.options.display.max_rows = 999
pd.options.display.max_columns = 999

# Pipeline

In [11]:
def get_data(query):
    print('Starting data collection phase...')
    # Snowflake
    url = URL(
        account = 'host',
        user = 'username',
        password = 'password',
        database = 'database_name',
        schema = 'schema_name',
        warehouse = 'warehouse_name',
        role='role_name',
    #     authenticator='https://xxxxx.okta.com',
    )
    engine = create_engine(url)
    conn = engine.connect()
    #PG
    conn = psycopg2.connect(host="host",database="prod", user="username", password="password")
    query = '''initial dataset query from DB'''
    base_data = pd.read_sql_query(query,conn)
    return base_data

In [403]:
def create_date_features(df, date, drop_initial_feature=True):
    """
    Creates time series features from datetime index
    """
    df['date'] = df[date].dt.strftime('%Y-%m-%d')
    df['hour'] = df[date].dt.strftime('%H')
    df['dayofweek'] = df[date].dt.strftime('%a')
    df['quarter'] = (df[date].dt.quarter).astype(str)
    df['month'] = df[date].dt.strftime('%b')
    df['year'] = df[date].dt.strftime('%Y')
    df['dayofyear'] = df[date].dt.strftime('%-j')
    df['dayofmonth'] = df[date].dt.strftime('%-d')
    df['weekofyear'] = df[date].dt.strftime('%W')
    X = df[['hour','dayofweek','month','year'
           'dayofyear','dayofmonth','weekofyear']]
    if drop_initial_feature:
        df.drop(date,inplace=True, axis=1)
    return X.columns.values.tolist()

In [13]:
def combine_features(data,feature1,feature2):
    data[f"{feature1 + feature2}"] = data[feature1] + data[feature2]
    return data

In [14]:
def impute_missing_values(missing_values_imputer_list,base_data):
    # set up the imputer to transform nulls into a Missing category
    imputer = mdi.CategoricalVariableImputer(variables=missing_values_imputer_list)
    # fit the imputer
    imputer.fit(base_data)
    # transform the data
    base_data = imputer.transform(base_data)
    return base_data

In [15]:
def rare_label_encode(encoder_list,base_data):
    # set up the encoder to group rarer labels
    encoder = ce.RareLabelCategoricalEncoder(tol=0.01, n_categories=10, variables=encoder_list)
    # fit the encoder
    encoder.fit(base_data)
    # transform the data
    base_data = encoder.transform(base_data)
    return base_data

In [16]:
def one_hot_encode(one_hot_list,base_data):
    # set up the encoder to OneHotEncode
    encoder = ce.OneHotCategoricalEncoder(drop_last=False)
    # fit the encoder
    encoder.fit(base_data)
    # transform the data
    base_data = encoder.transform(base_data)
    return base_data

In [303]:
def preprocess(base_data,feature_dimensions=None,feature_combinations=[],features_drop=[],drop_na=True,fill_na=False,
            missing_values_imputer_list=[],rare_label_encoder_list=[],one_hot_list=[], scaler=None, date=[]):
    print('Starting preprocessing phase...')
    if feature_dimensions:
        base_data = base_data[feature_dimensions]
    else:
        feature_dimensions = base_data.columns.values.tolist()
    if date in feature_dimensions:
        print('Creating date features...')
        date_features = create_date_features(base_data, date)
    else:
        date_features = []
    if feature_combinations:
        print('Creating feature combinations...')
        combine_features(base_data,feature_combinations[0], feature_combinations[1])
    print('Imputing missing observations...')
    missing_values_imputer_list=base_data.select_dtypes(include='object').columns.tolist()
    base_data = impute_missing_values(missing_values_imputer_list,base_data)
    if fill_na:
        print('Filling NAs...')
        base_data[fill_na].fillna(0,inplace=True)
    if drop_na:
        print('Dropping NAs...')
        base_data.dropna(inplace=True)
    print('Enconding rare labels...')
    rare_label_encoder_list=base_data.select_dtypes(include='object').columns.tolist()
    base_data = rare_label_encode(rare_label_encoder_list,base_data)
    print('Dropping unnecessary features...')
    if features_drop+ (date if date == [] else [date])+feature_combinations != []:
        base_data.drop(features_drop+(date if date == [] else [date])+feature_combinations, axis=1, inplace=True)
    print('OneHotEncoding features...')
    one_hot_list=base_data.select_dtypes(include='object').columns.tolist()
    base_data = one_hot_encode(one_hot_list+feature_combinations+date_features,
                               base_data) if one_hot_list+feature_combinations+date_features != [] else base_data
    if scaler:
        scaler = StandardScaler()
        scaler.fit(X_train)
        X_train = scaler.transform(X_train)
    return base_data

In [397]:
def feature_selection(base_data, method, target, k=None, test_size=0.3):
    print('Starting feature selection phase...')
    # Shuffle data before splitting
    base_data = base_data.sample(frac=1).reset_index(drop=True)
    if method == 'KBest':
        # SelectKBest
        X_new = SelectKBest(chi2, k=k).fit_transform(base_data.loc[:, base_data.columns != target], base_data[target])
        return  train_test_split(X_new, base_data[target], test_size=test_size)
    if method == 'SelectFromRF':
        X_train, X_test, y_train, y_test = train_test_split(base_data.loc[:, base_data.columns != target], 
                                                            base_data[target], test_size=test_size)
        RF = RandomForestClassifier(n_estimators = 100)
        sel = SelectFromModel(RF)
        sel.fit(X_train, y_train)
        selected_feat = X_train.columns[(sel.get_support())]
        return selected_feat

In [396]:
def model(X_train, y_train, model_type, model_params=None):
    print('Starting model fitting phase...')
    if model_type == 'classification':
        if 'LR' in locals() or 'LR' in globals():
            del LR
        LR = LogisticRegression()
        LR.fit(X_train, y_train)
        if 'RF' in locals():
            del RF
        RF = RandomForestClassifier()
        params = {"max_depth": [3, None],
                  "max_features": ['sqrt', 'log2', len(X_train.columns)],
                  "min_samples_split": sp_randint(2, 11),
                  "bootstrap": [True, False],
                  "criterion": ["gini", "entropy"],
                  "n_estimators": [100,200]}
        print('Initiating randomized search...')
        RRF = RandomizedSearchCV(RF, 
                                 params, 
                                 cv=3, 
                                 n_jobs=-1, 
                                 scoring='f1',
                                 n_iter=50
                                )
                                 # random search with 33% of the possible combinations if bigger than 100 param combinations possible, else run all combinations 
#                                  n_iter=np.product([len(v) for v in params.values()])/3 if np.product([len(v) for v in params.values()]) > 100 else np.product([len(v) for v in params.values()]))
        RRF.fit(X_train, y_train)
        if 'XGB' in locals() or 'XGB' in globals():
            del XGB
            del RXGB
        XGB = xgb.XGBClassifier()
        parameters = {
             "eta"    : [0.05, 0.10, 0.15, 0.20, 0.25, 0.30 ] ,
             "max_depth"        : [ 3, 4, 5, 6, 8, 10, 12, 15],
             "min_child_weight" : [ 1, 3, 5, 7 ],
             "gamma"            : [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
             "colsample_bytree" : [ 0.3, 0.4, 0.5 , 0.7 ]
             }
        RXGB = RandomizedSearchCV(
                                XGB,
                                parameters, 
                                n_jobs=4,
                                scoring="neg_log_loss",
                                cv=3)
        RXGB.fit(X_train, y_train)
        return {'logistic_regression': LR,'randomized Random Forest': RRF.best_estimator_, 'XGBoost': RXGB.best_estimator_}
    elif model_type == 'regression':
        pass
    elif model_type == 'clustering':
        pass

In [395]:
def evaluate(model, X_test, y_test):
    print('Starting evaluation phase...')
    # confusion matrix
    cm = confusion_matrix(y_test, model.predict(X_test))
    # classification report
    cr = classification_report(y_test, model.predict(X_test))
    # roc auc
    auc = roc_auc_score(y_test, model.predict(X_test))

    return [cm,cr,auc]

In [208]:
def save_model(data, models, evaluations, selected_feat):
    print('Saving model...')
    with open('saved_models.csv','a') as f:
        writer = csv.writer(f, delimiter='\n')
        writer.writerow([data.columns.values.tolist(),[v for v in models.values()],evaluations, selected_feat])

In [308]:
if __name__ == '__main__':
    db_data = get_data(query)
    base_data = db_data.copy()
    data = preprocess(base_data.drop('received_at',axis=1), 
                      feature_dimensions=['qualified','gender', 'phone_type', 'phone_country', 'email_provider', 'name_in_email'], 
                      missing_values_imputer_list=base_data.select_dtypes(include='object').columns.tolist(),
                      one_hot_list=base_data.select_dtypes(include='object').columns.tolist(),
                      rare_label_encoder_list=base_data.select_dtypes(include='object').columns.tolist(),
                      fill_na=base_data.select_dtypes(include=['int','float']).columns.tolist(),
                      feature_combinations=['phone_type','phone_country'],
                      date='received_at'
                     )
    X_train, X_test, y_train, y_test = train_test_split(data.loc[:, data.columns != 'qualified'], data['qualified'], test_size=0.3)
    selected_feat = feature_selection(data, 'SelectFromRF','qualified')
    X_train, X_test, y_train, y_test = train_test_split(data[selected_feat.values.tolist()], data['qualified'], test_size=0.3)
    models = model(X_train, y_train, 'classification')
    evaluations = []
    for model_name, model in models.items():
        evaluations.append([model_name, evaluate(model, X_test, y_test)])
    print(evaluations)
    save_model(data, models, evaluations, selected_feat)

Starting data collection phase...
Starting preprocessing phase...
Creating feature combinations...
Imputing missing observations...
Dropping NAs...
Enconding rare labels...
Dropping unnecessary features...
OneHotEncoding features...
Starting model fitting phase...
Initiating randomized search...
Starting evaluation phase...
Starting evaluation phase...
Starting evaluation phase...
[['logistic_regression', [array([[11974,  3110],
       [ 7260,  4313]]), '              precision    recall  f1-score   support\n\n           0       0.62      0.79      0.70     15084\n           1       0.58      0.37      0.45     11573\n\n    accuracy                           0.61     26657\n   macro avg       0.60      0.58      0.58     26657\nweighted avg       0.60      0.61      0.59     26657\n', 0.5832495260333429, 0.6669167291822956, 0.6033383345836459, 0.25881470367591897, 0.21530382595648911]], ['randomized Random Forest', [array([[10645,  4439],
       [ 6595,  4978]]), '              precisi

In [407]:
models

{'logistic_regression': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                    intercept_scaling=1, l1_ratio=None, max_iter=100,
                    multi_class='warn', n_jobs=None, penalty='l2',
                    random_state=None, solver='warn', tol=0.0001, verbose=0,
                    warm_start=False),
 'randomized Random Forest': RandomForestClassifier(bootstrap=False, class_weight=None, criterion='entropy',
                        max_depth=None, max_features=124, max_leaf_nodes=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=1, min_samples_split=5,
                        min_weight_fraction_leaf=0.0, n_estimators=100,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 'XGBoost': XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
               colsample_bynode=1, co