In [1]:
%matplotlib inline
import numpy as np
import scipy as sp
import matplotlib as mpl
import matplotlib.cm as cm
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import pandas as pd
pd.set_option('display.width', 500)
pd.set_option('display.max_columns', 100)
pd.set_option('display.notebook_repr_html', True)
import seaborn as sns
sns.set_style("whitegrid")
sns.set_context("poster")
import sklearn.model_selection
import os
import pickle
from sklearn import __version__ as sklearn_version
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import train_test_split, cross_validate, GridSearchCV, learning_curve
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.dummy import DummyRegressor
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
import datetime
from library.sb_utils import save_file

In [2]:
## Load the lead feature table from disk and preview its first five rows
feature_data = pd.read_csv(filepath_or_buffer='../data/jds_features.csv')
feature_data.head(5)

Unnamed: 0,match_id,lead_id,agent_id,email_score,phone_score,contact_score,block_score,contact,app
0,40667,12289,174,10.0,62,36.0,47.949,0,0
1,144108,30625,176,95.0,98,96.5,73.384,0,0
2,4314,4446,91,85.0,98,91.5,58.878,0,0
3,89955,20851,228,85.0,99,92.0,55.408,0,0
4,77985,20544,168,85.0,50,67.5,77.282,0,0


In [3]:
## Per-column flag: True if the column holds at least one missing value
feature_data.isna().any()

match_id         False
lead_id          False
agent_id         False
email_score      False
phone_score      False
contact_score    False
block_score      False
contact          False
app              False
dtype: bool

In [4]:
## Keep only the rows where contact == 1; these are the leads used for modelling.
## The index is renumbered from 0 so it no longer carries the original row labels.
contacted_leads = feature_data.loc[feature_data['contact'].eq(1)].reset_index(drop=True)
print(feature_data.shape, contacted_leads.shape, sep='\n')

(147222, 9)
(18689, 9)


In [5]:
## Preview the last five contacted leads
contacted_leads.tail(5)

Unnamed: 0,match_id,lead_id,agent_id,email_score,phone_score,contact_score,block_score,contact,app
18684,132655,34142,230,80.0,68,74.0,72.635,1,0
18685,117320,28132,206,80.0,98,89.0,68.594,1,0
18686,28938,11026,192,85.0,92,88.5,57.353,1,0
18687,60304,17635,145,85.0,99,92.0,51.931,1,0
18688,23152,7835,131,60.0,19,39.5,61.039,1,0


In [6]:
## Per-column flag: True if the column holds at least one missing value
contacted_leads.isna().any()

match_id         False
lead_id          False
agent_id         False
email_score      False
phone_score      False
contact_score    False
block_score      False
contact          False
app              False
dtype: bool

In [7]:
## Row counts a 70/30 train/test split of the contacted leads would produce
tuple(len(contacted_leads) * fraction for fraction in (.7, .3))

(13082.3, 5606.7)

In [26]:
## Split contacted leads 70/30; 'app' is the target and every other column
## goes into X. random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    contacted_leads.drop(columns='app'),
    contacted_leads['app'],
    random_state=42,
    test_size=0.3,
)

In [9]:
## (rows, columns) of the training and test feature frames
tuple(features.shape for features in (X_train, X_test))

((13082, 8), (5607, 8))

In [10]:
## Lengths of the training and test target vectors
tuple(target.shape for target in (y_train, y_test))

((13082,), (5607,))

In [27]:
## Separate identifier/flag columns from the model features
non_feature_list = ['match_id', 'lead_id', 'agent_id', 'contact']

# Look the identifier columns up in contacted_leads by index label rather than
# in X_train/X_test, so this cell still works after the columns have been
# dropped. train_test_split keeps contacted_leads' index labels on both splits.
non_feature_train = contacted_leads.loc[X_train.index, non_feature_list]
non_feature_test = contacted_leads.loc[X_test.index, non_feature_list]

# Reassign instead of inplace=True: the cell becomes safe to re-run
# (errors='ignore' skips columns that are already gone) and no longer mutates
# the objects returned by train_test_split.
X_train = X_train.drop(columns = non_feature_list, errors = 'ignore')
X_test = X_test.drop(columns = non_feature_list, errors = 'ignore')
X_train.shape, X_test.shape

((13082, 4), (5607, 4))

In [12]:
## Show the dtype of each column in the training feature frame
X_train.dtypes

email_score      float64
phone_score        int64
contact_score    float64
block_score      float64
dtype: object

In [13]:
## Show the dtype of each column in the test feature frame
X_test.dtypes

email_score      float64
phone_score        int64
contact_score    float64
block_score      float64
dtype: object

In [28]:
## Basic logistic regression with scikit-learn's default regularization
clf = LogisticRegression(solver = 'lbfgs')
clf.fit(X_train, y_train)
# accuracy_score expects (y_true, y_pred): ground truth first. Accuracy is
# symmetric so the value is unchanged, but the documented order keeps this
# safe to copy for metrics where the order matters.
accuracy_score(y_test, clf.predict(X_test))

0.942215088282504

In [29]:
## Tune the inverse regularization strength C with a 10-fold
## cross-validated grid search over the training split
Cs = [0.001, 0.1, 1, 10, 100]
parameters = dict(C=Cs)
clf = LogisticRegression(solver='lbfgs')
# GridSearchCV.fit returns the fitted search object, so it can be chained
grid_cv = GridSearchCV(estimator=clf, param_grid=parameters, cv=10).fit(X_train, y_train)
print('Best Parameters: ', grid_cv.best_params_)

Best Parameters:  {'C': 0.001}


In [30]:
## Refit on the full training split with the hyper-parameters chosen by grid_cv
# Read C from the search result instead of hard-coding it, so this cell stays
# correct if the data or the parameter grid changes.
clf = LogisticRegression(solver = 'lbfgs', **grid_cv.best_params_)
clf.fit(X_train, y_train)
# accuracy_score expects (y_true, y_pred): ground truth first
accuracy_score(y_test, clf.predict(X_test))

0.942215088282504

In [17]:
## Build the pipeline that gets saved as the final model
# The pipeline must be fitted before it is saved; an unfitted pipeline loads
# fine but raises NotFittedError on the first predict call. Pipeline.fit
# returns the pipeline itself, so the assignment still binds the pipeline.
LR_pipe = make_pipeline(
    LogisticRegression(C = 0.001, solver = 'lbfgs')
).fit(X_train, y_train)

In [18]:
## Tag the model object with provenance metadata before it is written out
best_model = LR_pipe
# When the model was built and which feature columns (in order) it expects
best_model.build_datetime = datetime.datetime.now()
best_model.X_columns = list(X_train.columns)
# Library versions in use when the model was built
best_model.sklearn_version = sklearn_version
best_model.numpy_version = np.__version__
best_model.pandas_version = pd.__version__
# Version label for this model
best_model.version = '1.0'

In [19]:
## Write the tagged model to the models directory
# Directory (relative to the notebook) that receives the pickle file
modelpath = '../models'
# save_file is the project helper imported from library.sb_utils.
# NOTE(review): the recorded output shows it prompting before overwriting an
# existing file, so this cell appears to need interactive input on re-runs.
save_file(best_model, 'lead_data_prediction_score.pkl', modelpath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../models\lead_data_prediction_score.pkl"


In [31]:
## Prepare the uncontacted leads for scoring (nothing is trained in this cell)
uncontacted_leads = feature_data[feature_data['contact'] == 0]
# NOTE: this rebinds X_train/X_test/y_train/y_test, replacing the
# contacted-lead splits created earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(uncontacted_leads.drop(columns = 'app'),
                                                    uncontacted_leads['app'], test_size = 0.3,
                                                    random_state = 42)
# Drop by reassignment rather than inplace=True, so the frame returned by
# train_test_split is not mutated (a common source of SettingWithCopyWarning).
# X_train is left with its identifier columns, as before.
X_test = X_test.drop(columns = non_feature_list)

In [32]:
## Predict 'app' for every row of X_test and list the rows predicted as 1
# Carry X_test's index onto the predictions so each one can be traced back to
# its source row; without it the frame gets a fresh 0..n-1 index.
result = pd.DataFrame(clf.predict(X_test), index = X_test.index)
result[result[0] == 1]

Unnamed: 0,0
