In [7]:
import pandas as pd
import numpy as np
import os
import pickle
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import __version__ as sklearn_version
#from sklearn.model_selection import cross_validate
#from sklearn.metrics import accuracy_score
from library.sb_utils import save_file

In [8]:
# Load the previously trained lead-scoring model and sanity-check its metadata.
expected_model_version = '1.0'
model_path = '../models/lead_data_prediction_score.pkl'

# Fail fast: the rest of the notebook calls model.fit / model.predict, so a
# missing file would otherwise only surface later as a NameError on `model`.
if not os.path.exists(model_path):
    raise FileNotFoundError("Expected model not found: %s" % model_path)

# NOTE(security): pickle.load can execute arbitrary code embedded in the file.
# Only load model files that come from a trusted source.
with open(model_path, 'rb') as f:
    model = pickle.load(f)

# Version mismatches are reported but deliberately non-fatal.
if model.version != expected_model_version:
    print("Expected model version doesn't match version loaded")
if model.sklearn_version != sklearn_version:
    print("Warning: model created under different sklearn version")

In [9]:
# Read the lead feature table and show its last rows as a quick sanity check.
lead_data = pd.read_csv(filepath_or_buffer='../data/jds_features.csv')
lead_data.tail(n=5)

Unnamed: 0,match_id,lead_id,agent_id,email_score,phone_score,contact_score,block_score,contact,app
147217,139294,35073,118,90.0,99,94.5,48.972,0,0
147218,31750,10278,182,80.0,96,88.0,69.576,0,0
147219,49027,12080,200,85.0,99,92.0,58.585,0,0
147220,113463,24797,206,90.0,72,81.0,43.887,0,0
147221,39340,11203,6,85.0,65,75.0,52.656,0,0


In [10]:
# Columns that identify a row or encode the contact filter; they are dropped
# before the remaining columns are used for modelling.
non_feature_list = ['match_id', 'lead_id', 'agent_id', 'contact']

# Rows with contact == 1, re-indexed from 0. reset_index() returns a new frame,
# so nothing is mutated in place on a slice of lead_data (the in-place variant
# can trigger SettingWithCopyWarning).
contacted_leads = lead_data[lead_data['contact'] == 1].reset_index(drop=True)

# Same rows with the non-feature columns removed.
contacted_leads_min = contacted_leads.drop(columns=non_feature_list)
contacted_leads_min

Unnamed: 0,email_score,phone_score,contact_score,block_score,app
0,80.0,99,89.5,63.033,0
1,85.0,65,75.0,57.398,0
2,85.0,99,92.0,73.182,1
3,95.0,99,97.0,51.631,0
4,30.0,96,63.0,84.483,0
...,...,...,...,...,...
18684,80.0,68,74.0,72.635,0
18685,80.0,98,89.0,68.594,0
18686,85.0,92,88.5,57.353,0
18687,85.0,99,92.0,51.931,0


In [11]:
# Columns that identify a row or encode the contact filter; they are dropped
# before the remaining columns are used for modelling.
non_feature_list = ['match_id', 'lead_id', 'agent_id', 'contact']

# Rows with contact == 0, as an independent frame re-indexed from 0.
uncontacted_leads = (
    lead_data
    .loc[lead_data['contact'] == 0]
    .reset_index(drop=True)
)

# Same rows with the non-feature columns removed.
uncontacted_leads_min = uncontacted_leads.drop(columns=non_feature_list)
uncontacted_leads_min

Unnamed: 0,email_score,phone_score,contact_score,block_score,app
0,10.0,62,36.0,47.949,0
1,95.0,98,96.5,73.384,0
2,85.0,98,91.5,58.878,0
3,85.0,99,92.0,55.408,0
4,85.0,50,67.5,77.282,0
...,...,...,...,...,...
128528,90.0,99,94.5,48.972,0
128529,80.0,96,88.0,69.576,0
128530,85.0,99,92.0,58.585,0
128531,90.0,72,81.0,43.887,0


In [12]:
## Refit model
# Split by column label rather than by position: 'app' is the target and
# every other remaining column is a feature. Positional slicing silently
# picks the wrong columns if the frame's column layout ever changes.
X = contacted_leads_min.drop(columns=['app'])
y = contacted_leads_min['app']

In [13]:
# Transposed preview of the first rows: one row per feature column.
X.head(n=5).transpose()

Unnamed: 0,0,1,2,3,4
email_score,80.0,85.0,85.0,95.0,30.0
phone_score,99.0,65.0,99.0,99.0,96.0
contact_score,89.5,75.0,92.0,97.0,63.0
block_score,63.033,57.398,73.182,51.631,84.483


In [14]:
# Refit the loaded model on the contacted leads (features X, target y).
model.fit(X, y=y)

Pipeline(memory=None,
         steps=[('simpleimputer',
                 SimpleImputer(add_indicator=False, copy=True, fill_value=None,
                               missing_values=nan, strategy='median',
                               verbose=0)),
                ('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('kneighborsclassifier',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=5, p=2,
                                      weights='uniform'))],
         verbose=False)

In [15]:
# Predict on exactly the columns the model was fitted on, selected by name.
# Slicing "all but the last column" breaks as soon as this cell is re-run,
# because 'app_prediction' (and later id/score columns) are appended to the
# frame and the positional slice then feeds the wrong columns to the model.
feature_cols = list(X.columns)

# Index the predictions by the frame's own index so the assignment below stays
# row-aligned even if the frame has been re-ordered (e.g. sorted) beforehand.
y_pred = pd.DataFrame(
    model.predict(uncontacted_leads_min[feature_cols]),
    index=uncontacted_leads_min.index,
)
uncontacted_leads_min['app_prediction'] = y_pred[0]
uncontacted_leads_min['app_prediction'].value_counts()

0    128203
1       330
Name: app_prediction, dtype: int64

In [21]:
# Re-attach the columns that were dropped before prediction; assignment is
# aligned on the shared row index of the two frames.
for id_col in ('match_id', 'lead_id', 'agent_id', 'contact'):
    uncontacted_leads_min[id_col] = uncontacted_leads[id_col]

# Composite score: plain sum of the four score columns.
uncontacted_leads_min['score'] = (
    uncontacted_leads_min['email_score']
    + uncontacted_leads_min['phone_score']
    + uncontacted_leads_min['contact_score']
    + uncontacted_leads_min['block_score']
)

# Rank: predicted positives first, then highest composite score first.
uncontacted_leads_min.sort_values(
    by=['app_prediction', 'score'],
    ascending=[False, False],
    inplace=True,
)

In [22]:
# Top of the ranked list after sorting.
uncontacted_leads_min.head(n=5)

Unnamed: 0,email_score,phone_score,contact_score,block_score,app,app_prediction,match_id,lead_id,agent_id,contact,score
103038,95.0,99,97.0,79.202,0,1,138088,35403,12,0,370.202
12908,95.0,99,97.0,79.19,0,1,48135,13776,266,0,370.19
30729,95.0,99,97.0,79.183,0,1,89343,20214,183,0,370.183
78347,95.0,99,97.0,79.162,0,1,92517,14542,200,0,370.162
31043,95.0,99,97.0,79.131,0,1,43864,11429,103,0,370.131


In [23]:
# Write the ranked predictions for uncontacted leads to
# 'uncontacted_lead_predictions.csv' in the data directory.
# NOTE(review): per the recorded output, save_file asks for a Y/N confirmation
# when the target file already exists, so this cell waits for keyboard input
# on re-runs — confirm whether that is acceptable for unattended "Run All".
datapath = '../data'
save_file(uncontacted_leads_min, 'uncontacted_lead_predictions.csv', datapath)

A file already exists with this name.

Do you want to overwrite? (Y/N)Y
Writing file.  "../data\uncontacted_lead_predictions.csv"
