# Playground
This is a copy of the repeat call prediction coding.

It is used to save time in development by retaining the model in memory

For production, code will be transferred to a .py script to run as a scheduled task


In [1]:
# Environment

import sys              # General system functions
import pyodbc           # Python ODBC library
import pandas as pd     # Pandas data frame library

from sklearn.preprocessing import StandardScaler                    # for scaling features to zero-mean/unit-variance
from sklearn.model_selection import train_test_split                # splitting of data into training/validation
from keras.utils.np_utils import to_categorical                     # one-hot encoding of outputs
from keras.models import Sequential                                 # Standard sequential model
from keras.layers.core import Dense, Activation, Dropout            # Usual layers required
from keras.optimizers import Adam                                   # Import the Adam optimiser
from keras.callbacks import EarlyStopping                           # Early stopping callback to avoid over-fit
# Constants
MODULE_NAME = 'call_analysis.py'
SERVER = 'wf-sql-01'
DATABASE = 'smart'
CONNECTION = 'DRIVER={SQL Server Native Client 11.0};'

REPEAT_DAYS = 14        # Number of calendar days to allow for a repeat call
TRAIN_SIZE = 0.8        # Percentage of training data to use for training, vs validation

# Field handling
# Categoricals are fields to 1-hot encode for deep learning
categoricals = ['BusinessType','Manufacturer', 'ProductId', 'DeviceType','LastOtherCallType','CreatedBy','CreatedDay',
                'AttendDay', 'PostCodeArea','FirstEngineer','SymptomCodeId']
# Drop fields are fields to dispose of from the dataframe, as not useful for prediction
drop_fields = ['Repeated','ID', 'Incident', 'IncidentType', 'CustomerId', 'LedgerCode', 'SiteId',
               'SerialNo', 'SymptomDescription']
# Time stamp fields - these will also need to be dropped from the dataframe, they're already encoded
time_stamp_fields = ['FirstUsed','Installed','LastBreakCall','LastOtherCall','InitialMeterReadingDate',
                     'LastMeterReadingDate','CreatedDateTime','CreatedTime','AttendDateTime']

# Global variables
dbconn = None

Using TensorFlow backend.


In [2]:
username = 'ReportsReader'

In [4]:
import getpass    # portable password input
password = getpass.getpass('Enter the password for ' + username +'\n')

Enter the password for ReportsReader
········


In [5]:
# Helper function - just stick quotes around the string, for SQL building
def quote_string(sString):
    if sString.find("'") < 0:
        return "'%s'" % sString
    else:
        return "'%s'" % sString.replace("'", "#")

def writelog(message):

    global dbconn

    # write to the message log
    sql = 'INSERT INTO zCALL_ANALYSIS_LOG (Description) VALUES (' + quote_string(message) + ')'
    #print(sql)
    cursor = dbconn.cursor()
    cursor.execute(sql)
    cursor.commit()

In [11]:
print('Starting analysis process')
global dbconn

# Connect to the database
s_conn = CONNECTION + 'SERVER='+SERVER+';DATABASE='+DATABASE+';'
s_conn += 'UID='+username+';PWD='+password
dbconn = pyodbc.connect(s_conn)

writelog('Python analysis started and connected to database OK')

# Read in the data
sql = 'select * from zCALL_ANALYSIS Order By [Incident] ASC'
df_all = pd.read_sql_query(sql, dbconn)                         # can read SQL direct to a data frame
print('Read %s rows, with %s columns' % (df_all.shape[0],df_all.shape[1]))

# Encode the data
# Need to set up unique names for values
for c in categoricals:
    df_all[c] = df_all[c].map(lambda x: ((str(c)) + '-' + str(x)))
# Now go over all of the columns and create one-hot encoding
for c in categoricals:
    one_hot = pd.get_dummies(df_all[c])
    df_all = df_all.join(one_hot)
print('Now have %s rows, with %s columns' % (df_all.shape[0], df_all.shape[1]))
# Extract the ground-truth values
#y_repeat = df_all['IncidentType'].map(lambda x: x == 'REPEAT')
y_repeat = df_all['Repeated'].map(lambda x: x == 'YES')

# Find the point at which predictions required (i.e recent incidents)
sql = 'select MIN(Incident) [MinIncident] from zCALL_ANALYSIS '
sql += 'where AttendDateTime >= DATEADD(d, -' + str(REPEAT_DAYS) + ', getdate())'
cursor = dbconn.cursor()
cursor.execute(sql)
rs = cursor.fetchall()
min_prediction_incident = rs[0].MinIncident

print('Minimum incidentId for prediction: %s' % min_prediction_incident)

min_prediction_index = df_all.shape[0]-1
while (df_all['Incident'][min_prediction_index] > min_prediction_incident):
    min_prediction_index -= 1
print('Found minimum incident at row %s, incidentId %s' % (min_prediction_index, df_all['Incident'][min_prediction_index]))

# Retain the list of incident IDs, for reporting later
df_incident = df_all['Incident']
print('Retained incident list with %s rows' % (df_incident.shape[0]))

# Drop out fields which are not required
for c in categoricals + drop_fields + time_stamp_fields:
    del df_all[c]
print('After dropping fields, now have %s rows, with %s columns' % (df_all.shape[0], df_all.shape[1]))

# Get rid of any NULL values in numerical fields, replace any remaining NaN values with -1
# This is to avoid breaking the algorithms later
df_all = df_all.fillna(-1)

# Scale the fields, zero-mean/unit-variance
X_scaler = StandardScaler().fit(df_all)
X_scaled = X_scaler.transform(df_all)

# Encode the target outputs
y_binary = to_categorical(y_repeat)

# Split out the training data / test data / prediction data
X_predict = X_scaled[min_prediction_index:]
y_predict = y_binary[min_prediction_index:]                  # may use this for known REPEATS in window
X_model   = X_scaled[0:min_prediction_index]
y_model   = y_binary[0:min_prediction_index]

# For playground, create an X_test set of 1000 items
X_test = X_model[len(X_model)-1000:]
y_test = y_model[len(y_model)-1000:]
X_model = X_model[0:len(X_model)-1000]
y_model = y_model[0:len(y_model)-1000]

X_train, X_validate, y_train, y_validate = train_test_split(X_model, y_model,
                                                            train_size=TRAIN_SIZE, test_size=1-TRAIN_SIZE)

print('Created training set of size %s, validation set of size %s' % (len(X_train), len(X_validate)))
print('Will predict on %s calls since cut-off date' % len(X_predict))
print('Total records: %s' % (len(X_train)+len(X_validate)+len(X_predict)))

# Create the network structure
nn_model = Sequential()
nn_model.add(Dense(df_all.shape[1], input_shape=(df_all.shape[1] * 1,)))
nn_model.add(Activation('relu'))
nn_model.add(Dropout(0.5))
nn_model.add(Dense(1000))
nn_model.add(Activation('relu'))
nn_model.add(Dense(200))
nn_model.add(Activation('relu'))
nn_model.add(Dense(100))
nn_model.add(Activation('relu'))
nn_model.add(Dense(50))
nn_model.add(Activation('relu'))
nn_model.add(Dense(10))
nn_model.add(Activation('relu'))
nn_model.add(Dense(2))
nn_model.add(Activation('softmax'))
nn_model.compile(loss='categorical_crossentropy',
                 optimizer=Adam(),
                 metrics=['accuracy'])
nn_model.summary()

# Train a model, with early stopping
nBatchSize = 32
nEpoch = 50
early_stop = EarlyStopping(monitor='val_loss',
                           min_delta=0,
                           patience=5,
                           verbose=0, mode='min')
nn_model.fit(X_train, y_train,
             batch_size=nBatchSize, epochs=nEpoch,
             verbose=1, validation_data=(X_validate, y_validate), callbacks=[early_stop])





Starting analysis process
Read 30466 rows, with 58 columns
Now have 30466 rows, with 1364 columns
Minimum incidentId for prediction: 255705
Found minimum incident at row 28333, incidentId 255705
Retained incident list with 30466 rows
After dropping fields, now have 30466 rows, with 1335 columns
Created training set of size 21866, validation set of size 5467
Will predict on 2133 calls since cut-off date
Total records: 29466
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 1335)              1783560   
_________________________________________________________________
activation_8 (Activation)    (None, 1335)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 1335)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 1000)    

<keras.callbacks.History at 0x1b545688668>

In [12]:
# What does y_predict look like?
y_predict

array([[1., 0.],
       [1., 0.],
       [0., 1.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [17]:
# Test accuracy
# Use X_test / y_test data - will be known repeats
import numpy as np
predictions = nn_model.predict(X_test)
predicted_repeats = np.argmax(predictions,axis=1)
print(predictions[0:10])
print(predicted_repeats[0:10])

[[0.81726134 0.18273865]
 [0.72780913 0.27219087]
 [0.7037798  0.29622015]
 [0.4561587  0.54384136]
 [0.9917557  0.00824425]
 [0.7368154  0.2631846 ]
 [0.6708834  0.3291166 ]
 [0.48952138 0.5104786 ]
 [0.7305655  0.26943448]
 [0.71038246 0.2896175 ]]
[0 0 0 1 0 0 0 1 0 0]


In [19]:
# performance on plain argmax of confidence levels
true_positive = 0
false_positive = 0
true_negative = 0
false_negative = 0
for i in range(len(predictions)):
    if y_test[i][1] > 0.1:
        # the ground truth is a repeat
        if predicted_repeats[i] == 1:
            true_positive += 1
        else:
            false_negative += 1
    else:
        # the ground truth is not a repeat
        if predicted_repeats[i] == 1:
            false_positive += 1
        else:
            true_negative += 1 
print('Results are:')
print(' True positives  - correct predictions:   %s' % true_positive)
print(' True negatives  - correct predictions:   %s' % true_negative)
print(' False positives - incorrect predictions: %s' % false_positive)
print(' False negatives - incorrect predictions: %s' % false_negative)

Results are:
 True positives  - correct predictions:   62
 True negatives  - correct predictions:   681
 False positives - incorrect predictions: 32
 False negatives - correct predictions:   225


In [28]:
# test thresholds, with creation of an output file
file_name = 'results.csv'
with open(file_name,'w') as file:
    line = 'TruePositive,TrueNegative,FalsePositive,FalseNegative,Threshold\n'
    file.write(line)
    for t in range(1,20,1):
        threshold = float(t) / 20
        true_positive = 0
        false_positive = 0
        true_negative = 0
        false_negative = 0
        for i in range(len(predictions)):
            if y_test[i][1] > 0.1:
                # the ground truth is a repeat
                if predictions[i][1] >= threshold:
                    true_positive += 1
                else:
                    false_negative += 1
            else:
                # the ground truth is not a repeat
                if predictions[i][1] >= threshold:
                    false_positive += 1
                else:
                    true_negative += 1 
        print('Threshold setting = %.2f' % threshold)
        print(' True positives  - correct predictions:   %s' % true_positive)
        print(' True negatives  - correct predictions:   %s' % true_negative)
        print(' False positives - incorrect predictions: %s' % false_positive)
        print(' False negatives - incorrect predictions: %s' % false_negative)
        line = str(true_positive) + ',' + str(true_negative) + ',' + str(false_positive) + ',' + str(false_negative) 
        line += ',' + str(threshold) + '\n'
        file.write(line)
    file.close()

Threshold setting = 0.05
 True positives  - correct predictions:   267
 True negatives  - correct predictions:   104
 False positives - incorrect predictions: 609
 False negatives - incorrect predictions: 20
Threshold setting = 0.10
 True positives  - correct predictions:   253
 True negatives  - correct predictions:   181
 False positives - incorrect predictions: 532
 False negatives - incorrect predictions: 34
Threshold setting = 0.15
 True positives  - correct predictions:   239
 True negatives  - correct predictions:   254
 False positives - incorrect predictions: 459
 False negatives - incorrect predictions: 48
Threshold setting = 0.20
 True positives  - correct predictions:   213
 True negatives  - correct predictions:   322
 False positives - incorrect predictions: 391
 False negatives - incorrect predictions: 74
Threshold setting = 0.25
 True positives  - correct predictions:   194
 True negatives  - correct predictions:   407
 False positives - incorrect predictions: 306
 Fals

In [13]:
# Make predictions (note that some calls may already be repeats by this time)
count = 0
for r in range(len(X_predict)):
    if y_predict[r][1] > 0.1:
        print('Item ' + str(r) + ' is already a repeat')
        count += 1
        print('Model will predict this with confidence: ', nn_model.predict(X_predict[r:r+1])[0][1])
    if count == 10:
        break

Item 2 is already a repeat
Model will predict this with confidence:  0.5507701
Item 7 is already a repeat
Model will predict this with confidence:  0.16190197
Item 9 is already a repeat
Model will predict this with confidence:  0.047519617
Item 10 is already a repeat
Model will predict this with confidence:  0.33312327
Item 19 is already a repeat
Model will predict this with confidence:  0.23787631
Item 20 is already a repeat
Model will predict this with confidence:  0.5188767
Item 21 is already a repeat
Model will predict this with confidence:  0.27636012
Item 22 is already a repeat
Model will predict this with confidence:  0.33651233
Item 23 is already a repeat
Model will predict this with confidence:  0.314802
Item 25 is already a repeat
Model will predict this with confidence:  0.21586314


In [None]:
# Done
writelog('Python analysis process completed')
print('\nAnalysis process completed')