# Playground
This is a working copy of the repeat-call prediction code.

It is used to save time during development by keeping the trained model in memory between experiments.

For production, the code will be transferred to a .py script and run as a scheduled task.


In [1]:
# Environment

import sys              # General system functions
import pyodbc           # Python ODBC library
import pandas as pd     # Pandas data frame library

from sklearn.preprocessing import StandardScaler                    # for scaling features to zero-mean/unit-variance
from sklearn.model_selection import train_test_split                # splitting of data into training/validation
from keras.utils.np_utils import to_categorical                     # one-hot encoding of outputs
from keras.models import Sequential                                 # Standard sequential model
from keras.layers.core import Dense, Activation, Dropout            # Usual layers required
from keras.optimizers import Adam                                   # Import the Adam optimiser
from keras.callbacks import EarlyStopping                           # Early stopping callback to avoid over-fit
# Constants
MODULE_NAME = 'call_analysis.py'
SERVER = 'wf-sql-01'
DATABASE = 'smart'
CONNECTION = 'DRIVER={SQL Server Native Client 11.0};'   # ODBC driver clause; server/database/login are appended later

REPEAT_DAYS = 14        # Calendar-day window (counted back from today) in which a repeat call may occur
TRAIN_SIZE = 0.8        # Fraction (0-1) of the modelling rows used for training; the rest is validation

# Field handling
# Columns that are 1-hot encoded before being fed to the network
categoricals = [
    'BusinessType', 'Manufacturer', 'ProductId', 'DeviceType',
    'LastOtherCallType', 'CreatedBy', 'CreatedDay', 'AttendDay',
    'PostCodeArea', 'FirstEngineer', 'SymptomCodeId',
]
# Columns removed from the dataframe because they are not useful for prediction
drop_fields = [
    'ID', 'Incident', 'IncidentType', 'CustomerId',
    'LedgerCode', 'SiteId', 'SerialNo', 'SymptomDescription',
]
# Raw time stamp columns - also removed from the dataframe, as they are already encoded
time_stamp_fields = [
    'FirstUsed', 'Installed', 'LastBreakCall', 'LastOtherCall',
    'InitialMeterReadingDate', 'LastMeterReadingDate',
    'CreatedDateTime', 'CreatedTime', 'AttendDateTime',
]

# Global variables
dbconn = None           # shared database connection; assigned once the connection is opened

Using TensorFlow backend.


In [4]:
# Database login name; supplied as the UID when the ODBC connection string is built
username = 'ReportsReader'

In [3]:
import getpass    # portable password input

# Prompt for the database password without echoing it, so it is never stored in the notebook.
# The prompt must reference `username` (the login defined for the connection); the
# previous `user` name is not defined anywhere and fails on a fresh kernel.
password = getpass.getpass('Enter the password for ' + username + '\n')

Enter the password for ReportsReader
········


In [5]:
# Helper function - wraps a string in single quotes, for SQL building
def quote_string(sString):
    """Return ``sString`` enclosed in single quotes.

    Any single quote inside the text is swapped for ``#`` so that it cannot
    terminate the quoted literal early.
    """
    # replace() leaves the text untouched when it contains no quote,
    # so a single code path covers both cases.
    sanitised = sString.replace("'", "#")
    return "'%s'" % sanitised

def writelog(message):
    """Insert ``message`` into the zCALL_ANALYSIS_LOG table and commit it.

    The text is passed as a bound query parameter rather than being spliced
    into the SQL statement, so apostrophes and other special characters are
    stored verbatim and cannot alter the statement.

    Parameters
    ----------
    message : object
        Text to record in the ``Description`` column; converted with ``str()``.

    Notes
    -----
    Uses the module-level ``dbconn`` connection, which must already be open.
    """

    global dbconn

    # write to the message log
    sql = 'INSERT INTO zCALL_ANALYSIS_LOG (Description) VALUES (?)'
    cursor = dbconn.cursor()
    try:
        cursor.execute(sql, str(message))
        cursor.commit()
    finally:
        # Release the cursor even if the insert or commit fails
        cursor.close()

In [7]:
print('Starting analysis process')
global dbconn

# Connect to the database: driver clause, then server, database and login details
s_conn = ''.join([
    CONNECTION,
    'SERVER=', SERVER,
    ';DATABASE=', DATABASE, ';',
    'UID=', username,
    ';PWD=', password,
])
dbconn = pyodbc.connect(s_conn)

writelog('Python analysis started and connected to database OK')

# Read in the data, ordered by incident, straight into a data frame
sql = 'select * from zCALL_ANALYSIS Order By [Incident] ASC'
df_all = pd.read_sql_query(sql, dbconn)
print('Read %s rows, with %s columns' % df_all.shape)

# Encode the data
# Prefix each categorical value with its column name, so that the indicator
# columns created below have names that are unique across all fields
for c in categoricals:
    prefix = str(c) + '-'
    df_all[c] = df_all[c].map(lambda value, prefix=prefix: prefix + str(value))
# Append one indicator (one-hot) column per distinct value of each categorical
for c in categoricals:
    df_all = df_all.join(pd.get_dummies(df_all[c]))
print('Now have %s rows, with %s columns' % df_all.shape)
# Extract the ground-truth values: True where the incident is flagged as a repeat
y_repeat = df_all['IncidentType'] == 'REPEAT'

# Find the point at which predictions required (i.e recent incidents)
# The query returns the lowest incident number attended within the last REPEAT_DAYS days.
sql = 'select MIN(Incident) [MinIncident] from zCALL_ANALYSIS '
sql += 'where AttendDateTime >= DATEADD(d, -' + str(REPEAT_DAYS) + ', getdate())'
cursor = dbconn.cursor()
cursor.execute(sql)
rs = cursor.fetchall()
cursor.close()
# MIN() yields NULL (None) when no incident falls inside the window
min_prediction_incident = rs[0].MinIncident if rs else None

print('Minimum incidentId for prediction: %s' % min_prediction_incident)

# Locate the last row whose incident number is at or before the cut-off.
# Positions are used (not index labels) because the result slices the scaled arrays later.
incident_ids = df_all['Incident'].values
if min_prediction_incident is None:
    # Nothing attended in the window: leave the prediction slice empty rather than
    # failing on a comparison against None
    min_prediction_index = len(incident_ids)
    print('No incidents attended in the last %s days; nothing to predict' % REPEAT_DAYS)
else:
    at_or_before = (incident_ids <= min_prediction_incident).nonzero()[0]
    if at_or_before.size:
        min_prediction_index = int(at_or_before[-1])
        print('Found minimum incident at row %s, incidentId %s' % (min_prediction_index, incident_ids[min_prediction_index]))
    else:
        # Every loaded row is after the cut-off, so the whole frame is the prediction window
        # (the previous row-by-row walk ran off the start of the frame in this case)
        min_prediction_index = 0
        print('All %s rows are after the cut-off incident' % len(incident_ids))

# Retain the list of incident IDs, for reporting later
df_incident = df_all['Incident']
print('Retained incident list with %s rows' % (df_incident.shape[0]))

# Drop out fields which are not required: the raw categoricals (now one-hot encoded),
# the non-predictive fields and the raw time stamps
df_all = df_all.drop(categoricals + drop_fields + time_stamp_fields, axis=1)
print('After dropping fields, now have %s rows, with %s columns' % (df_all.shape[0], df_all.shape[1]))

# Get rid of any NULL values in numerical fields, replace any remaining NaN values with -1
# This is to avoid breaking the algorithms later
df_all = df_all.fillna(-1)

# Scale the fields, zero-mean/unit-variance
# NOTE(review): the scaler is fitted on every row, including the validation split and the
# prediction window - consider fitting on the training rows only.
X_scaler = StandardScaler().fit(df_all)
X_scaled = X_scaler.transform(df_all)

# Encode the target outputs as two columns (not-repeat, repeat).
# num_classes is fixed so the shape still matches the 2-unit output layer
# when the data happens to contain no REPEAT rows.
y_binary = to_categorical(y_repeat, num_classes=2)

# Split out the training data / test data / prediction data
X_predict = X_scaled[min_prediction_index:]
y_predict = y_binary[min_prediction_index:]                  # may use this for known REPEATS in window
X_model   = X_scaled[0:min_prediction_index]
y_model   = y_binary[0:min_prediction_index]

# Fixed seed so the train/validation split is the same on every run
RANDOM_SEED = 42
X_train, X_validate, y_train, y_validate = train_test_split(X_model, y_model,
                                                            train_size=TRAIN_SIZE, test_size=1-TRAIN_SIZE,
                                                            random_state=RANDOM_SEED)

print('Created training set of size %s, validation set of size %s' % (len(X_train), len(X_validate)))
print('Will predict on %s calls since cut-off date' % len(X_predict))
print('Total records: %s' % (len(X_train)+len(X_validate)+len(X_predict)))

# Create the network structure:
# an input-width dense layer with dropout, a funnel of progressively narrower
# ReLU layers, then a 2-way softmax output
n_features = df_all.shape[1]
nn_model = Sequential()
nn_model.add(Dense(n_features, input_shape=(n_features,)))
nn_model.add(Activation('relu'))
nn_model.add(Dropout(0.5))
for width in (1000, 200, 100, 50, 10):
    nn_model.add(Dense(width))
    nn_model.add(Activation('relu'))
nn_model.add(Dense(2))
nn_model.add(Activation('softmax'))
nn_model.compile(optimizer=Adam(),
                 loss='categorical_crossentropy',
                 metrics=['accuracy'])
nn_model.summary()

# Train a model, with early stopping
nBatchSize = 32
nEpoch = 50
# Watch the validation loss (lower is better) with a patience of 5 epochs
early_stop = EarlyStopping(monitor='val_loss', mode='min',
                           min_delta=0, patience=5, verbose=0)
fit_options = dict(batch_size=nBatchSize,
                   epochs=nEpoch,
                   verbose=1,
                   validation_data=(X_validate, y_validate),
                   callbacks=[early_stop])
nn_model.fit(X_train, y_train, **fit_options)





Starting analysis process
Read 30403 rows, with 57 columns
Now have 30403 rows, with 1361 columns
Minimum incidentId for prediction: 255705
Found minimum incident at row 28333, incidentId 255705
Retained incident list with 30403 rows
After dropping fields, now have 30403 rows, with 1333 columns
Created training set of size 22666, validation set of size 5667
Will predict on 2070 calls since cut-off date
Total records: 30403
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_8 (Dense)              (None, 1333)              1778222   
_________________________________________________________________
activation_8 (Activation)    (None, 1333)              0         
_________________________________________________________________
dropout_2 (Dropout)          (None, 1333)              0         
_________________________________________________________________
dense_9 (Dense)              (None, 1000)    



Train on 22666 samples, validate on 5667 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50


<keras.callbacks.History at 0x2d660257da0>

In [8]:
# What does y_predict look like?
# These are the to_categorical-encoded targets for the rows from the prediction cut-off onwards;
# presumably column 1 is set for incidents already flagged as REPEAT - confirm against the encoding.
y_predict

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       ...,
       [1., 0.],
       [1., 0.],
       [1., 0.]], dtype=float32)

In [9]:
# Make predictions (note that some calls may already be repeats by this time)
# Only rows already known to be repeats are scored, and only the first 10 of them are shown
count = 0
for r in range(len(X_predict)):
    if not y_predict[r][1] > 0.1:
        continue                      # not a known repeat - nothing to report
    print('Item ' + str(r) + ' is already a repeat')
    count += 1
    # Slice (rather than index) so the model receives a 2-D, single-row batch
    confidence = nn_model.predict(X_predict[r:r+1])[0][1]
    print('Model will predict this with confidence: ', confidence)
    if count == 10:
        break

Item 12 is already a repeat
Model will predict this with confidence:  0.9319371
Item 15 is already a repeat
Model will predict this with confidence:  0.0036825442
Item 21 is already a repeat
Model will predict this with confidence:  0.97741824
Item 28 is already a repeat
Model will predict this with confidence:  0.55748457
Item 35 is already a repeat
Model will predict this with confidence:  0.0372141
Item 37 is already a repeat
Model will predict this with confidence:  0.7121492
Item 42 is already a repeat
Model will predict this with confidence:  0.012182841
Item 69 is already a repeat
Model will predict this with confidence:  0.81635183
Item 71 is already a repeat
Model will predict this with confidence:  0.63231546
Item 124 is already a repeat
Model will predict this with confidence:  0.046593376


In [None]:
# Done
# Record completion in the database log table, then confirm on the console.
# NOTE(review): dbconn is never closed in this notebook - confirm whether the
# production script should close it after this final log entry.
writelog('Python analysis process completed')
print('\nAnalysis process completed')