In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime, time

In [6]:
# Load the dataset from csv and store interesting columns, store timestamps in right format
train_data = pd.read_csv("Data/BPI_Challenge_2012.csv", usecols=['eventID ', 'case concept:name', 'event concept:name', 'event time:timestamp']).head(100000)
train_data['event time:timestamp'] = pd.to_datetime(train_data['event time:timestamp'], dayfirst=True)
train_data['case concept:name'] = train_data['case concept:name'].str.replace("Application_", "").astype('int32')

data = train_data.sort_values('event time:timestamp')

# Split data on time
train_data = data[:int(0.65*len(data))]
test_data = data[-int(0.35*len(data)):]

# Remove all cases from the training data that are in the test data
cond = train_data['case concept:name'].isin(test_data['case concept:name'])
cond2 = test_data['case concept:name'].isin(train_data['case concept:name'])
train_data.drop(train_data[cond].index, inplace=True)
test_data.drop(test_data[cond2].index, inplace=True)

# Sort both datasets on case. In the end we a ~75/25 split
train_data = train_data.sort_values('case concept:name', kind='mergesort')
test_data = test_data.sort_values('case concept:name', kind='mergesort')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


In [7]:
class NaivePredictor:

    def __init__(self):
        self.transition_modes = {}  # Stores most frequent transition based on last made transition
        self.transition_time_averages = {}  # Stores average time between each made transition
        self.prediction = {}    # Made predictions

    def fit(self, x):
        df = x[['eventID ', 'case concept:name', 'event concept:name', 'event time:timestamp']]

        # Calculate the time difference between each two rows in the same case
        df['timeDelta'] = df['event time:timestamp'].diff().dt.total_seconds()
        df.loc[df['case concept:name'] != df['case concept:name'].shift(1), 'timeDelta'] = np.nan
        
        # Store the transitions in seperate columns
        df['transitionToMake'] = df['event concept:name'] + '#' + df['event concept:name'].shift(-1)
        df['transitionMade'] = df['event concept:name'].shift(1) + '#' + df['event concept:name']

        self.df = df    # for debugging purposes

        # Calculate the mean of the time spent during each made transition and store them
        self.transition_time_averages = df.groupby('transitionMade')['timeDelta'].mean().fillna(0).to_dict()

        # Count how often a transition is made
        transition_counts = df.groupby(['event concept:name', 'transitionToMake'], sort=False)['eventID '].count()   

        # Calculate the most often made transition pairs and store them 
        self.transition_modes = transition_counts.loc[transition_counts.groupby('event concept:name').idxmax()].reset_index()['transitionToMake'].str.split('#', 1, expand=True).set_index(0)[1].to_dict()

    def predict(self, x):
        prediction = x[['eventID ', 'case concept:name', 'event concept:name', 'event time:timestamp']]

        # For all events, predict the next event within the case
        prediction['event'] = np.where(prediction['case concept:name'] == prediction['case concept:name'].shift(1), prediction['event concept:name'].shift(1).apply(lambda e: self.transition_modes[e] if not pd.isnull(e) and e in self.transition_modes  else np.nan), np.nan) 

        # Timestamp and event name of previous event is needed for the prediction in the next line
        prediction['timestamp'] = prediction['event time:timestamp'].shift(1)
        prediction['event concept:name'] = prediction['event concept:name'].shift(1)

        # For all events, predict the next event timestamp by adding the time average of the predicted next event to the previous timestamp, within the case
        prediction['timestamp'] = np.where(prediction['case concept:name'] == prediction['case concept:name'].shift(1), prediction.apply(lambda e: (e.timestamp + pd.Timedelta(seconds=self.transition_time_averages[e['event concept:name'] + '#' + e.event])) if(not pd.isnull(e.timestamp) and not pd.isnull(e.event) and e['event concept:name'] + '#' + e.event in self.transition_time_averages) else np.datetime64('NaT'), axis=1), np.datetime64('NaT'))

        prediction['event concept:name'] = prediction['event concept:name'].shift(-1)   # Restore event column

        self.prediction = prediction


In [8]:
def transition_accuracy(y, y_hat):
    n = y.shape[0]
    error = y_hat['event'] == y['event concept:name']
    return error.sum() / n

def time_delta_mse(y, y_hat):
    timedelta = abs((y_hat['timestamp'] - y['event time:timestamp']).dt.total_seconds())
    return timedelta.mean()


In [12]:
time_old = time.time()

predictor = NaivePredictor()

predictor.fit(train_data)

time_fit = time.time() - time_old

# df = predictor.df

print('\nTraining dataset size: {}'.format(train_data.shape))
print('Test dataset size: {}'.format(test_data.shape))

print('\nMost common transitions (first 10): {}'.format(list(predictor.transition_modes.items())[:10] ) )

print('\nAverage transition times (seconds) (first 10): {}'.format(list(predictor.transition_time_averages.items())[:10]))

predictor.predict(test_data)

prediction = predictor.prediction

print('\nTransition accuracy: {:.2f}%'.format(transition_accuracy(test_data, predictor.prediction) * 100))
print('\nTimestamp MAE: {}'.format(pd.to_timedelta(time_delta_mse(test_data, predictor.prediction), unit='s')))
print('\nTime spent on fitting: {}'.format(time_fit) + ' seconds')
print('\nTime spent on predicting {}'.format(time.time()-time_old) + ' seconds')

print('\nWriting predictions to a csv file...')
test_data['event prediction'] = predictor.prediction['event']
test_data['timestamp prediction'] = predictor.prediction['timestamp']
test_data.to_csv('first_predictor_output.csv')
print('\nDone.')


Training dataset size: (291962, 4)
Test dataset size: (150166, 6)

Most common transitions (first 10): [('A_Create Application', 'A_Submitted'), ('A_Submitted', 'W_Handle leads'), ('W_Handle leads', 'W_Handle leads'), ('W_Complete application', 'W_Complete application'), ('A_Concept', 'W_Complete application'), ('A_Accepted', 'O_Create Offer'), ('O_Create Offer', 'O_Created'), ('O_Created', 'O_Sent (mail and online)'), ('O_Sent (mail and online)', 'W_Complete application'), ('W_Call after offers', 'W_Call after offers')]

Average transition times (seconds) (first 10): [('A_Accepted#O_Create Offer', 505.54787655355437), ('A_Accepted#W_Complete application', 15631.848666666679), ('A_Cancelled#O_Cancelled', 0.058121951219512517), ('A_Cancelled#W_Call after offers', 0.073193548387096782), ('A_Cancelled#W_Call incomplete files', 0.097428571428571448), ('A_Cancelled#W_Complete application', 0.0315), ('A_Cancelled#W_Validate application', 0.10700000000000001), ('A_Complete#A_Cancelled', 890.