# Survival models

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import random

import statsmodels.duration.hazard_regression as sm

In [2]:
num_inc = 10
max_inc = 50
start = datetime.strptime('2018-01-01', "%Y-%M-%d")
end = datetime.strptime('2019-01-01', "%Y-%M-%d")

# Create data

In [102]:
# Random data
dates = [random.random() * (end - start) + start for _ in range(num_inc)]
types = ['one', 'two', 'three']
locs = ['loc1', 'loc2']
pers = ['id1', 'id2']
events = pd.DataFrame({'loc':random.choices(locs, k = num_inc),
                       'time':dates, 
                       'pers_id':random.choices(pers, k = num_inc), 
                       'type':random.choices(types, k = num_inc)})
events = events.sort_values(by=['pers_id', 'time']).reset_index(drop = True)
events['ev_id'] = range(1, num_inc+1)

In [103]:
# Calculate characteristics of previous events within each person
events['prev_event'] = events.groupby('pers_id').time.shift(1)

In [104]:
events

Unnamed: 0,loc,time,pers_id,type,ev_id,prev_event
0,loc2,2018-01-05 09:55:34.179694,id1,two,1,NaT
1,loc2,2018-03-16 10:01:24.163666,id1,one,2,2018-01-05 09:55:34.179694
2,loc1,2018-10-31 14:57:01.935528,id1,two,3,2018-03-16 10:01:24.163666
3,loc1,2018-12-21 21:08:04.562846,id1,three,4,2018-10-31 14:57:01.935528
4,loc2,2018-03-28 05:08:15.891421,id2,one,5,NaT
5,loc1,2018-04-27 06:40:30.204270,id2,three,6,2018-03-28 05:08:15.891421
6,loc2,2018-05-23 19:39:34.544640,id2,three,7,2018-04-27 06:40:30.204270
7,loc2,2018-08-17 01:55:27.863793,id2,one,8,2018-05-23 19:39:34.544640
8,loc2,2018-09-13 17:46:41.720826,id2,two,9,2018-08-17 01:55:27.863793
9,loc2,2018-11-12 18:18:28.388576,id2,one,10,2018-09-13 17:46:41.720826


In [105]:
# Running total by type
timed_totals = events.copy().sort_values(['pers_id', 'time'])
timed_totals['prev_count'] = timed_totals.groupby(['pers_id', 'type'])['type'].cumcount()
timed_totals = timed_totals.pivot_table(index = ['ev_id', 'time', 'pers_id'], 
                                        columns='type', values='prev_count')
timed_totals = timed_totals.reset_index().sort_values(['pers_id', 'time'])
timed_totals = timed_totals.groupby('pers_id').fillna(method = 'backfill')
timed_totals


type,ev_id,one,three,time,two
0,1,0.0,0.0,2018-01-05 09:55:34.179694,0.0
1,2,0.0,0.0,2018-03-16 10:01:24.163666,1.0
2,3,,0.0,2018-10-31 14:57:01.935528,1.0
3,4,,0.0,2018-12-21 21:08:04.562846,
4,5,0.0,0.0,2018-03-28 05:08:15.891421,0.0
5,6,1.0,0.0,2018-04-27 06:40:30.204270,0.0
6,7,1.0,1.0,2018-05-23 19:39:34.544640,0.0
7,8,1.0,,2018-08-17 01:55:27.863793,0.0
8,9,2.0,,2018-09-13 17:46:41.720826,0.0
9,10,2.0,,2018-11-12 18:18:28.388576,


In [106]:
# Rename columns
timed_totals.columns = ["old_" + s for s in timed_totals.columns.values]
timed_totals.rename(columns={'old_ev_id':'ev_id', 
                             'old_time':'time', 
                             'old_pers_id':'pers_id'}, inplace=True)

In [125]:
# Merge running counts back onto original list of incidents
events_tot = pd.merge(events, timed_totals, how = 'left', 
                      on = ['ev_id'])
# Grouped forward fill remaining NA's
old_types = ["old_" + s for s in types]
types_tot = events_tot.groupby("pers_id")[old_types].transform(lambda x: x.fillna(x.max())).fillna(0)
events_all = pd.concat([events, types_tot], axis = 1, ignore_index = True)
events_all.columns = np.concatenate((events.columns.values, old_types), axis = None)

In [126]:
events_all

Unnamed: 0,loc,time,pers_id,type,ev_id,prev_event,old_one,old_two,old_three
0,loc2,2018-01-05 09:55:34.179694,id1,two,1,NaT,0.0,0.0,0.0
1,loc2,2018-03-16 10:01:24.163666,id1,one,2,2018-01-05 09:55:34.179694,0.0,1.0,0.0
2,loc1,2018-10-31 14:57:01.935528,id1,two,3,2018-03-16 10:01:24.163666,0.0,1.0,0.0
3,loc1,2018-12-21 21:08:04.562846,id1,three,4,2018-10-31 14:57:01.935528,0.0,1.0,0.0
4,loc2,2018-03-28 05:08:15.891421,id2,one,5,NaT,0.0,0.0,0.0
5,loc1,2018-04-27 06:40:30.204270,id2,three,6,2018-03-28 05:08:15.891421,1.0,0.0,0.0
6,loc2,2018-05-23 19:39:34.544640,id2,three,7,2018-04-27 06:40:30.204270,1.0,0.0,1.0
7,loc2,2018-08-17 01:55:27.863793,id2,one,8,2018-05-23 19:39:34.544640,1.0,0.0,1.0
8,loc2,2018-09-13 17:46:41.720826,id2,two,9,2018-08-17 01:55:27.863793,2.0,0.0,1.0
9,loc2,2018-11-12 18:18:28.388576,id2,one,10,2018-09-13 17:46:41.720826,2.0,0.0,1.0
