In [1]:
import pandas as pd

In [2]:
dtype = {
    'device_id': 'int64',
    'auctionsCount': 'float64',
    'auctionsMostFreqDay': 'float64',
    'auctionsStdDevInterval': 'float64',
    'auctionsAproxFreq': 'float64',
    'auctionsLast': 'float64'
}

In [3]:
featuresAuctions = pd.read_csv('data/auctionsFeaturesFirstThreeDays.csv', usecols=dtype.keys(), dtype=dtype)

In [4]:
dtype = {
    'device_id': 'int64',
    'eventsCount': 'float64',
    'eventsMostFreqDay': 'float64',
    'eventsStdDevInterval': 'float64',
    'eventsAproxFreq': 'float64',
    'eventsLast': 'float64'
}

In [5]:
featuresEvents = pd.read_csv('data/eventsFeaturesFirstThreeDays.csv', usecols=dtype.keys(), dtype=dtype)

In [6]:
features = pd.merge(featuresAuctions, featuresEvents, how='outer')

In [7]:
dtype = {
    'device_id': 'int64',
    'st': 'float64'
}

In [8]:
labels = pd.read_csv('data/labelsFirstThreeDays.csv', usecols=dtype.keys(), dtype=dtype)

In [9]:
labels = labels.merge(features[['device_id']])

El id del device no me interesa, pero las filas labels y features deben coincidir para un dispositivo. Por eso ordeno por dispositivo y luego tiro los id.

In [10]:
labels = labels.sort_values(by='device_id').reset_index(drop=True)

In [11]:
features = features.sort_values(by='device_id').reset_index(drop=True)

In [12]:
labels = labels.drop('device_id', axis=1)

In [13]:
features = features.drop('device_id', axis=1)

In [14]:
labels['uncensored'] = labels['st'].notnull()

In [15]:
secondsInThreeDays = 3 * 24 * 60 * 60
labels['st'] = labels['st'].fillna(secondsInThreeDays)

In [16]:
features = features.fillna(0)

In [17]:
labels.head()

Unnamed: 0,st,uncensored
0,259200.0,False
1,259200.0,False
2,259200.0,False
3,259200.0,False
4,88.851558,True


In [18]:
labels['uncensored'].value_counts()

True     247656
False    159413
Name: uncensored, dtype: int64

In [19]:
features.head()

Unnamed: 0,auctionsCount,auctionsMostFreqDay,auctionsAproxFreq,auctionsStdDevInterval,auctionsLast,eventsCount,eventsMostFreqDay,eventsAproxFreq,eventsStdDevInterval,eventsLast
0,35.0,3.0,0.000185,26777.818183,189019.92075,88.0,3.0,0.000457,12501.025059,192746.512
1,8.0,3.0,4.2e-05,66376.255608,188652.769083,0.0,0.0,0.0,0.0,0.0
2,6.0,1.0,0.000572,4222.014848,10489.540987,8.0,2.0,5.4e-05,52213.999252,148055.839
3,1.0,3.0,4e-06,0.0,256765.420614,0.0,0.0,0.0,0.0,0.0
4,7.0,3.0,2.7e-05,46058.714104,257677.264901,13.0,1.0,0.000215,16645.82668,60350.055


In [20]:
n = 10000

In [21]:
labels = labels[:2*n]

In [22]:
features = features[:2*n]

In [23]:
labelsTrain = labels.iloc[:n]

In [24]:
labelsTest = labels.iloc[n:2*n]

In [25]:
featuresTrain = features.iloc[:n]

In [26]:
featuresTest = features.iloc[n:2*n]

In [27]:
from sksurv.linear_model import CoxPHSurvivalAnalysis
estimator = CoxPHSurvivalAnalysis(verbose=True)

In [28]:
from sksurv.util import Surv
util = Surv()
saLabels = util.from_dataframe('uncensored', 'st', labels)
saLabelsTrain = util.from_dataframe('uncensored', 'st', labelsTrain)
saLabelsTest = util.from_dataframe('uncensored', 'st', labelsTest)

In [29]:
estimator.fit(features, saLabels)

  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)


iter     10: optimization converged


  overwrite_a=False, overwrite_b=False, check_finite=False)


CoxPHSurvivalAnalysis(alpha=0, n_iter=100, tol=1e-09, verbose=True)

In [30]:
estimator.score(featuresTest, saLabelsTest)

0.7316257827642384

In [31]:
import numpy as np

In [32]:
def quantileOfStepSurvivalFunction (t, p, t0):
    i = np.searchsorted(t, t0, side='right')
    if i - 1 < 0: return p[0]
    else: return p[i-1]

In [33]:
quantiles = np.array([])
for i, x in features.iterrows():
    pred_surv = estimator.predict_survival_function(x)
    newQuantile = quantileOfStepSurvivalFunction(pred_surv[0].x, pred_surv[0].y, labels.loc[i]['st'])
    quantiles = np.append(quantiles, newQuantile)

In [34]:
quantilesTrain = quantiles[:n]
quantilesTest = quantiles[n:2*n]

In [35]:
from sklearn.linear_model import Ridge

quantileEstimator = Ridge()
quantileEstimator.fit(featuresTrain, quantilesTrain)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [36]:
quantileEstimator.score(featuresTrain, quantilesTrain)

0.18632831169594932

In [37]:
quantileEstimator.score(featuresTest, quantilesTest)

0.19767767868149266

In [38]:
predictedQuantiles = quantileEstimator.predict(featuresTest)

In [39]:
def preimageOfStepSurvivalFunction (t, p, p0):
    i = p.size - np.searchsorted(p[::-1], p0, side = "left")
    if i >= t.size: return t[t.size - 1]
    elif i - 1 < 0: return t[0]
    else: return (t[i] + t[i-1]) / 2

In [40]:
ecm = 0
errs = np.array([])
j = 0
for i, x in featuresTest[:1000].iterrows():
    pred_surv = estimator.predict_survival_function(x)
    err = abs(preimageOfStepSurvivalFunction(pred_surv[0].x, pred_surv[0].y, predictedQuantiles[j]) - labelsTest.loc[i]['st'])
    errs = np.append(errs, err)
    ecm += err**2
    j += 1

ecm /= 1000

In [41]:
ecm

9207840487.837994

In [42]:
errs.mean() / 60 / 60

20.055673685508058

Los resultados son parecidos a las otras opciones. Igual, todavia se puede mejorar.