In [1]:
import pandas as pd

In [2]:
# Directory holding every CSV this notebook reads (trailing slash included,
# because the file paths below are built by plain concatenation).
location = 'data/'
featuresFile, labelsFile, featuresPredictFile, targetFile = (
    location + fileName
    for fileName in (
        'features.csv',
        'labels.csv',
        'featuresPredict.csv',
        'target_competencia_ids.csv',
    )
)

In [41]:
# Column -> dtype schema for the feature CSVs: one string id column followed by
# the same six float statistics for each of the four event sources.
dtypeFeatures = dict(
    [('device_id', 'str')]
    + [
        (source + statistic, 'float64')
        for source in ('events', 'installs', 'clicks', 'auctions')
        for statistic in (
            'Count',
            'MostFreqDay',
            'MeanInterval',
            'AproxFreq',
            'StdDevInterval',
            'Last',
        )
    ]
)
# Schema for the labels CSV: id plus the two float targets 'st' and 'sc'.
dtypeLabels = dict(device_id='str', st='float64', sc='float64')
# Schema for the target-ids CSV.
dtypeTarget = dict(ref_hash='str', obj='float64')

In [12]:
# Load the training inputs, restricting each CSV to the columns named in its
# schema and parsing them with the declared dtypes.
features = pd.read_csv(
    featuresFile,
    usecols=list(dtypeFeatures),
    dtype=dtypeFeatures,
)
labels = pd.read_csv(
    labelsFile,
    usecols=list(dtypeLabels),
    dtype=dtypeLabels,
)

In [13]:
# Order the labels by device id and renumber the rows from 0, so they can be
# compared positionally against the features frame.
labels = (
    labels
    .sort_values(by='device_id')
    .reset_index(drop=True)
)

In [14]:
# Order the features by device id and renumber the rows from 0, mirroring the
# ordering applied to the labels frame.
features = (
    features
    .sort_values(by='device_id')
    .reset_index(drop=True)
)

In [17]:
# Row-by-row id comparison between the two sorted frames; a count made up only
# of True means the frames are aligned position by position.
sameDeviceMask = features['device_id'] == labels['device_id']
sameDeviceMask.value_counts()

True    1734598
Name: device_id, dtype: int64

In [18]:
# The id column is no longer needed once row alignment has been checked.
labels = labels.drop(['device_id'], axis='columns')

In [19]:
# Keep only the numeric feature columns; the id is not a model input.
features = features.drop(['device_id'], axis='columns')

In [20]:
# Event indicator for 'st': True where a value is present, False where it is
# missing. Must be computed before the missing values are filled in.
labels['st_uncensored'] = ~labels['st'].isnull()

In [22]:
# Event indicator for 'sc': True where a value is present, False where it is
# missing. Must be computed before the missing values are filled in.
labels['sc_uncensored'] = ~labels['sc'].isnull()

In [23]:
# Missing label values are replaced with the length of a three-day window,
# expressed in seconds (259200).
secondsInThreeDays = 3 * 86400
labels = labels.fillna(value=secondsInThreeDays)

In [32]:
# Missing feature values are treated as zero.
features = features.fillna(value=0)

In [35]:
# How many rows have an observed (True) versus missing (False) 'st' value.
labels.loc[:, 'st_uncensored'].value_counts()

True     1006611
False     727987
Name: st_uncensored, dtype: int64

In [36]:
# How many rows have an observed (True) versus missing (False) 'sc' value.
labels.loc[:, 'sc_uncensored'].value_counts()

False    1509053
True      225545
Name: sc_uncensored, dtype: int64

In [38]:
from sksurv.linear_model import CoxPHSurvivalAnalysis
# Two independent Cox proportional-hazards models, one per target ('st', 'sc'),
# both logging optimizer progress while fitting.
estimatorSt, estimatorSc = (
    CoxPHSurvivalAnalysis(verbose=True) for _ in range(2)
)

In [39]:
from sksurv.util import Surv as util
# Build the (event indicator, time) structured arrays expected by the
# survival estimators, one per target.
saLabelsSt = util.from_dataframe(event='st_uncensored', time='st', data=labels)
saLabelsSc = util.from_dataframe(event='sc_uncensored', time='sc', data=labels)

In [40]:
# Fit one Cox model per target on the same feature matrix.
estimatorSt.fit(X=features, y=saLabelsSt)
# Kept as the cell's final expression so the fitted estimator is echoed.
estimatorSc.fit(X=features, y=saLabelsSc)

  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)


iter     10: optimization converged


  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)


iter      6: optimization converged


CoxPHSurvivalAnalysis(alpha=0, n_iter=100, tol=1e-09, verbose=True)

In [42]:
import numpy as np

In [43]:
def quantileOfStepSurvivalFunction (t, p, t0):
    """Evaluate a step function, given as breakpoints and values, at ``t0``.

    Parameters
    ----------
    t : array-like, sorted ascending
        Breakpoints of the step function (``np.searchsorted`` requires the
        ordering).
    p : indexable
        Value taken by the function from ``t[k]`` onwards.
    t0 : scalar
        Point at which to evaluate.

    Returns
    -------
    ``p[k]`` for the last ``k`` with ``t[k] <= t0``; when ``t0`` lies before
    the first breakpoint, ``p[0]`` is returned.
    """
    # Index of the last breakpoint that is <= t0 (-1 when there is none).
    lastReached = np.searchsorted(t, t0, side='right') - 1
    return p[max(lastReached, 0)]

In [None]:
# For every training row, evaluate that row's own predicted 'st' survival curve
# at the row's 'st' label. The resulting array is the regression target used
# when fitting quantileEstimatorSt.
#
# Fixes over the previous version of this cell:
#   * results were assigned to a stray name (`quantiles`), so `quantilesSt`
#     stayed empty;
#   * rows are now passed as single-row 2-D frames (`iloc[[pos]]`) rather than
#     1-D Series, matching the layout the estimator was fitted on;
#   * values are collected in a list and converted once, instead of calling
#     np.append (which copies the whole array) on every iteration.
# NOTE(review): one predict call per row is slow on large frames — consider
# batching if this becomes a bottleneck.
quantilesSt = []
for pos in range(len(features)):
    pred_surv = estimatorSt.predict_survival_function(features.iloc[[pos]])
    quantilesSt.append(
        quantileOfStepSurvivalFunction(
            pred_surv[0].x, pred_surv[0].y, labels['st'].iloc[pos]
        )
    )
quantilesSt = np.asarray(quantilesSt, dtype=float)

In [None]:
# For every training row, evaluate that row's own predicted 'sc' survival curve
# at the row's 'sc' label. The resulting array is the regression target used
# when fitting quantileEstimatorSc.
#
# Fixes over the previous version of this cell:
#   * results were assigned to `quantilesSt`, which left `quantilesSc` empty
#     AND overwrote the 'st' targets computed earlier;
#   * rows are now passed as single-row 2-D frames (`iloc[[pos]]`) rather than
#     1-D Series, matching the layout the estimator was fitted on;
#   * values are collected in a list and converted once, instead of calling
#     np.append (which copies the whole array) on every iteration.
# NOTE(review): one predict call per row is slow on large frames — consider
# batching if this becomes a bottleneck.
quantilesSc = []
for pos in range(len(features)):
    pred_surv = estimatorSc.predict_survival_function(features.iloc[[pos]])
    quantilesSc.append(
        quantileOfStepSurvivalFunction(
            pred_surv[0].x, pred_surv[0].y, labels['sc'].iloc[pos]
        )
    )
quantilesSc = np.asarray(quantilesSc, dtype=float)

In [None]:
from sklearn.linear_model import Ridge
# One ridge regressor per target, both with library-default settings; they
# map features to the survival-curve value computed for each training row.
quantileEstimatorSt, quantileEstimatorSc = Ridge(), Ridge()

In [None]:
# Regress the per-row survival-curve values on the training features.
quantileEstimatorSt.fit(X=features, y=quantilesSt)
# Kept as the cell's final expression so the fitted estimator is echoed.
quantileEstimatorSc.fit(X=features, y=quantilesSc)

In [None]:
# Prediction-time features: same columns and dtypes as the training features.
featuresPredict = pd.read_csv(featuresPredictFile, usecols=dtypeFeatures.keys(), dtype=dtypeFeatures)
# The ids to predict for live in their own file with their own schema
# ('ref_hash', 'obj'). Reading the features file here, as this cell used to,
# left `target` without the 'ref_hash' column that the following cells need.
target = pd.read_csv(targetFile, usecols=dtypeTarget.keys(), dtype=dtypeTarget)

In [None]:
# The device id is the ref_hash with its last three characters removed.
# NOTE(review): presumably those characters are a per-target suffix — confirm
# against the target file.
target['device_id'] = target['ref_hash'].str.slice(stop=-3)

In [None]:
# ref_hash has served its purpose once device_id has been derived from it.
target = target.drop(['ref_hash'], axis='columns')

In [None]:
# Keep one row per device. drop_duplicates returns a new frame, so the result
# has to be assigned back; the bare call only displayed the de-duplicated frame
# and left `target` (and therefore the later merge) with repeated device ids.
target = target.drop_duplicates('device_id')

In [None]:
# Restrict the prediction features to the devices listed in `target`.
# The join key is spelled out (a bare merge joins on every column the two
# frames happen to share), and only the key column is taken from `target`, so
# no non-feature column ends up in the matrix later passed to predict().
featuresPredict = featuresPredict.merge(target[['device_id']], on='device_id', how='inner')

In [None]:
# Same missing-value treatment as the training features: fill with zero.
featuresPredict = featuresPredict.fillna(value=0)

In [None]:
# Both ridge models score the same matrix: every column except the device id.
predictMatrix = featuresPredict.drop('device_id', axis=1)
predictedQuantilesSt = quantileEstimatorSt.predict(predictMatrix)
predictedQuantilesSc = quantileEstimatorSc.predict(predictMatrix)

In [None]:
def preimageOfStepSurvivalFunction (t, p, p0):
    """Find a time at which a non-increasing step function drops below ``p0``.

    Parameters
    ----------
    t : numpy.ndarray
        Breakpoints of the step function.
    p : numpy.ndarray
        Function values at the breakpoints; expected to be non-increasing,
        since the array is reversed before the binary search.
    p0 : scalar
        Level whose crossing point is wanted.

    Returns
    -------
    ``t[-1]`` when no value of ``p`` is below ``p0``; ``t[0]`` when even the
    first value is below ``p0``; otherwise the midpoint between the last
    breakpoint whose value is >= ``p0`` and the first whose value is < ``p0``.
    """
    ascendingValues = p[::-1]
    # Position in p of the first value strictly below p0.
    firstBelow = p.size - np.searchsorted(ascendingValues, p0, side="left")
    if firstBelow >= t.size:
        return t[-1]
    if firstBelow <= 0:
        return t[0]
    return (t[firstBelow] + t[firstBelow - 1]) / 2

In [None]:
# Turn each predicted 'st' quantile back into a time by inverting that row's
# predicted survival curve.
#
# Fixes over the previous version of this cell:
#   * the inverted value was computed and thrown away, so `predictedValuesSt`
#     stayed empty;
#   * rows still carried the 'device_id' column, which the Cox model was not
#     fitted on, and were passed as 1-D Series instead of single-row frames;
#   * the hand-maintained counter is replaced by the loop position.
predictMatrixSt = featuresPredict.drop('device_id', axis=1)
predictedValuesSt = []
for j in range(len(predictMatrixSt)):
    pred_surv = estimatorSt.predict_survival_function(predictMatrixSt.iloc[[j]])
    predictedValuesSt.append(
        preimageOfStepSurvivalFunction(
            pred_surv[0].x, pred_surv[0].y, predictedQuantilesSt[j]
        )
    )
predictedValuesSt = np.asarray(predictedValuesSt, dtype=float)

In [None]:
# Turn each predicted 'sc' quantile back into a time by inverting that row's
# predicted survival curve.
#
# Fixes over the previous version of this cell:
#   * the inverted value was computed and thrown away, so `predictedValuesSc`
#     stayed empty;
#   * rows still carried the 'device_id' column, which the Cox model was not
#     fitted on, and were passed as 1-D Series instead of single-row frames;
#   * the hand-maintained counter is replaced by the loop position.
predictMatrixSc = featuresPredict.drop('device_id', axis=1)
predictedValuesSc = []
for j in range(len(predictMatrixSc)):
    pred_surv = estimatorSc.predict_survival_function(predictMatrixSc.iloc[[j]])
    predictedValuesSc.append(
        preimageOfStepSurvivalFunction(
            pred_surv[0].x, pred_surv[0].y, predictedQuantilesSc[j]
        )
    )
predictedValuesSc = np.asarray(predictedValuesSc, dtype=float)