In [1]:
import pandas as pd

In [2]:
location = 'data/'
featuresFile = location + 'features.csv'
labelsFile = location + 'labels.csv'
featuresPredictFile = location + 'featuresPredict.csv'
targetFile = location + 'target_competencia_ids.csv'

In [3]:
dtypeFeatures = {
    'device_id': 'str',
    'eventsCount': 'float64',
    'eventsMostFreqDay': 'float64',
    'eventsMeanInterval': 'float64',
    'eventsAproxFreq': 'float64',
    'eventsStdDevInterval': 'float64',
    'eventsLast': 'float64',
    'installsCount': 'float64',
    'installsMostFreqDay': 'float64',
    'installsMeanInterval': 'float64',
    'installsAproxFreq': 'float64',
    'installsStdDevInterval': 'float64',
    'installsLast': 'float64',
    'clicksCount': 'float64',
    'clicksMostFreqDay': 'float64',
    'clicksMeanInterval': 'float64',
    'clicksAproxFreq': 'float64',
    'clicksStdDevInterval': 'float64',
    'clicksLast': 'float64',
    'auctionsCount': 'float64',
    'auctionsMostFreqDay': 'float64',
    'auctionsMeanInterval': 'float64',
    'auctionsAproxFreq': 'float64',
    'auctionsStdDevInterval': 'float64',
    'auctionsLast': 'float64'
}
dtypeLabels = {
    'device_id': 'str',
    'st': 'float64',
    'sc': 'float64'
}
dtypeTarget = {
    'ref_hash': 'str',
    'obj': 'float64',
}

In [4]:
features = pd.read_csv(featuresFile, usecols=dtypeFeatures.keys(), dtype=dtypeFeatures)
labels = pd.read_csv(labelsFile, usecols=dtypeLabels.keys(), dtype=dtypeLabels)

In [5]:
labels = labels.sort_values(by='device_id').reset_index(drop=True)

In [6]:
features = features.sort_values(by='device_id').reset_index(drop=True)

In [7]:
(features['device_id'] == labels['device_id']).value_counts()

True    1734598
Name: device_id, dtype: int64

In [8]:
labels = labels.drop('device_id', axis=1)

In [9]:
features = features.drop('device_id', axis=1)

In [10]:
labels['st_uncensored'] = labels['st'].notnull()

In [11]:
labels['sc_uncensored'] = labels['sc'].notnull()

In [12]:
secondsInThreeDays = 3 * 24 * 60 * 60
labels = labels.fillna(secondsInThreeDays)

In [13]:
features = features.fillna(0)

In [14]:
labels['st_uncensored'].value_counts()

True     1006611
False     727987
Name: st_uncensored, dtype: int64

In [15]:
labels['sc_uncensored'].value_counts()

False    1509053
True      225545
Name: sc_uncensored, dtype: int64

In [16]:
from sksurv.linear_model import CoxPHSurvivalAnalysis
estimatorSt = CoxPHSurvivalAnalysis(verbose=True)
estimatorSc = CoxPHSurvivalAnalysis(verbose=True)

In [17]:
from sksurv.util import Surv as util
saLabelsSt = util.from_dataframe('st_uncensored', 'st', labels)
saLabelsSc = util.from_dataframe('sc_uncensored', 'sc', labels)

In [18]:
n = 10000

In [19]:
estimatorSt.fit(features.iloc[:n], saLabelsSt[:n])
estimatorSc.fit(features[:n], saLabelsSc[:n])

  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)


iter     11: optimization converged


  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)
  overwrite_a=False, overwrite_b=False, check_finite=False)


iter     12: optimization converged


  overwrite_a=False, overwrite_b=False, check_finite=False)


CoxPHSurvivalAnalysis(alpha=0, n_iter=100, tol=1e-09, verbose=True)

In [20]:
import numpy as np

In [21]:
def quantileOfStepSurvivalFunction (t, p, t0):
    i = np.searchsorted(t, t0, side='right')
    if i - 1 < 0: return p[0]
    else: return p[i-1]

In [26]:
quantilesSt = np.array([])
for i, x in features[:n].iterrows():
    pred_surv = estimatorSt.predict_survival_function(x)
    newQuantile = quantileOfStepSurvivalFunction(pred_surv[0].x, pred_surv[0].y, labels.loc[i]['st'])
    quantilesSt = np.append(quantilesSt, newQuantile)

In [27]:
quantilesSc = np.array([])
for i, x in features[:n].iterrows():
    pred_surv = estimatorSc.predict_survival_function(x)
    newQuantile = quantileOfStepSurvivalFunction(pred_surv[0].x, pred_surv[0].y, labels.loc[i]['sc'])
    quantilesSc = np.append(quantilesSc, newQuantile)

In [35]:
from sklearn.linear_model import Ridge
quantileEstimatorSt = Ridge()
quantileEstimatorSc = Ridge()

In [37]:
quantileEstimatorSt.fit(features[:n], quantilesSt)
quantileEstimatorSc.fit(features[:n], quantilesSc)

Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [57]:
featuresPredict = pd.read_csv(featuresPredictFile, usecols=dtypeFeatures.keys(), dtype=dtypeFeatures)
target = pd.read_csv(targetFile, usecols=dtypeTarget.keys(), dtype=dtypeTarget)

In [58]:
target['device_id'] = target['ref_hash'].str[:-3]

In [59]:
target = target.drop('ref_hash', axis=1)

In [60]:
target = target.drop_duplicates('device_id')

In [61]:
featuresPredict = featuresPredict.merge(target[['device_id']], how='right')

In [62]:
featuresPredict = featuresPredict.fillna(0)

In [63]:
predictedQuantilesSt = quantileEstimatorSt.predict(featuresPredict.drop('device_id', axis=1))
predictedQuantilesSc = quantileEstimatorSc.predict(featuresPredict.drop('device_id', axis=1))

In [64]:
def preimageOfStepSurvivalFunction (t, p, p0):
    i = p.size - np.searchsorted(p[::-1], p0, side = "left")
    if i >= t.size: return t[t.size - 1]
    elif i - 1 < 0: return t[0]
    else: return (t[i] + t[i-1]) / 2

In [74]:
j = 0
predictedValuesSt = np.array([])
for i, x in featuresPredict.drop('device_id', axis=1).iterrows():
    pred_surv = estimatorSt.predict_survival_function(x)
    newSt = preimageOfStepSurvivalFunction(pred_surv[0].x, pred_surv[0].y, predictedQuantilesSt[j])
    predictedValuesSt = np.append(predictedValuesSt, newSt)
    j += 1

In [75]:
j = 0
predictedValuesSc = np.array([])
for i, x in featuresPredict.drop('device_id', axis=1).iterrows():
    pred_surv = estimatorSc.predict_survival_function(x)
    newSc = preimageOfStepSurvivalFunction(pred_surv[0].x, pred_surv[0].y, predictedQuantilesSc[j])
    predictedValuesSc = np.append(predictedValuesSc, newSc)
    j += 1

In [97]:
st = pd.DataFrame({'obj': predictedValuesSt, 'ref_hash': target['device_id']})

In [98]:
sc = pd.DataFrame({'obj': predictedValuesSc,'ref_hash': target['device_id']})

In [101]:
st['ref_hash'] = st['ref_hash'] + '_st'
sc['ref_hash'] = sc['ref_hash'] + '_sc'

In [108]:
result = pd.concat([st, sc], sort=True)

In [110]:
result[['ref_hash', 'obj']]

Unnamed: 0,ref_hash,obj
0,1000169251625791246_st,20708.697529
2,1000395625957344683_st,46660.795206
4,1003027494996471685_st,236821.004360
6,1006670001679961544_st,20674.976981
8,1007573308966476713_st,233583.495290
10,1010070503877148763_st,65122.382645
12,1010265377387765028_st,11192.108893
14,1010531372912327058_st,15041.238008
16,1011610998357271358_st,89257.387900
18,1013543838965040946_st,233647.932224


In [111]:
result.to_csv('data/result.csv', index=False)