In [2]:
import itertools 
import numpy as np

In [21]:
def get_mean_durations(bits):
    durations_ones = [sum(g) for b, g in itertools.groupby(bits) if b]
    durations_zeros = [sum(g) for b, g in itertools.groupby(1-bits) if b]
    print("durations of ones:",durations_ones)
    print("durations of zeros:",durations_zeros)
    
    if len(durations_ones) == 0:
        print("len(durations_ones_bX) = 0 (i.e. only zeros)")
        mean_durations_ones = 0
    else:
        mean_durations_ones = np.mean(durations_ones)
    
    if len(durations_zeros) == 0:
        print("len(durations_zeros_bX) = 0 (i.e. only ones)")
        mean_durations_zeros = 0
    else:
        mean_durations_zeros = np.mean(durations_zeros)        
    
    return mean_durations_ones,mean_durations_zeros,

#Example
bits = np.array([0,0,0,0,0,1,1,1,0,1,0,0,1,1,1,1,0])
print("Series observed:",bits)
print("Mean durations of series:",get_mean_durations(bits)) 

Series observed: [0 0 0 0 0 1 1 1 0 1 0 0 1 1 1 1 0]
durations of ones: [3, 1, 4]
durations of zeros: [5, 1, 2, 1]
Mean durations of series: (2.6666666666666665, 2.25)


In [38]:
def survival_probability(bits):
    if len(bits) == 1:
        print("Transition from 0 : NaN")
        print("Transition from 1 : NaN")
        return 'NaN','NaN'
    
    elif len(np.unique(bits)) == 1: #bits is a constant series
        if np.unique(bits) == np.array([0]): #bits is a constant series, equal to 0
            print("Transition from 0 :",1)
            print("Transition from 1 : NaN")
            return 1,'NaN'

        elif np.unique(bits) == np.array([1]): #bits is a constant series, equal to 1
            print("Transition from 0 : NaN")
            print("Transition from 1 :",1)
            return 'NaN',1
    
    #Now, case where 'everything is fine': there are 0 and 1
    diff = bits[1:] - bits[:-1]
    mult = bits[:-1] * bits[1:]
    transitions_all = diff + 2*mult
    
    count_0 = list(transitions_all).count(0) #transition 0->0
    count_1 = list(transitions_all).count(1) #transition 0->1
    count_2 = list(transitions_all).count(2) #transition 1->1
    count_minus1 = list(transitions_all).count(-1) #transition 1->0
    
    survival_probability_0 = count_0 / (count_0 + count_1) #proba, given x_t = 0, of having x_t+1 = 0
    survival_probability_1 = count_2 / (count_2 + count_minus1) #proba, given x_t = 1, of having x_t+1 = 1
    
    print("Transition from 0 :",survival_probability_0)
    print("Transition from 1 :",survival_probability_1)
    
    return survival_probability_0,survival_probability_1


#Example
print('-------------------Example 1 (normal case)----------------')
a_test = np.array([0,0,0,0,0,0,1,0,0,0,0,0,1,1,0]) 
survival_probability(a_test)

#Example 2: constant series
print('-------------------Example 2------------------------------')
a_test = np.array([1,1,1,1,1])
survival_probability(a_test)

#Example 3: constant series
print('-------------------Example 3------------------------------')
a_test = np.array([1])
survival_probability(a_test)

-------------------Example 1 (normal case)----------------
Transition from 0 : 0.8181818181818182
Transition from 1 : 0.3333333333333333
-------------------Example 2------------------------------
Transition from 0 : NaN
Transition from 1 : 1
-------------------Example 3------------------------------
Transition from 0 : NaN
Transition from 1 : NaN


('NaN', 'NaN')

In [23]:
#Explanation of the first 3 lines of the function survival_probability

#Simple temporal model: Markov Chain
list_test = np.array([0,1,1,0,0,1,1,1,1,1])
transitions = list_test[1:] - list_test[:-1]
print(transitions)
stays_one = list_test[1:] * list_test[:-1]
print(stays_one)
transitions_all = transitions + 2 * stays_one
print(transitions_all)

[ 1  0 -1  0  1  0  0  0  0]
[0 1 0 0 0 1 1 1 1]
[ 1  2 -1  0  1  2  2  2  2]


# Application to our problem

In [17]:
import pandas as pd
train = pd.read_csv('./data/train.csv')

In [39]:
for user, group in train.sort_values('ts_listen').groupby(['user_id']):
    #the variable "group" is all the songs that the user listened to
    print("user",user)
#     print(group)
    binary_list = group.sort_values('ts_listen')['is_listened'].values
    print("avg_user_listened =",np.mean(binary_list))
    survival_probability(binary_list)

user 0
avg_user_listened = 0.981478549945
Transition from 0 : 0.37606837606837606
Transition from 1 : 0.9882239070817874
user 1
avg_user_listened = 0.987980769231
Transition from 0 : 0.36
Transition from 1 : 0.9922128487994809
user 2
avg_user_listened = 0.960553856062
Transition from 0 : 0.42857142857142855
Transition from 1 : 0.9765297569153395
user 3
avg_user_listened = 0.959147106253
Transition from 0 : 0.5087719298245614
Transition from 1 : 0.9790732436472347
user 4
avg_user_listened = 0.236454109842
Transition from 0 : 0.7909727250784455
Transition from 1 : 0.3252730109204368
user 5
avg_user_listened = 0.988703703704
Transition from 0 : 0.25
Transition from 1 : 0.991384154336018
user 6
avg_user_listened = 0.998703223416
Transition from 0 : 0.14285714285714285
Transition from 1 : 0.9988868274582561
user 7
avg_user_listened = 0.164789001337
Transition from 0 : 0.8447645176040238
Transition from 1 : 0.21345707656612528
user 8
avg_user_listened = 0.978399228544
Transition from 0 : 0.3

ZeroDivisionError: division by zero

TODO: 
- Ici, on applique à l'ensemble des chansons d'un utilisateur. Il faudrait regarder sur les séries (écoute sans interruption ou presque: 1h max de pause). En effet, il n'y a à priori pas de relation entre le is_listened de la dernière chanson d'une série, et le is_listened de la première chanson de la prochaine série.