In [2]:
import numpy as np
import pandas as pd
import multiprocessing as mt
from functools import partial
%matplotlib inline

In [3]:
%%time
def load_data(filepath='../data/', 
              files=['members.csv', 'train.csv', 'transactions.csv', 'user_logs.csv'],
              nrows=[None, None, None, None]):

    d = []
    for f, r in zip(files, nrows):
        d.append(pd.read_csv(filepath+f, nrows=r).set_index('msno'))
    return d

members, train, transactions, user_logs = load_data(nrows=[None, None, None, None])

CPU times: user 6min 41s, sys: 40.9 s, total: 7min 22s
Wall time: 7min 27s


In [4]:
print('Unique dates in user logs: {}'.format(user_logs.date.nunique()))
print('Dates go from {} to {}'.format(user_logs.date.min(), user_logs.date.max()))
# create date mask with all possible dates
all_dates = [str(d).replace('-','')[:8] for d in pd.date_range('20150101', '20170228')]
date_mask = pd.DataFrame({
    'date': all_dates
})

Unique dates in user logs: 790
Dates go from 20150101 to 20170228


In [5]:
user_logs = user_logs.join(train, how='left')

In [None]:
user_logs.groupby('is_churn').agg({
    'date': ['min', 'max']
})

Unnamed: 0_level_0,date,date
Unnamed: 0_level_1,min,max
is_churn,Unnamed: 1_level_2,Unnamed: 2_level_2
0.0,20150101,20170228
1.0,20150101,20170228


In [None]:
user_logs.reset_index()[user_logs['is_churn'] == 1].groupby('msno').max().hist()

In [None]:
user_logs.reset_index()[user_logs['is_churn'] == 0].groupby('msno').max().hist()

In [None]:
user_logs.reset_index()[user_logs['is_churn'] == 1].msno.value_counts().hist()

In [None]:
user_logs.reset_index()[user_logs['is_churn'] == 0].msno.value_counts().hist()

In [None]:
train.head()

In [None]:
user_logs.join(train)

In [111]:
%%time
def create_user_sequence(df, msno):
    try:
        df = df.loc[[msno], :].sort_values('date')
        date_mask[date_mask['date'].astype(int) < (df['date'].max() + 1)].merge(df, how='outer').fillna(0)
        return (msno, df)
    except KeyError as e:
        print(e)
        return (msno, df.iloc[:0])
    
exseqs = [create_user_sequence(user_logs, members.sample()['msno'].values[0]) for i in range(10)]

CPU times: user 2min 32s, sys: 1.29 s, total: 2min 33s
Wall time: 2min 43s


In [155]:
exseqs[0][1].tail(7)

Unnamed: 0_level_0,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
htiCma9DAE7iLKx4uHw7lBhdT7o8P97/a46El8JudJM=,20170215,0,0,0,0,19,19,4798.389
htiCma9DAE7iLKx4uHw7lBhdT7o8P97/a46El8JudJM=,20170216,61,9,1,1,74,120,19980.079
htiCma9DAE7iLKx4uHw7lBhdT7o8P97/a46El8JudJM=,20170220,1,0,0,0,19,20,4630.135
htiCma9DAE7iLKx4uHw7lBhdT7o8P97/a46El8JudJM=,20170221,3,0,0,2,3,7,1220.413
htiCma9DAE7iLKx4uHw7lBhdT7o8P97/a46El8JudJM=,20170222,0,0,0,0,1,1,244.976
htiCma9DAE7iLKx4uHw7lBhdT7o8P97/a46El8JudJM=,20170225,11,2,1,3,43,48,11280.16
htiCma9DAE7iLKx4uHw7lBhdT7o8P97/a46El8JudJM=,20170226,0,0,0,1,2,3,697.207


In [153]:
train.set_index('msno').loc['htiCma9DAE7iLKx4uHw7lBhdT7o8P97/a46El8JudJM=']

is_churn    0
Name: htiCma9DAE7iLKx4uHw7lBhdT7o8P97/a46El8JudJM=, dtype: int64

Unnamed: 0,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
0,20150101,0.0,0.0,0.0,0.0,0.0,0.0,0.000
1,20150102,0.0,0.0,0.0,0.0,0.0,0.0,0.000
2,20150103,0.0,0.0,0.0,0.0,0.0,0.0,0.000
3,20150104,0.0,0.0,0.0,0.0,0.0,0.0,0.000
4,20150105,0.0,0.0,0.0,0.0,0.0,0.0,0.000
5,20150106,0.0,0.0,0.0,0.0,0.0,0.0,0.000
6,20150107,0.0,0.0,0.0,0.0,0.0,0.0,0.000
7,20150108,0.0,0.0,0.0,0.0,0.0,0.0,0.000
8,20150109,0.0,0.0,0.0,0.0,0.0,0.0,0.000
9,20150110,0.0,0.0,0.0,0.0,0.0,0.0,0.000


In [103]:
user_logs.sample(10)

In [97]:
user_dates = user_logs.groupby('msno').agg({
    'date': ['min', 'max', 'nunique']
})
user_dates.sample(7)

KeyboardInterrupt: 

In [59]:
%%time
def create_user_sequence(df, msno):
    try:
        df = df.loc[[msno], :].sort_values('date')
        return (msno, df)
    except KeyError:
        return (msno, df.iloc[:0])

pool = mt.Pool(24)
user_sequences = pool.map(partial(create_user_sequence, user_logs.set_index('msno')), train['msno'].unique())
user_sequences = [us for us in user_sequences if us[1].shape[0] > 0]
pool.close()
pool.join()

CPU times: user 6min 5s, sys: 12.7 s, total: 6min 18s
Wall time: 25min 30s


In [60]:
print('Longest sequence length: {}'.format(np.max([df.shape[0] for msno, df in user_sequences])))
print('Average sequence length: {}'.format(np.mean([df.shape[0] for msno, df in user_sequences])))
print('Number of non-empty sequences: {}'.format(len(user_sequences)))

Longest sequence length: 16
Average sequence length: 6.33396524807
Number of non-empty sequences: 98642


In [61]:
msnos = pd.DataFrame({'msno': [us[0] for us in user_sequences]})
msnos = msnos.sample(msnos.shape[0])
target = train.merge(msnos).set_index('msno').loc[[msno for msno, df in user_sequences]]
target.head()

Unnamed: 0_level_0,is_churn
msno,Unnamed: 1_level_1
T0FF6lumjKcqEO0O+tUH2ytc+Kb9EkeaLzcVUiTr1aE=,1
I8dFN2EjFN1mt4Xel8WQX1/g7u6Dg4PBMHLkiDjhUS8=,1
BJfEs9V27SKREEiSEB94PdWU0c9kz5xpe2mEv09nTO0=,1
Q3zQXIS9cOKLdgyF1IIQkm4xEiF9cgIuK3dxTdXN6As=,1
ddBEjhJQYsrouP54GTueBgvsQPrd7frM1YDSp6zi8T0=,1


In [62]:
user_sequences[15][1]

Unnamed: 0_level_0,date,num_25,num_50,num_75,num_985,num_100,num_unq,total_secs
msno,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
I6fXAb89UexuHChfJNZPAicSWK2QurV0/S8rTX3CXqo=,20151108,1,0,0,0,4,4,1012.083
I6fXAb89UexuHChfJNZPAicSWK2QurV0/S8rTX3CXqo=,20160223,27,5,2,7,17,37,6150.869
I6fXAb89UexuHChfJNZPAicSWK2QurV0/S8rTX3CXqo=,20160507,9,1,4,1,12,16,3732.555
I6fXAb89UexuHChfJNZPAicSWK2QurV0/S8rTX3CXqo=,20160713,7,2,5,5,11,21,4752.361


In [None]:
time_range = pandas.date_range('2016-12-02T11:00:00.000Z', '2017-06-06', freq='D')

In [39]:
train_size = 3000
user_sequence_vals = [df.values.transpose() for msno, df in user_sequences]
train_features, train_targets = user_sequences[:train_size], user_sequences[:train_size]
test_features, test_targets = target[train_size:], target[train_size:]

In [40]:
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.layers import Embedding
from keras.layers import Conv1D, GlobalMaxPooling1D

# set parameters:
max_features = 5000
maxlen = 366
batch_size = 32
embedding_dims = 50
filters = 250
kernel_size = 3
hidden_dims = 250
epochs = 2

(x_train, y_train) = (train_features, train_targets)
(x_test, y_test) = (test_features, test_targets)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
model = Sequential()

# we start off with an efficient embedding layer which maps
# our vocab indices into embedding_dims dimensions
model.add(Embedding(max_features,
                    embedding_dims,
                    input_length=maxlen))
model.add(Dropout(0.2))

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1))
# we use max pooling:
model.add(GlobalMaxPooling1D())

# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
validation_data=(x_test, y_test))

(3000, 'train sequences')
(935, 'test sequences')
Pad sequences (samples x time)


ValueError: setting an array element with a sequence

In [18]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

In [22]:
len(x_train[0])

218