In [1]:
import os
import numpy as np
import scipy as sp
from scipy import stats
import pandas as pd
from tqdm import tqdm_notebook as tqdm
from gc import collect

In [2]:
def write_prediction_to_file(prediction):
    header = "id,nextvisit"
    with open("submission.csv", "w") as f:
        print(header, file=f)
        for idx, p in enumerate(prediction, 1):
            print(f"{idx}, {p}", file=f)

In [3]:
def visits2weekdays(visits):
    return np.apply_along_axis(lambda x: (x - 1) % 7 + 1, 0, visits)

def visits2history(visits, history_size):
    history = np.zeros(history_size, dtype=int)
    history[visits-1] = 1
    return history

In [33]:
with open('./train.csv', 'r') as f:
    data = pd.read_csv(f)
    data.visits = data.visits.apply(lambda x: (np.fromstring(x, dtype=int, sep=" ") - 1) % 7)

In [34]:
data.head()

Unnamed: 0,id,visits
0,1,"[1, 6, 6, 1, 3, 3, 3, 5, 5, 0, 4, 6, 6, 5, 4, ..."
1,2,"[2, 3, 4, 0, 3, 2, 4, 0, 2, 6, 5, 3, 4, 4, 2, ..."
2,3,"[1, 2, 4, 4, 4, 0, 6, 5, 5, 6, 4, 6, 5, 1, 6, ..."
3,4,"[0, 5, 0, 0, 4, 1, 0, 5, 1, 4, 6, 4, 5, 4, 1, ..."
4,5,"[5, 1, 6, 0, 4, 1, 1, 1, 6, 1, 1, 1, 1, 4, 1, ..."


In [None]:
DAYS_IN_WEEK = 7

def week_weights_computation(data, delta, alpha):
    result = np.zeros((data.shape[0],DAYS_IN_WEEK))
    for idx, row in enumerate(data):
        visits_num = float(len(row))
        week_weights = np.zeros(DAYS_IN_WEEK)
        for i, weekday in enumerate(row):
            w_1 = i / visits_num
            w_2 = np.log(i + 1) / np.log(visits_num)
            week_weights[weekday] += alpha * np.power(w_1) + (1 - alpha) * w_2
        result[idx, : ] = week_weights / np.sum(week_weights)
    return result


def probs_computation(week_weights):
    probs = [1.] + [1 - w for w in week_weights]
    for i in range(1, len(probs)):
        probs[i] *= probs[i-1]
    propbs = np.array([week_weights[i] * probs[i] for i in range(DAYS_IN_WEEK)])
    return probs

def likelyhood_computation(data, delta, alpha):
    client_weekday_weights = week_weights_computation(data, delta, alpha)
    result = np.zeros(data.shape[0])
    for idx, client_weights in enumerate(client_weekday_weights):
        result[idx] = np.argmax(probs_computation(client_weights))
    return result

In [38]:
np.argmax(np.array([[0,1,3, 7, 5, 4, 2,], [0,1,3, 7, 9, 4, 2,]]), axis=1) + 1

array([4, 5])

In [22]:
def count_person(person, delta, lin, log):
    week = [0]*7
    for i in range(len(person)):
        day = person[i]
        weight1 = i/float(len(person)) 
        weight2 = np.log(i + 1) / np.log(len(person))

        week[(day - 1) % 7] += lin * weight1**delta + log * weight2
    return week

def week_weights(data, delta, lin, log):
    return list(map(lambda person: count_person(person, delta, lin, log), data))

In [16]:
def normalize_week(week):
    total = sum(week)
    return map(lambda day: day/total, week)

def count_prob(week):
    shift = [1 - x for x in week]
    shift = [1.0] + shift
    for i in range(1, len(shift)):
            shift[i] *= shift[i-1]
    return [week[i]*shift[i] for i in range(len(week))]

def likelyhood(visits_daily):
    visits_freq = map(lambda week: normalize_week(week), visits_daily)
    visits_prob = map(lambda week: count_prob(week), visits_freq)
    result = map(lambda person: person.index(max(person)) + 1, visits_prob)
    return result

In [17]:
result = likelyhood(week_weights(data.visits, 0.9, 0.95, 0.05))

In [26]:
first_ten_clients = week_weights(data.visits[:10], 0.9, 0.95, 0.05)

In [28]:
list(normalize_week(first_ten_clients[0]))

[0.08054148451028138,
 0.25545732721161307,
 0.106346173799991,
 0.250504771069922,
 0.06189762154509355,
 0.1227867709169108,
 0.12246585094618824]

In [20]:
output = "id,nextvisit\n"
for i in range(300000):
    output += str(i + 1) + ", " + str(int(result[i])) + "\n"

TypeError: 'map' object is not subscriptable

In [23]:
week_weights(data.visits, 0.9, 0.95, 0.05)

KeyboardInterrupt: 

### LSTM in Keras

In [57]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import LSTM, Embedding
from keras.utils import to_categorical

# one_hot_labels = keras.utils.to_categorical(labels, num_classes=10)

In [102]:
train_x = [row[-25:-1] for row in train_df.weekdays] 
train_y = [row[-1] for row in train_df.weekdays]
train_x = np.stack(train_x, axis=0)-1
train_y = np.stack(train_y, axis=0)-1

In [83]:
train_x[0], train_y[0]

(array([1, 3, 3, 3, 5, 5, 0, 4, 6, 6, 5, 4, 1, 3, 1, 1, 2, 5, 3, 6, 3, 0,
        1, 2]), 3)

In [103]:
#train_x = to_categorical(train_x, num_classes=7)
train_y = to_categorical(train_y, num_classes=7)

In [104]:
model = Sequential()
model.add(Embedding(7, 16, input_length=24))
model.add(LSTM(100))
model.add(Dense(7, activation='softmax'))
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 24, 16)            112       
_________________________________________________________________
lstm_4 (LSTM)                (None, 100)               46800     
_________________________________________________________________
dense_4 (Dense)              (None, 7)                 707       
Total params: 47,619
Trainable params: 47,619
Non-trainable params: 0
_________________________________________________________________
None


In [105]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [106]:
model.fit(train_x, train_y, batch_size=64, epochs=500, verbose=2)

Epoch 1/500
 - 45s - loss: 1.8444 - acc: 0.2570
Epoch 2/500
 - 46s - loss: 1.8306 - acc: 0.2642
Epoch 3/500
 - 46s - loss: 1.8279 - acc: 0.2657
Epoch 4/500
 - 46s - loss: 1.8266 - acc: 0.2657
Epoch 5/500
 - 46s - loss: 1.8254 - acc: 0.2667
Epoch 6/500
 - 46s - loss: 1.8243 - acc: 0.2672
Epoch 7/500
 - 46s - loss: 1.8232 - acc: 0.2680
Epoch 8/500
 - 47s - loss: 1.8218 - acc: 0.2683
Epoch 9/500
 - 47s - loss: 1.8203 - acc: 0.2689
Epoch 10/500
 - 47s - loss: 1.8187 - acc: 0.2703
Epoch 11/500
 - 47s - loss: 1.8161 - acc: 0.2717
Epoch 12/500
 - 46s - loss: 1.8133 - acc: 0.2739
Epoch 13/500
 - 46s - loss: 1.8096 - acc: 0.2763
Epoch 14/500
 - 46s - loss: 1.8056 - acc: 0.2778
Epoch 15/500
 - 47s - loss: 1.8011 - acc: 0.2814
Epoch 16/500
 - 47s - loss: 1.7963 - acc: 0.2840
Epoch 17/500
 - 47s - loss: 1.7913 - acc: 0.2866
Epoch 18/500
 - 47s - loss: 1.7864 - acc: 0.2903
Epoch 19/500
 - 47s - loss: 1.7818 - acc: 0.2938
Epoch 20/500
 - 47s - loss: 1.7771 - acc: 0.2964
Epoch 21/500
 - 47s - loss: 1

KeyboardInterrupt: 

In [107]:
test_x = [row[-24:] for row in train_df.weekdays]
test_x = np.stack(test_x, axis=0)-1
test_y = np.array([row[-1] for row in train_df.weekdays])-1

pred = model.predict_classes(test_x ,verbose=2)

In [108]:
pred += 1

In [109]:
write_prediction_to_file(pred)

In [97]:
from sklearn.metrics import accuracy_score

accuracy_score(test_y, pred)

0.2736133333333333

In [90]:
test_x = np.stack(test_x, axis=0)
test_x.shape

(300000, 24)

In [None]:
stats.mode(train_df.weekdays[0])

In [None]:
DAYS_IN_WEEK = 7
WEEK_IN_MONTH = 4
DAYS_IN_MONTH = DAYS_IN_WEEK * WEEK_IN_MOTH
MONTH_IN_YEAR = 12