In [1]:
import ast
import math
import sqlite3

import numpy as np
import pandas as pd
from numpy import transpose
from numpy.linalg import inv,pinv, det
from scipy.stats import norm

# Columns of the clicks table that get converted to numeric after loading.
click_cols = (
    ['time']
    + ['user_{0}'.format(i) for i in range(1, 7)]
    + ['displayed', 'clicked']
)
# Print NumPy arrays with three digits of precision.
np.set_printoptions(precision=3)
# Connection to the local SQLite database file named 'yahoo'.
engine = sqlite3.connect('yahoo')

In [62]:
# Article feature vectors, keyed by the table's 'index' column.
articles_df = pd.read_sql_query('SELECT * FROM articles', con=engine)
articles_df = articles_df.set_index('index')

# Keep roughly 10% of the click log, chosen at random by SQLite, and only rows
# whose serialized article pool is shorter than 210 characters.
clicks_df = pd.read_sql_query(
    '''
    SELECT * FROM clicks WHERE (ABS(CAST((RANDOM()) as int)) % 100) < 10 AND length(article_pool) < 210
''',
    con=engine,
)

# Temporary workaround, this should be done before sql
clicks_df[click_cols] = clicks_df[click_cols].apply(pd.to_numeric)

In [63]:
# NOTE: a cell renders only its last expression, so the head() result below is
# discarded and this cell displays just the shape of the sampled click log.
clicks_df.head()
clicks_df.shape

(151154, 11)

In [None]:
#articles_df.head()
#clicks_df.shape
# clicks_df.head()

In [3]:
def eta(T):
    """
    Build the exploration cutoff schedule used by interval chaining.

    Entry i (0-based) holds (i + 1) ** (-1/3), so the schedule starts at 1
    and shrinks as the rounds go by.

    :param T: the total number of iterations
    :return:  a length-T NumPy array of cutoff probabilities
    """
    cutoffs = []
    for step in range(1, T + 1):
        cutoffs.append(step ** (-1 / 3))
    return np.array(cutoffs)

In [59]:
def select_logged_event(t, k, d):
    """
    Select the t'th record from the event log.

    Reads the module-level ``clicks_df``, ``articles_df``, ``user_feat_mat`` and
    ``clicked_mat``; all four must be defined before this function is called.

    :param t: the timestep (row of the event log to read)
    :param k: the number of arms
    :param d: the number of features
    :return:  a tuple containing a (k x 1 x d) log vector, the id of the article displayed,
              whether the article was clicked, and the article pool index corresponding
              to the article
    :raises ValueError: if the displayed article id is not in the event's article pool,
                        or if the assembled features cannot be reshaped to (k, 1, d)
    """
    # article_pool is expected to hold the text of a Python list of article-id
    # strings; parse it as a literal instead of executing it with eval().
    _article_pool = ast.literal_eval(clicks_df.loc[t, 'article_pool'])
    # One row of article features per article in the pool.
    _article_vecs = np.array([articles_df.loc[x].values for x in _article_pool])
    # For each of the article row vectors, add a copy of the user features.
    logged = np.append(_article_vecs, values=np.tile(user_feat_mat[t], (len(_article_vecs), 1)), axis=1)
    displayed, clicked = clicked_mat[t, 0], clicked_mat[t, 1]
    # Position of the displayed article inside this event's pool, i.e. the arm that was played.
    disp_index = _article_pool.index(str(displayed))
    return logged.reshape((k, 1, d)), displayed, clicked, disp_index

In [5]:
def ols_intervals(X, Y, k, _delta, T):
    """
    Compute an OLS-based confidence interval for each arm's predicted reward.

    For arm i, the last row of X[i] is the context to predict for and the rows
    before it are that arm's history; Y[i] holds the rewards observed for the
    history rows.

    :param X:      per-arm sequence of 2-D arrays whose rows are feature vectors
    :param Y:      per-arm sequence of 1-D reward arrays
    :param k:      the number of arms
    :param _delta: confidence parameter; widths use the 1 - _delta/(2*T*k) normal quantile
    :param T:      the total number of iterations
    :return:       a list of [lower, upper] pairs. Arms with no history, or whose
                   width is undefined (NaN, e.g. zero reward variance), are omitted.
                   NOTE(review): because of those omissions a position in the
                   returned list is not guaranteed to equal the arm index.
    """
    intervals = []
    quantile_level = 1 - _delta / (2 * T * k)
    for i in range(k):
        t = len(X[i]) - 1
        if t <= 0:
            # No history for this arm yet, nothing to regress on.
            continue
        # Compute beta hat.
        _Xti = X[i][:t]
        _XtiT = transpose(_Xti)
        try:
            _XTX = pinv(_XtiT.dot(_Xti))
        except np.linalg.LinAlgError:
            # Only swallow linear-algebra failures; a bare except would also
            # eat KeyboardInterrupt during long runs.
            print('Encountered singular matrix. Ignoring.')
            continue
        _Yti = Y[i][:t]
        Bh_t_i = _XTX.dot(_XtiT).dot(_Yti)  # Compute OLS estimators.
        x_t = X[i][t]
        yh_t_i = x_t.dot(Bh_t_i)
        # Variance of the arm's observed rewards. The previous code took the
        # variance of the single value Y[i][t-1], which is always 0 and made
        # every width NaN.
        _s2 = np.var(_Yti)
        # Compute the confidence interval width using the inverse CDF.
        w_t_i = norm.ppf(quantile_level, loc=0,
                         scale=np.sqrt(_s2 * x_t.dot(_XTX).dot(transpose(x_t))))
        if not math.isnan(w_t_i):
            intervals.append([yh_t_i - w_t_i, yh_t_i + w_t_i])
    return intervals

In [64]:
def top_interval_yahoo(T):
    """
    Simulates T rounds of TopInterval for k using the evaluation framework
    described in https://arxiv.org/pdf/1003.5956.pdf.

    Evaluation Protocol
    ===================
    Let σ be an input stream of logged events to be used for the simulation,
    where an event consists of a tuple consisting of the user vector,
    20 article vectors (context vectors), the displayed vector (selected arm),
    and the result (observed reward).
    Step through each element s sampled from the stream.
    Let t denote the current time-step in the algorithm and h_{t-1} prior history.
    If given h_{t-1} and the current context, the algorithm picks the same vector
    as the selected arm - retain the event and add it to the history. Otherwise,
    keep processing items.

    Relies on the module-level ``clicks_df`` plus ``eta``, ``select_logged_event``
    and ``ols_intervals``. The simulation stops early if the event stream runs out.

    :param T: the number of iterations
    :return:  None; progress and the final number of observations per arm are printed
    """
    k = 20
    d = 12
    X = [np.empty((0, d)) for _ in range(k)]  # For each arm store a t' x d nd-array.
    Y = [np.empty((0, 1)) for _ in range(k)]
    _delta = 0.1
    _eta = eta(T)                               # exploration cutoff probabilities
    n_events = len(clicks_df)
    cursor = 0

    def _record(arm, context, reward):
        # Append one (context, reward) observation to the history of `arm`.
        X[arm] = np.vstack([X[arm], context])
        Y[arm] = np.append(Y[arm], reward)

    for t in range(T):
        if cursor >= n_events:
            # Valid rows are 0 .. n_events - 1; reading further would fail.
            print('Event stream exhausted after {0} iterations.'.format(t))
            break
        if t < k:
            print('Seeding data for arm {0}'.format(t))
            # Pull each arm once to start.
            while cursor < n_events:
                logged, _, disp_clicked, disp_index = select_logged_event(cursor, k, d)
                cursor += 1
                # terminate this loop if we've found the arm we want
                if disp_index == t:
                    _record(disp_index, logged[disp_index], disp_clicked)
                    break
        elif t < d or np.random.rand() <= _eta[t]:
            # Exploration round: the next logged event is accepted as-is and
            # credited to whichever arm was displayed.
            logged, _, disp_clicked, disp_index = select_logged_event(cursor, k, d)
            _record(disp_index, logged[disp_index], disp_clicked)
            cursor += 1
            print('Iteration [{0} / {1}]'.format(t, T))
        else:
            while cursor < n_events:
                # 1. Compute input vector for the next item in the stream and the corresponding reward.
                logged, _, disp_clicked, disp_index = select_logged_event(cursor, k, d)
                cursor += 1
                Xp = [np.vstack([X[i], logged[i]]) for i in range(k)]
                # 2. For each of the arms, compute the confidence intervals.
                intervals = ols_intervals(Xp, Y, k, _delta, T)
                # Pick the agent with the largest upper bound.
                # NOTE(review): ols_intervals leaves out arms it cannot estimate, so
                # this argmax position may not be the true arm index - confirm/fix.
                pick = np.argmax(np.array(intervals)[:, 1]) if len(intervals) > 0 else np.random.randint(0, k)
                # disp_index is the pool position of the displayed article, so the
                # pick matches the log exactly when the two positions agree.
                if pick == disp_index:
                    _record(disp_index, logged[disp_index], disp_clicked)
                    print('Iteration [{0} / {1}], matched with stream! {2}'.format(t, T, disp_clicked))
                    break

    # TODO: compute regret (best achievable reward minus obtained reward) per iteration.
    for i in range(k):
        print('{0}:{1}'.format(i, len(X[i])))


In [66]:
# NumPy arrays of the columns that select_logged_event indexes by row position.
# `.values` replaces DataFrame.as_matrix, which was deprecated and then removed from pandas.
user_feat_mat = clicks_df[['user_1', 'user_2', 'user_3', 'user_4', 'user_5', 'user_6']].values
clicked_mat = clicks_df[['displayed', 'clicked']].values

# top_interval_yahoo only prints its progress and returns None, so `tmp` carries no result.
tmp = top_interval_yahoo(10000)

Seeding data for arm 0
Seeding data for arm 1
Seeding data for arm 2
Seeding data for arm 3
Seeding data for arm 4
Seeding data for arm 5
Seeding data for arm 6
Seeding data for arm 7
Seeding data for arm 8
Seeding data for arm 9
Seeding data for arm 10
Seeding data for arm 11
Seeding data for arm 12
Seeding data for arm 13
Seeding data for arm 14
Seeding data for arm 15
Seeding data for arm 16
Seeding data for arm 17
Seeding data for arm 18
Seeding data for arm 19
Iteration [20 / 10000]
Iteration [21 / 10000]


  lower_bound = self.a * scale + loc
  upper_bound = self.b * scale + loc


Iteration [22 / 10000], matched with stream! 0
Iteration [23 / 10000], matched with stream! 0
Iteration [24 / 10000], matched with stream! 0
Iteration [25 / 10000]
Iteration [26 / 10000], matched with stream! 0
Iteration [27 / 10000]
Iteration [28 / 10000], matched with stream! 0
Iteration [29 / 10000], matched with stream! 0
Iteration [30 / 10000], matched with stream! 0
Iteration [31 / 10000], matched with stream! 0
Iteration [32 / 10000], matched with stream! 0
Iteration [33 / 10000], matched with stream! 0
Iteration [34 / 10000], matched with stream! 0
Iteration [35 / 10000], matched with stream! 0
Iteration [36 / 10000], matched with stream! 0
Iteration [37 / 10000]
Iteration [38 / 10000]
Iteration [39 / 10000], matched with stream! 0
Iteration [40 / 10000]
Iteration [41 / 10000]
Iteration [42 / 10000], matched with stream! 0
Iteration [43 / 10000], matched with stream! 0
Iteration [44 / 10000], matched with stream! 0
Iteration [45 / 10000], matched with stream! 1
Iteration [46 / 

Iteration [211 / 10000], matched with stream! 0
Iteration [212 / 10000], matched with stream! 0
Iteration [213 / 10000]
Iteration [214 / 10000], matched with stream! 0
Iteration [215 / 10000], matched with stream! 0
Iteration [216 / 10000], matched with stream! 0
Iteration [217 / 10000], matched with stream! 0
Iteration [218 / 10000], matched with stream! 0
Iteration [219 / 10000], matched with stream! 0
Iteration [220 / 10000]
Iteration [221 / 10000]
Iteration [222 / 10000], matched with stream! 0
Iteration [223 / 10000]
Iteration [224 / 10000], matched with stream! 0
Iteration [225 / 10000], matched with stream! 0
Iteration [226 / 10000], matched with stream! 1
Iteration [227 / 10000]
Iteration [228 / 10000], matched with stream! 0
Iteration [229 / 10000], matched with stream! 0
Iteration [230 / 10000], matched with stream! 0
Iteration [231 / 10000], matched with stream! 0
Iteration [232 / 10000], matched with stream! 0
Iteration [233 / 10000], matched with stream! 0
Iteration [234 /

Iteration [399 / 10000], matched with stream! 0
Iteration [400 / 10000], matched with stream! 0
Iteration [401 / 10000], matched with stream! 0
Iteration [402 / 10000], matched with stream! 0
Iteration [403 / 10000], matched with stream! 0
Iteration [404 / 10000], matched with stream! 0
Iteration [405 / 10000], matched with stream! 0
Iteration [406 / 10000], matched with stream! 0
Iteration [407 / 10000], matched with stream! 1
Iteration [408 / 10000], matched with stream! 0
Iteration [409 / 10000], matched with stream! 0
Iteration [410 / 10000], matched with stream! 0
Iteration [411 / 10000], matched with stream! 0
Iteration [412 / 10000], matched with stream! 0
Iteration [413 / 10000], matched with stream! 0
Iteration [414 / 10000], matched with stream! 0
Iteration [415 / 10000], matched with stream! 1
Iteration [416 / 10000], matched with stream! 0
Iteration [417 / 10000], matched with stream! 0
Iteration [418 / 10000], matched with stream! 1
Iteration [419 / 10000], matched with st

Iteration [582 / 10000], matched with stream! 0
Iteration [583 / 10000]
Iteration [584 / 10000], matched with stream! 0
Iteration [585 / 10000], matched with stream! 0
Iteration [586 / 10000], matched with stream! 0
Iteration [587 / 10000], matched with stream! 0
Iteration [588 / 10000]
Iteration [589 / 10000], matched with stream! 0
Iteration [590 / 10000]
Iteration [591 / 10000], matched with stream! 0
Iteration [592 / 10000], matched with stream! 0
Iteration [593 / 10000], matched with stream! 0
Iteration [594 / 10000]
Iteration [595 / 10000], matched with stream! 0
Iteration [596 / 10000], matched with stream! 0
Iteration [597 / 10000], matched with stream! 0
Iteration [598 / 10000], matched with stream! 0
Iteration [599 / 10000], matched with stream! 0
Iteration [600 / 10000], matched with stream! 0
Iteration [601 / 10000], matched with stream! 0
Iteration [602 / 10000], matched with stream! 0
Iteration [603 / 10000], matched with stream! 0
Iteration [604 / 10000]
Iteration [605 /

Iteration [762 / 10000], matched with stream! 0
Iteration [763 / 10000], matched with stream! 0
Iteration [764 / 10000], matched with stream! 1
Iteration [765 / 10000], matched with stream! 0
Iteration [766 / 10000], matched with stream! 0
Iteration [767 / 10000], matched with stream! 0
Iteration [768 / 10000], matched with stream! 0
Iteration [769 / 10000], matched with stream! 0
Iteration [770 / 10000], matched with stream! 0
Iteration [771 / 10000]
Iteration [772 / 10000], matched with stream! 0
Iteration [773 / 10000], matched with stream! 0
Iteration [774 / 10000], matched with stream! 0
Iteration [775 / 10000], matched with stream! 0
Iteration [776 / 10000], matched with stream! 0
Iteration [777 / 10000], matched with stream! 0
Iteration [778 / 10000], matched with stream! 0
Iteration [779 / 10000], matched with stream! 0
Iteration [780 / 10000], matched with stream! 0
Iteration [781 / 10000], matched with stream! 0
Iteration [782 / 10000]
Iteration [783 / 10000], matched with st

Iteration [945 / 10000], matched with stream! 0
Iteration [946 / 10000], matched with stream! 0
Iteration [947 / 10000]
Iteration [948 / 10000], matched with stream! 0
Iteration [949 / 10000], matched with stream! 0
Iteration [950 / 10000], matched with stream! 0
Iteration [951 / 10000], matched with stream! 0
Iteration [952 / 10000], matched with stream! 0
Iteration [953 / 10000]
Iteration [954 / 10000], matched with stream! 0
Iteration [955 / 10000], matched with stream! 0
Iteration [956 / 10000], matched with stream! 0
Iteration [957 / 10000], matched with stream! 0
Iteration [958 / 10000], matched with stream! 0
Iteration [959 / 10000], matched with stream! 0
Iteration [960 / 10000], matched with stream! 0
Iteration [961 / 10000], matched with stream! 0
Iteration [962 / 10000], matched with stream! 0
Iteration [963 / 10000], matched with stream! 0
Iteration [964 / 10000], matched with stream! 0
Iteration [965 / 10000], matched with stream! 0
Iteration [966 / 10000], matched with st

Iteration [1123 / 10000], matched with stream! 0
Iteration [1124 / 10000], matched with stream! 0
Iteration [1125 / 10000], matched with stream! 0
Iteration [1126 / 10000], matched with stream! 0
Iteration [1127 / 10000], matched with stream! 0
Iteration [1128 / 10000], matched with stream! 0
Iteration [1129 / 10000]
Iteration [1130 / 10000], matched with stream! 0
Iteration [1131 / 10000], matched with stream! 0
Iteration [1132 / 10000], matched with stream! 0
Iteration [1133 / 10000], matched with stream! 0
Iteration [1134 / 10000], matched with stream! 0
Iteration [1135 / 10000], matched with stream! 0
Iteration [1136 / 10000], matched with stream! 0
Iteration [1137 / 10000], matched with stream! 0
Iteration [1138 / 10000], matched with stream! 0
Iteration [1139 / 10000]
Iteration [1140 / 10000], matched with stream! 0
Iteration [1141 / 10000], matched with stream! 0
Iteration [1142 / 10000], matched with stream! 0
Iteration [1143 / 10000], matched with stream! 0
Iteration [1144 / 1

Iteration [1298 / 10000], matched with stream! 0
Iteration [1299 / 10000], matched with stream! 0
Iteration [1300 / 10000], matched with stream! 0
Iteration [1301 / 10000], matched with stream! 0
Iteration [1302 / 10000], matched with stream! 0
Iteration [1303 / 10000], matched with stream! 0
Iteration [1304 / 10000], matched with stream! 0
Iteration [1305 / 10000]
Iteration [1306 / 10000], matched with stream! 0
Iteration [1307 / 10000], matched with stream! 0
Iteration [1308 / 10000], matched with stream! 0
Iteration [1309 / 10000], matched with stream! 0
Iteration [1310 / 10000]
Iteration [1311 / 10000]
Iteration [1312 / 10000], matched with stream! 0
Iteration [1313 / 10000]
Iteration [1314 / 10000], matched with stream! 0
Iteration [1315 / 10000]
Iteration [1316 / 10000], matched with stream! 0
Iteration [1317 / 10000], matched with stream! 0
Iteration [1318 / 10000], matched with stream! 0
Iteration [1319 / 10000], matched with stream! 0
Iteration [1320 / 10000], matched with str

Iteration [1480 / 10000], matched with stream! 1
Iteration [1481 / 10000], matched with stream! 0
Iteration [1482 / 10000], matched with stream! 0
Iteration [1483 / 10000], matched with stream! 0
Iteration [1484 / 10000]
Iteration [1485 / 10000], matched with stream! 0
Iteration [1486 / 10000]
Iteration [1487 / 10000], matched with stream! 0
Iteration [1488 / 10000], matched with stream! 0
Iteration [1489 / 10000], matched with stream! 0
Iteration [1490 / 10000], matched with stream! 0
Iteration [1491 / 10000], matched with stream! 0
Iteration [1492 / 10000], matched with stream! 0
Iteration [1493 / 10000], matched with stream! 0
Iteration [1494 / 10000], matched with stream! 0
Iteration [1495 / 10000], matched with stream! 0
Iteration [1496 / 10000], matched with stream! 0
Iteration [1497 / 10000], matched with stream! 0
Iteration [1498 / 10000], matched with stream! 0
Iteration [1499 / 10000], matched with stream! 0
Iteration [1500 / 10000], matched with stream! 0
Iteration [1501 / 1

Iteration [1652 / 10000], matched with stream! 0
Iteration [1653 / 10000], matched with stream! 0
Iteration [1654 / 10000], matched with stream! 0
Iteration [1655 / 10000]
Iteration [1656 / 10000], matched with stream! 0
Iteration [1657 / 10000], matched with stream! 0
Iteration [1658 / 10000], matched with stream! 0
Iteration [1659 / 10000], matched with stream! 0
Iteration [1660 / 10000], matched with stream! 0
Iteration [1661 / 10000], matched with stream! 0
Iteration [1662 / 10000], matched with stream! 0
Iteration [1663 / 10000], matched with stream! 0
Iteration [1664 / 10000], matched with stream! 0
Iteration [1665 / 10000], matched with stream! 0
Iteration [1666 / 10000], matched with stream! 0
Iteration [1667 / 10000], matched with stream! 0
Iteration [1668 / 10000], matched with stream! 0
Iteration [1669 / 10000], matched with stream! 0
Iteration [1670 / 10000], matched with stream! 0
Iteration [1671 / 10000], matched with stream! 0
Iteration [1672 / 10000], matched with strea

Iteration [1826 / 10000], matched with stream! 0
Iteration [1827 / 10000], matched with stream! 0
Iteration [1828 / 10000], matched with stream! 0
Iteration [1829 / 10000], matched with stream! 0
Iteration [1830 / 10000], matched with stream! 0
Iteration [1831 / 10000], matched with stream! 0
Iteration [1832 / 10000], matched with stream! 0
Iteration [1833 / 10000], matched with stream! 0
Iteration [1834 / 10000], matched with stream! 0
Iteration [1835 / 10000], matched with stream! 0
Iteration [1836 / 10000], matched with stream! 0
Iteration [1837 / 10000], matched with stream! 0
Iteration [1838 / 10000], matched with stream! 0
Iteration [1839 / 10000], matched with stream! 0
Iteration [1840 / 10000]
Iteration [1841 / 10000], matched with stream! 0
Iteration [1842 / 10000], matched with stream! 0
Iteration [1843 / 10000], matched with stream! 0
Iteration [1844 / 10000], matched with stream! 0
Iteration [1845 / 10000]
Iteration [1846 / 10000], matched with stream! 1
Iteration [1847 / 1

Iteration [2003 / 10000], matched with stream! 0
Iteration [2004 / 10000], matched with stream! 0
Iteration [2005 / 10000]
Iteration [2006 / 10000], matched with stream! 0
Iteration [2007 / 10000], matched with stream! 1
Iteration [2008 / 10000], matched with stream! 0
Iteration [2009 / 10000], matched with stream! 0
Iteration [2010 / 10000], matched with stream! 0
Iteration [2011 / 10000], matched with stream! 0
Iteration [2012 / 10000], matched with stream! 0
Iteration [2013 / 10000], matched with stream! 0
Iteration [2014 / 10000], matched with stream! 0
Iteration [2015 / 10000], matched with stream! 0
Iteration [2016 / 10000], matched with stream! 0
Iteration [2017 / 10000], matched with stream! 0
Iteration [2018 / 10000], matched with stream! 0
Iteration [2019 / 10000], matched with stream! 0
Iteration [2020 / 10000], matched with stream! 0
Iteration [2021 / 10000], matched with stream! 0
Iteration [2022 / 10000], matched with stream! 0
Iteration [2023 / 10000], matched with strea

Iteration [2179 / 10000], matched with stream! 0
Iteration [2180 / 10000], matched with stream! 0
Iteration [2181 / 10000], matched with stream! 0
Iteration [2182 / 10000], matched with stream! 0
Iteration [2183 / 10000], matched with stream! 0
Iteration [2184 / 10000], matched with stream! 0
Iteration [2185 / 10000], matched with stream! 0
Iteration [2186 / 10000], matched with stream! 0
Iteration [2187 / 10000], matched with stream! 0
Iteration [2188 / 10000], matched with stream! 0
Iteration [2189 / 10000], matched with stream! 0
Iteration [2190 / 10000]
Iteration [2191 / 10000], matched with stream! 0
Iteration [2192 / 10000], matched with stream! 0
Iteration [2193 / 10000], matched with stream! 0
Iteration [2194 / 10000], matched with stream! 0
Iteration [2195 / 10000], matched with stream! 0
Iteration [2196 / 10000], matched with stream! 0
Iteration [2197 / 10000], matched with stream! 0
Iteration [2198 / 10000], matched with stream! 0
Iteration [2199 / 10000], matched with strea

Iteration [2354 / 10000], matched with stream! 0
Iteration [2355 / 10000]
Iteration [2356 / 10000], matched with stream! 0
Iteration [2357 / 10000], matched with stream! 0
Iteration [2358 / 10000], matched with stream! 0
Iteration [2359 / 10000], matched with stream! 0
Iteration [2360 / 10000], matched with stream! 0
Iteration [2361 / 10000], matched with stream! 1
Iteration [2362 / 10000], matched with stream! 0
Iteration [2363 / 10000], matched with stream! 1
Iteration [2364 / 10000], matched with stream! 0
Iteration [2365 / 10000], matched with stream! 0
Iteration [2366 / 10000], matched with stream! 0
Iteration [2367 / 10000], matched with stream! 0
Iteration [2368 / 10000], matched with stream! 0
Iteration [2369 / 10000], matched with stream! 0
Iteration [2370 / 10000], matched with stream! 0
Iteration [2371 / 10000], matched with stream! 0
Iteration [2372 / 10000], matched with stream! 0
Iteration [2373 / 10000], matched with stream! 0
Iteration [2374 / 10000], matched with strea

Iteration [2523 / 10000], matched with stream! 0
Iteration [2524 / 10000], matched with stream! 0
Iteration [2525 / 10000], matched with stream! 0
Iteration [2526 / 10000], matched with stream! 0
Iteration [2527 / 10000], matched with stream! 0
Iteration [2528 / 10000], matched with stream! 0
Iteration [2529 / 10000], matched with stream! 0
Iteration [2530 / 10000], matched with stream! 0
Iteration [2531 / 10000], matched with stream! 0
Iteration [2532 / 10000], matched with stream! 0
Iteration [2533 / 10000], matched with stream! 0
Iteration [2534 / 10000], matched with stream! 0
Iteration [2535 / 10000], matched with stream! 1
Iteration [2536 / 10000], matched with stream! 0
Iteration [2537 / 10000], matched with stream! 0
Iteration [2538 / 10000], matched with stream! 0
Iteration [2539 / 10000], matched with stream! 0
Iteration [2540 / 10000], matched with stream! 0
Iteration [2541 / 10000], matched with stream! 0
Iteration [2542 / 10000], matched with stream! 0
Iteration [2543 / 10

Iteration [2701 / 10000], matched with stream! 0
Iteration [2702 / 10000], matched with stream! 0
Iteration [2703 / 10000], matched with stream! 0
Iteration [2704 / 10000]
Iteration [2705 / 10000], matched with stream! 0
Iteration [2706 / 10000], matched with stream! 0
Iteration [2707 / 10000], matched with stream! 0
Iteration [2708 / 10000], matched with stream! 1
Iteration [2709 / 10000], matched with stream! 0
Iteration [2710 / 10000], matched with stream! 0
Iteration [2711 / 10000], matched with stream! 0
Iteration [2712 / 10000], matched with stream! 0
Iteration [2713 / 10000], matched with stream! 0
Iteration [2714 / 10000]
Iteration [2715 / 10000]
Iteration [2716 / 10000], matched with stream! 0
Iteration [2717 / 10000], matched with stream! 0
Iteration [2718 / 10000], matched with stream! 0
Iteration [2719 / 10000], matched with stream! 0
Iteration [2720 / 10000], matched with stream! 0
Iteration [2721 / 10000], matched with stream! 0
Iteration [2722 / 10000], matched with stre

Iteration [2878 / 10000], matched with stream! 0
Iteration [2879 / 10000], matched with stream! 0
Iteration [2880 / 10000], matched with stream! 0
Iteration [2881 / 10000], matched with stream! 0
Iteration [2882 / 10000], matched with stream! 0
Iteration [2883 / 10000], matched with stream! 0
Iteration [2884 / 10000], matched with stream! 0
Iteration [2885 / 10000], matched with stream! 0
Iteration [2886 / 10000], matched with stream! 0
Iteration [2887 / 10000], matched with stream! 0
Iteration [2888 / 10000], matched with stream! 0
Iteration [2889 / 10000], matched with stream! 0
Iteration [2890 / 10000], matched with stream! 0
Iteration [2891 / 10000], matched with stream! 0
Iteration [2892 / 10000], matched with stream! 0
Iteration [2893 / 10000], matched with stream! 0
Iteration [2894 / 10000], matched with stream! 0
Iteration [2895 / 10000], matched with stream! 0
Iteration [2896 / 10000], matched with stream! 0
Iteration [2897 / 10000], matched with stream! 0
Iteration [2898 / 10

Iteration [3057 / 10000], matched with stream! 0
Iteration [3058 / 10000], matched with stream! 0
Iteration [3059 / 10000], matched with stream! 0
Iteration [3060 / 10000], matched with stream! 0
Iteration [3061 / 10000], matched with stream! 1
Iteration [3062 / 10000], matched with stream! 0
Iteration [3063 / 10000], matched with stream! 0
Iteration [3064 / 10000], matched with stream! 0
Iteration [3065 / 10000], matched with stream! 0
Iteration [3066 / 10000], matched with stream! 0
Iteration [3067 / 10000], matched with stream! 0
Iteration [3068 / 10000], matched with stream! 0
Iteration [3069 / 10000], matched with stream! 0
Iteration [3070 / 10000], matched with stream! 1
Iteration [3071 / 10000], matched with stream! 0
Iteration [3072 / 10000], matched with stream! 0
Iteration [3073 / 10000], matched with stream! 0
Iteration [3074 / 10000], matched with stream! 0
Iteration [3075 / 10000], matched with stream! 0
Iteration [3076 / 10000], matched with stream! 0
Iteration [3077 / 10

Iteration [3230 / 10000], matched with stream! 0
Iteration [3231 / 10000], matched with stream! 1
Iteration [3232 / 10000], matched with stream! 0
Iteration [3233 / 10000], matched with stream! 0
Iteration [3234 / 10000], matched with stream! 0
Iteration [3235 / 10000], matched with stream! 0
Iteration [3236 / 10000], matched with stream! 0
Iteration [3237 / 10000], matched with stream! 0
Iteration [3238 / 10000], matched with stream! 0
Iteration [3239 / 10000], matched with stream! 0
Iteration [3240 / 10000], matched with stream! 0
Iteration [3241 / 10000], matched with stream! 0
Iteration [3242 / 10000], matched with stream! 0
Iteration [3243 / 10000], matched with stream! 0
Iteration [3244 / 10000], matched with stream! 0
Iteration [3245 / 10000], matched with stream! 0
Iteration [3246 / 10000], matched with stream! 0
Iteration [3247 / 10000]
Iteration [3248 / 10000], matched with stream! 0
Iteration [3249 / 10000], matched with stream! 0
Iteration [3250 / 10000], matched with strea

Iteration [3402 / 10000], matched with stream! 0
Iteration [3403 / 10000], matched with stream! 0
Iteration [3404 / 10000], matched with stream! 0
Iteration [3405 / 10000], matched with stream! 0
Iteration [3406 / 10000], matched with stream! 0
Iteration [3407 / 10000], matched with stream! 0
Iteration [3408 / 10000], matched with stream! 0
Iteration [3409 / 10000], matched with stream! 0
Iteration [3410 / 10000], matched with stream! 0
Iteration [3411 / 10000], matched with stream! 0
Iteration [3412 / 10000], matched with stream! 0
Iteration [3413 / 10000], matched with stream! 0
Iteration [3414 / 10000], matched with stream! 0
Iteration [3415 / 10000], matched with stream! 0
Iteration [3416 / 10000], matched with stream! 0
Iteration [3417 / 10000], matched with stream! 0
Iteration [3418 / 10000], matched with stream! 0
Iteration [3419 / 10000], matched with stream! 0
Iteration [3420 / 10000], matched with stream! 0
Iteration [3421 / 10000], matched with stream! 0
Iteration [3422 / 10

Iteration [3572 / 10000], matched with stream! 0
Iteration [3573 / 10000], matched with stream! 0
Iteration [3574 / 10000], matched with stream! 0
Iteration [3575 / 10000], matched with stream! 0
Iteration [3576 / 10000], matched with stream! 0
Iteration [3577 / 10000], matched with stream! 0
Iteration [3578 / 10000], matched with stream! 0
Iteration [3579 / 10000], matched with stream! 0
Iteration [3580 / 10000]
Iteration [3581 / 10000]
Iteration [3582 / 10000], matched with stream! 0
Iteration [3583 / 10000], matched with stream! 0
Iteration [3584 / 10000], matched with stream! 0
Iteration [3585 / 10000], matched with stream! 0
Iteration [3586 / 10000], matched with stream! 0
Iteration [3587 / 10000], matched with stream! 0
Iteration [3588 / 10000], matched with stream! 0
Iteration [3589 / 10000]
Iteration [3590 / 10000], matched with stream! 0
Iteration [3591 / 10000], matched with stream! 0
Iteration [3592 / 10000], matched with stream! 0
Iteration [3593 / 10000]
Iteration [3594 / 

Iteration [3744 / 10000], matched with stream! 0
Iteration [3745 / 10000]
Iteration [3746 / 10000], matched with stream! 0
Iteration [3747 / 10000], matched with stream! 0
Iteration [3748 / 10000], matched with stream! 0
Iteration [3749 / 10000], matched with stream! 0
Iteration [3750 / 10000], matched with stream! 0
Iteration [3751 / 10000], matched with stream! 0
Iteration [3752 / 10000]
Iteration [3753 / 10000], matched with stream! 0
Iteration [3754 / 10000], matched with stream! 0
Iteration [3755 / 10000], matched with stream! 0
Iteration [3756 / 10000], matched with stream! 0
Iteration [3757 / 10000], matched with stream! 0
Iteration [3758 / 10000], matched with stream! 0
Iteration [3759 / 10000], matched with stream! 0
Iteration [3760 / 10000], matched with stream! 0
Iteration [3761 / 10000], matched with stream! 0
Iteration [3762 / 10000], matched with stream! 0
Iteration [3763 / 10000], matched with stream! 0
Iteration [3764 / 10000], matched with stream! 0
Iteration [3765 / 1

Iteration [3919 / 10000], matched with stream! 0
Iteration [3920 / 10000], matched with stream! 0
Iteration [3921 / 10000], matched with stream! 0
Iteration [3922 / 10000], matched with stream! 0
Iteration [3923 / 10000], matched with stream! 0
Iteration [3924 / 10000]
Iteration [3925 / 10000], matched with stream! 0
Iteration [3926 / 10000], matched with stream! 0
Iteration [3927 / 10000], matched with stream! 0
Iteration [3928 / 10000], matched with stream! 0
Iteration [3929 / 10000], matched with stream! 0
Iteration [3930 / 10000]
Iteration [3931 / 10000], matched with stream! 0
Iteration [3932 / 10000], matched with stream! 0
Iteration [3933 / 10000]
Iteration [3934 / 10000], matched with stream! 0
Iteration [3935 / 10000], matched with stream! 0
Iteration [3936 / 10000]
Iteration [3937 / 10000], matched with stream! 0
Iteration [3938 / 10000]
Iteration [3939 / 10000], matched with stream! 0
Iteration [3940 / 10000], matched with stream! 0
Iteration [3941 / 10000], matched with str

Iteration [4092 / 10000], matched with stream! 0
Iteration [4093 / 10000], matched with stream! 0
Iteration [4094 / 10000], matched with stream! 0
Iteration [4095 / 10000], matched with stream! 0
Iteration [4096 / 10000], matched with stream! 0
Iteration [4097 / 10000], matched with stream! 0
Iteration [4098 / 10000], matched with stream! 0
Iteration [4099 / 10000], matched with stream! 0
Iteration [4100 / 10000], matched with stream! 0
Iteration [4101 / 10000], matched with stream! 0
Iteration [4102 / 10000], matched with stream! 0
Iteration [4103 / 10000], matched with stream! 0
Iteration [4104 / 10000], matched with stream! 0
Iteration [4105 / 10000], matched with stream! 0
Iteration [4106 / 10000], matched with stream! 1
Iteration [4107 / 10000]
Iteration [4108 / 10000], matched with stream! 0
Iteration [4109 / 10000], matched with stream! 0
Iteration [4110 / 10000], matched with stream! 0
Iteration [4111 / 10000]
Iteration [4112 / 10000], matched with stream! 0
Iteration [4113 / 1

Iteration [4266 / 10000], matched with stream! 0
Iteration [4267 / 10000], matched with stream! 0
Iteration [4268 / 10000], matched with stream! 0
Iteration [4269 / 10000], matched with stream! 0
Iteration [4270 / 10000], matched with stream! 0
Iteration [4271 / 10000], matched with stream! 0
Iteration [4272 / 10000], matched with stream! 0
Iteration [4273 / 10000], matched with stream! 0
Iteration [4274 / 10000], matched with stream! 0
Iteration [4275 / 10000], matched with stream! 1
Iteration [4276 / 10000]
Iteration [4277 / 10000], matched with stream! 1
Iteration [4278 / 10000], matched with stream! 0
Iteration [4279 / 10000], matched with stream! 0
Iteration [4280 / 10000], matched with stream! 0
Iteration [4281 / 10000], matched with stream! 0
Iteration [4282 / 10000], matched with stream! 0
Iteration [4283 / 10000], matched with stream! 0
Iteration [4284 / 10000]
Iteration [4285 / 10000], matched with stream! 0
Iteration [4286 / 10000], matched with stream! 0
Iteration [4287 / 1

KeyboardInterrupt: 

## Examining Malformed Data

Some of the logged events have more than 20 articles in their article pool, which is a problem because the simulation assumes a constant number of arms, $k = 20$, across all events.

It turns out that there are around 2.53 million such events out of the 4.68 million in the first batch file, which is a lot! 

Additionally, it is worth noting that there are a good number of articles that have identical feature vectors despite having different labels.

However, simply removing the duplicates from every logged event does not work: I found at least one event where doing so leaves fewer than 20 articles in the pool. Articles with different labels therefore have to be treated as distinct even when their article vectors are identical, which is troublesome.

As an interim workaround, I only select the events with 20 articles in the pool (the sampling query approximates this with `length(article_pool) < 210`), which seems to work for now.

In [None]:
# Look up the logged event(s) matching one specific timestamp, displayed article and user_1 value.
pd.read_sql_query('''SELECT * FROM clicks WHERE time=1241162400 AND displayed=109513 AND user_1=0.001766''', con=engine)

In [None]:
# Total number of logged events in the clicks table.
pd.read_sql_query('''SELECT COUNT(*) FROM clicks''', con=engine)

In [None]:
# Events whose serialized article pool is at least 210 characters long - the
# complement of the filter used when sampling clicks_df. (The extra COUNT(*)
# query that used to open this cell was dropped: its result was never shown.)
bad_df = pd.read_sql_query('''SELECT * FROM clicks WHERE length(article_pool) >= 210''', con=engine)
len(bad_df)

In [None]:
# Parse the article pool of one oversized event (row label 80840). The pool is
# parsed as a literal rather than executed with eval(), and .loc replaces the
# removed .ix indexer.
bad_sample = ast.literal_eval(bad_df.loc[80840, 'article_pool'])

In [None]:
# Flag article rows whose feature vector repeats an earlier row, then show them.
# (.loc replaces the removed .ix indexer; the boolean column is used directly.)
dupes = pd.DataFrame(articles_df.duplicated(), columns=['Result'])
articles_df.loc[dupes[dupes.Result].index]

In [None]:
# Inverse index for swift de-duplication of articles with identical vectors:
# every article id in a group maps to the id chosen to represent that group.
d1 = dict.fromkeys(['109509', '109503', '109494', '109525', '109530', '109533', '109545'], '109509')
d2 = dict.fromkeys(['109506', '109547', '109550'], '109506')
d3 = dict.fromkeys(['109527', '109531', '109543'], '109527')
dupe_index = dict(d1)
dupe_index.update(d2)
dupe_index.update(d3)

In [None]:
# Pool size before and after collapsing ids that share a feature vector.
print(len(bad_sample))
print(len({dupe_index.get(article_id, article_id) for article_id in bad_sample}))

In [None]:
# Sanity check: the representative id of the first group maps to itself.
dupe_index['109509']