In [1]:
# adapted from https://github.com/tdunning/python-llr

def cmp(a, b):
    return (a > b) - (a < b) 

from collections import Counter 
import math
from functools import reduce

def denormEntropy(counts):
    '''Computes the entropy of a list of counts scaled by the sum of the counts.
    If the inputs sum to one, this is just the normal definition of entropy'''
    counts = list(counts)
    total = float(sum(counts))
    # Note tricky way to avoid 0*log(0)
    return -sum([k * math.log(k/total + (k==0)) for k in counts])

def llr_2x2(k11, k12, k21, k22):
    '''Special case of llr with a 2x2 table'''
    return 2 * abs(denormEntropy([k11+k12, k21+k22]) +
                denormEntropy([k11+k21, k12+k22]) -
                denormEntropy([k11, k12, k21, k22]))

def llr_root(k11, k12, k21, k22):
    '''Computes a score for a 2x2 contingency table, but then adds a sign according
    to whether k11 is larger (result is positive) or smaller (result is negative) 
    than might be expected. The magnitude of the result can be roughly interpreted
    on a scale similar to standard deviations'''
    row = k11 + k21
    total = (k11 + k12 + k21 + k22)
    sign = cmp(float(k11) / (k11 + k12), float(row) / total)
    return math.copysign(math.sqrt(llr_2x2(k11, k12, k21, k22)), sign)

In [2]:
import numpy as np

rawdata = np.array([
    [5,5,0,0,0,0],
    [0,0,5,5,0,0],
    [0,0,0,0,5,5],
    [0,1,5,5,5,0],
    [1,1,5,0,5,5],
    [5,5,0,5,1,1],
    [5,0,0,5,0,1],
    [5,5,5,0,1,0]
    ])
    
likes = np.array([[1 if x == 5 else 0 for x in row] for row in rawdata])
likes

array([[1, 1, 0, 0, 0, 0],
       [0, 0, 1, 1, 0, 0],
       [0, 0, 0, 0, 1, 1],
       [0, 0, 1, 1, 1, 0],
       [0, 0, 1, 0, 1, 1],
       [1, 1, 0, 1, 0, 0],
       [1, 0, 0, 1, 0, 0],
       [1, 1, 1, 0, 0, 0]])

In [3]:
cooccurrence_matrix = np.dot(likes.transpose(), likes)
cooccurrence_matrix

array([[4, 3, 1, 2, 0, 0],
       [3, 3, 1, 1, 0, 0],
       [1, 1, 4, 2, 2, 1],
       [2, 1, 2, 4, 1, 0],
       [0, 0, 2, 1, 3, 2],
       [0, 0, 1, 0, 2, 2]])

In [4]:
np.fill_diagonal(cooccurrence_matrix, 0)
cooccurrence_matrix

array([[0, 3, 1, 2, 0, 0],
       [3, 0, 1, 1, 0, 0],
       [1, 1, 0, 2, 2, 1],
       [2, 1, 2, 0, 1, 0],
       [0, 0, 2, 1, 0, 2],
       [0, 0, 1, 0, 2, 0]])

In [5]:
size = cooccurrence_matrix.shape[0]
sums = np.array([row.sum() for row in cooccurrence_matrix[:,0:size]])
total = sums.sum()
size, sums, total

(6, array([6, 5, 7, 6, 5, 3]), 32)

In [24]:
likes = []
for i in range(0, size):
    for j in range(0, size):
        if i!=j :
            a_b = cooccurrence_matrix[i,j].tolist()
            a_not_b = (sums[i] - a_b).tolist()
            b_not_a = (sums[j] - a_b).tolist()
            not_ab = (total - (a_b + sums[i] + sums[j])).tolist()
            print(i, j, llr_root(a_b, a_not_b, b_not_a, not_ab))
            likes.append(llr_root(a_b, a_not_b, b_not_a, not_ab))

0 1 3.170103206317417
0 2 0.15002170999780928
0 3 1.2790927138135817
0 4 -1.2107359029017233
0 5 -0.9729716210927987
1 0 3.170103206317417
1 2 0.48812442077922275
1 3 0.48812442077922275
1 4 -1.0316921903185028
1 5 -0.8295540929033288
2 0 0.15002170999780928
2 1 0.48812442077922275
2 3 1.2790927138135817
2 4 1.735868960445163
2 5 0.9243118295096052
3 0 1.2790927138135817
3 1 0.48812442077922275
3 2 1.2790927138135817
3 4 0.48812442077922275
3 5 -0.9729716210927987
4 0 -1.2107359029017233
4 1 -1.0316921903185028
4 2 1.735868960445163
4 3 0.48812442077922275
4 5 2.870115006482685
5 0 -0.9729716210927987
5 1 -0.8295540929033288
5 2 0.9243118295096071
5 3 -0.9729716210927987
5 4 2.870115006482685


### Exercise

So far we've used the "people who liked A were unusually likely to like B" statistics to make predictions, but we've ignored the information we have about dislikes. Can you find a way to improve the results by leveraging the 1 values in the rawdata (the dislikes)?

In [17]:
dislikes = np.array([[1 if x == 1 else 0 for x in row] for row in rawdata])
dislikes

array([[0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 1, 0]])

In [21]:
dis_cooccurrence_matrix = np.dot(likes.transpose(), dislikes)
np.fill_diagonal(dis_cooccurrence_matrix, 0)
dis_cooccurrence_matrix

array([[0, 0, 0, 0, 2, 2],
       [0, 0, 0, 0, 2, 1],
       [1, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 1, 2],
       [1, 2, 0, 0, 0, 0],
       [1, 1, 0, 0, 0, 0]])

In [22]:
size = dis_cooccurrence_matrix.shape[0]
sums = np.array([row.sum() for row in dis_cooccurrence_matrix[:,0:size]])
total = sums.sum()
size, sums, total

(6, array([4, 3, 4, 4, 3, 2]), 20)

In [25]:
dislikes = []
for i in range(0, size):
    for j in range(0, size):
        if i!=j :
            a_b = dis_cooccurrence_matrix[i,j].tolist()
            a_not_b = (sums[i] - a_b).tolist()
            b_not_a = (sums[j] - a_b).tolist()
            not_ab = (total - (a_b + sums[i] + sums[j])).tolist()
            print(i, j, llr_root(a_b, a_not_b, b_not_a, not_ab))
            dislikes.append(llr_root(a_b, a_not_b, b_not_a, not_ab))

0 1 -1.2107359029017233
0 2 -1.4217497366715597
0 3 -1.4217497366715597
0 4 1.735868960445163
0 5 2.5517577697983818
1 0 -1.2107359029017233
1 2 -1.2107359029017233
1 3 -1.2107359029017233
1 4 2.138334252825681
1 5 1.179393758140686
2 0 0.15002170999780928
2 1 1.735868960445163
2 3 -1.4217497366715597
2 4 0.48812442077922275
2 5 -0.9729716210927987
3 0 -1.4217497366715597
3 1 0.48812442077922275
3 2 -1.4217497366715597
3 4 0.48812442077922275
3 5 2.5517577697983818
4 0 0.48812442077922275
4 1 2.138334252825681
4 2 -1.2107359029017233
4 3 -1.2107359029017233
4 5 -0.8295540929033288
5 0 0.9243118295096071
5 1 1.179393758140686
5 2 -0.9729716210927987
5 3 -0.9729716210927987
5 4 -0.8295540929033288


In [31]:
for l,d in zip(likes,dislikes):
    print(l+d)

1.9593673034156938
-1.2717280266737503
-0.142657022857978
0.5251330575434396
1.578786148705583
1.9593673034156938
-0.7226114821225005
-0.7226114821225005
1.106642062507178
0.3498396652373571
0.30004341999561857
2.2239933812243855
-0.142657022857978
2.2239933812243855
-0.048659791583193446
-0.142657022857978
0.9762488415584455
-0.142657022857978
0.9762488415584455
1.578786148705583
-0.7226114821225005
1.106642062507178
0.5251330575434396
-0.7226114821225005
2.040560913579356
-0.04865979158319156
0.3498396652373571
-0.04865979158319156
-1.9459432421855973
2.040560913579356
