In [1]:
import numpy as np
import pandas as pd
from math import sqrt
import operator


In [2]:
data = [
    [1, -1, 1, -1, 1, -1],
    [1, 1, np.NAN, -1, -1, -1],
    [np.NAN, 1, 1, -1, -1, np.NAN],
    [-1, -1, -1, 1, 1, 1],
    [-1, np.NAN, -1, 1, 1, 1]
]

df = pd.DataFrame(data, columns=range(1, 7), index=range(1,6))
df

Unnamed: 0,1,2,3,4,5,6
1,1.0,-1.0,1.0,-1,1,-1.0
2,1.0,1.0,,-1,-1,-1.0
3,,1.0,1.0,-1,-1,
4,-1.0,-1.0,-1.0,1,1,1.0
5,-1.0,,-1.0,1,1,1.0


In [3]:
# Probability of user 1 rating movie 3 with 1, given that he has provided ratings for 2, 3, 4 and 5
# It is calculated as Probability of movie 3 being rated with 1 (which is actually a fraction of users who rated it as 1)
# and an independent probability of each movie being rated as it is ...

# P(r32 = 1 | r31 = 1) -- we are predicting the proba. of r32 being one (which is known), given that user has 
# previously rated movie 31 with 1
# is estimated as the fraction of users that have specified the rating of ruk for the kth item, 
# given that they have specified the rating of their jth item to vs.

k = 2

P_r32 = len(df[k][pd.notna(df[k])])

In [4]:
len(df[2][pd.notna(df[2])])

4

In [5]:
df

Unnamed: 0,1,2,3,4,5,6
1,1.0,-1.0,1.0,-1,1,-1.0
2,1.0,1.0,,-1,-1,-1.0
3,,1.0,1.0,-1,-1,
4,-1.0,-1.0,-1.0,1,1,1.0
5,-1.0,,-1.0,1,1,1.0


In [6]:
j = 1
j_val = 1
k = 2
k_val = 1

In [7]:
users_to_consider = (df[j] == j_val) & (pd.notna(df[k]))
users_to_consider[users_to_consider]

1    True
2    True
dtype: bool

In [8]:
users_who_rated_the_same = df.loc[users_to_consider][k] == k_val

In [9]:
len(users_who_rated_the_same[users_who_rated_the_same]) / len(users_to_consider[users_to_consider])

0.5

In [10]:
df[1] == -1

1    False
2    False
3    False
4     True
5     True
Name: 1, dtype: bool

In [11]:
~ pd.notna(df[3])

1    False
2     True
3    False
4    False
5    False
Name: 3, dtype: bool

In [13]:
people_who_rated_movie_j_and_movie_k = (df[df[j] == j_val])

people_who_rated_movie_j_and_movie_k

Unnamed: 0,1,2,3,4,5,6
1,1.0,-1.0,1.0,-1,1,-1.0
2,1.0,1.0,,-1,-1,-1.0


In [14]:
def get_proba_of_movie_rating(j, v):
    '''
    Returns probability of movie J being rated with specified rating V
    
    According to book, P(r_uj = v) is called prior probability and is
    is estimated to the fraction of the users that have specified the 
    rating v for the jth item (page 82 in the book).
    '''
    
    return len(df[df[j] == v]) / len(df[j].dropna())

def get_proba_of_movie_rating_considering_others(j, j_val, k, k_val):
    # Users to consider is list of users who rated movie J with J_val
    # and who have also rated movie K
    users_to_consider = (df[j] == j_val) & (pd.notna(df[k]))
    
    # Users to consider is fraction of above calculated number of users
    # who have ALSO rated the given movie with specified value
    users_who_rated_the_same = df.loc[users_to_consider][k] == k_val
    
    # This is the final calculation
    return len(users_who_rated_the_same[users_who_rated_the_same]) / len(users_to_consider[users_to_consider])
    
def predict_probability(u, j, rating):
    '''Predict probability of user "u" rating movie "j" with rating RATING '''
    
    # First we need list of all movies rated by this user I_u
    I_u = df.loc[u].dropna().index.values # Array of movie indexes rated by user "u"
    
    # s are the allowed values that rating can take; in our case - that is either 1 or -1
    s = [1, -1]
    
    probabilities = []
    
    for v in s:
        # Get probability that this movie is rated with rating "v" at all
        
        print("Predicting probability of user {} rating movie {} with {} ...".format(u, j, v))
        
        p_j_v = get_proba_of_movie_rating(j, v)
        
        print("\tGeneral probability of movie {} being rated with {} is {}".format(j, v, p_j_v))
        
        # Calculate conditional probabilities now (i.e. ratings for all movies that user has rated
        # taking into account the rating "v")
        
        total_probability = 1
        
        for movie_idx in I_u:
            users_rating = df.loc[u][movie_idx]
            
            conditional_proba = get_proba_of_movie_rating_considering_others(j, v, movie_idx, users_rating)

            print("\tConditional probability P(r{}{} = {}|r{}{} = {}) = {}".format(u, movie_idx, users_rating, u,j, v, conditional_proba))

            total_probability = total_probability * conditional_proba
            

        r_u_j = p_j_v * total_probability
        print("Probability of user {} rating movie {} with {} is {}".format(u, j, v, r_u_j))
        
        probabilities.append(r_u_j)
        
        print("")
            
    most_probable_rating_idx = probabilities.index(max(probabilities))
    
    print("User {} would most likely rate movie {} with {}".format(u, j, s[most_probable_rating_idx]))
    # Now we need probability of movie J being rated with rating V
    #p_u_jr = get_proba_of_movie_rating(j, v)

In [15]:
df

Unnamed: 0,1,2,3,4,5,6
1,1.0,-1.0,1.0,-1,1,-1.0
2,1.0,1.0,,-1,-1,-1.0
3,,1.0,1.0,-1,-1,
4,-1.0,-1.0,-1.0,1,1,1.0
5,-1.0,,-1.0,1,1,1.0


In [16]:
predict_probability(3, 2, -1)

Predicting probability of user 3 rating movie 2 with 1 ...
	General probability of movie 2 being rated with 1 is 0.5
	Conditional probability P(r32 = 1.0|r32 = 1) = 1.0
	Conditional probability P(r33 = 1.0|r32 = 1) = 1.0
	Conditional probability P(r34 = -1.0|r32 = 1) = 1.0
	Conditional probability P(r35 = -1.0|r32 = 1) = 1.0
Probability of user 3 rating movie 2 with 1 is 0.5

Predicting probability of user 3 rating movie 2 with -1 ...
	General probability of movie 2 being rated with -1 is 0.5
	Conditional probability P(r32 = 1.0|r32 = -1) = 0.0
	Conditional probability P(r33 = 1.0|r32 = -1) = 0.5
	Conditional probability P(r34 = -1.0|r32 = -1) = 0.5
	Conditional probability P(r35 = -1.0|r32 = -1) = 0.0
Probability of user 3 rating movie 2 with -1 is 0.0

User 3 would most likely rate movie 2 with 1


In [17]:
get_proba_of_movie_rating(4, 1)

0.4

In [18]:
df.loc[3].dropna().index.values

array([2, 3, 4, 5])

In [19]:
len(df[df[1] == 1])

2