In [None]:
import pandas as pd
import numpy as np
import random
import pickle

# Load the raw interaction records (one JSON object per line).
df = pd.read_json('goodreads_interactions_poetry.json', lines=True)

# Pull the three columns we need out as plain Python lists.
user_id = df['user_id'].tolist()
book_id = df['book_id'].tolist()
ratings = df['rating'].tolist()
interactions = [list(triple) for triple in zip(user_id, book_id, ratings)]

# Group the ratings per user: user -> {book: rating}. If a (user, book) pair
# occurs more than once, the rating that appears last wins.
user_dict = {}
for user, book, rating in interactions:
    user_dict.setdefault(user, {})[book] = rating

# One {book: rating} dict per user, in order of first appearance.
X = list(user_dict.values())

num_itters = 128
log_probs = []

n = 1000

# Persist the complete user list before it is truncated below.
with open('poetry_full_users.pickle', 'wb') as handle:
    pickle.dump(X, handle, protocol=pickle.HIGHEST_PROTOCOL)

# Keep only the first n users and renumber their book ids densely
# (0, 1, 2, ...) in order of first appearance; book_map remembers
# original id -> dense index.
X = X[:n]
book_map = {}
new_X = []
for user in X:
    books = {}
    for book, rating in user.items():
        # len(book_map) is evaluated before a new key is inserted, so an
        # unseen book receives the next free index.
        books[book_map.setdefault(book, len(book_map))] = rating
    new_X.append(books)
num_books = len(book_map)
X = new_X

In [None]:
## EM Algorithm
#
# Mixture model over users: every user belongs to one of `ny` latent
# categories, and given the category each book j is independently "liked"
# with probability prjyi[j][i]. Only the books a user actually rated
# contribute to that user's likelihood.

# A rating counts as "liked" when it is at least this value. The E-step and
# the log-likelihood used to test `>= 2` while the M-step tested `>= 3`, so
# the parameters were estimated for one event and scored against another.
# The M-step's definition is now used everywhere in this cell.
# NOTE(review): presumably ratings are Goodreads stars with 0 meaning
# "not rated" -- confirm that 3 is the intended cut-off.
LIKE_THRESHOLD = 3

k = num_books # number of books
ny = 5 # categories
# initialize prjyi with random values
prjyi = np.random.random((k, ny))
py = [0.4, 0.2, 0.2, 0.1, 0.1] # start with semi random values adding to 1


def _joint_probs(user_ratings, prjyi, py):
    """Return, per category i, py[i] times the likelihood of the user's ratings.

    user_ratings maps dense book index -> rating. Only rated books are
    visited, which gives the same product as scanning every book index and
    skipping the unrated ones.
    """
    joint = []
    for i in range(len(py)):
        likelihood = 1.0
        for j, rating in user_ratings.items():
            p_like = prjyi[j][i]
            likelihood *= p_like if rating >= LIKE_THRESHOLD else (1 - p_like)
        joint.append(likelihood * py[i])
    return joint


for itter in range(num_itters+1):
    # Iteration 0 only evaluates the log-likelihood of the initial parameters.
    if itter != 0:
        # E-step: responsibilities rhoit[t][i] = P(category i | user t's ratings).
        rhoit = []
        for t in range(n):
            probs = _joint_probs(X[t], prjyi, py)
            psum = sum(probs)
            rhoit.append([p / psum for p in probs])

        # Total responsibility per category. Computed once here; it is both
        # the numerator of the new prior and the denominator of every
        # prjyi update (it used to be recomputed for every single book).
        rho_totals = [sum(row[i] for row in rhoit) for i in range(ny)]
        py_new = [total / n for total in rho_totals]

        # New PRJYI
        # seen_rho[j][i]:  responsibility mass of users who rated book j
        # liked_rho[j][i]: the part of that mass coming from users who liked it
        seen_rho = [[0.0] * ny for _ in range(k)]
        liked_rho = [[0.0] * ny for _ in range(k)]
        for t in range(n):
            for j, rating in X[t].items():
                liked = rating >= LIKE_THRESHOLD
                for i in range(ny):
                    seen_rho[j][i] += rhoit[t][i]
                    if liked:
                        liked_rho[j][i] += rhoit[t][i]

        # Users who did not rate book j contribute their responsibility times
        # the current estimate; their total mass is rho_totals - seen_rho.
        prjyi_new = [
            [
                (liked_rho[j][i] + (rho_totals[i] - seen_rho[j][i]) * prjyi[j][i])
                / rho_totals[i]
                for i in range(ny)
            ]
            for j in range(k)
        ]

        ## Do update
        py = py_new
        print(py)
        prjyi = prjyi_new

    print("finished updates")
    ## Get Log Probabilities
    # Average log-likelihood per user under the current parameters.
    log_prob = 0.0
    for t in range(n):
        log_prob += np.log(sum(_joint_probs(X[t], prjyi, py)))
    print(itter)
    log_prob /= n
    log_probs.append(log_prob)

# Persist the fitted parameters.
with open('prjyi.pickle', 'wb') as handle:
    pickle.dump(prjyi, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('py.pickle', 'wb') as handle:
    pickle.dump(py, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Making predictions
# Pair every training user with the index of their largest responsibility
# (the most likely category), keeping the user's ratings dict next to it.
data_out = [[np.argmax(np.array(rhoit[i])), X[i]] for i in range(len(X))]

In [None]:
# Save the data
# Write the [category, ratings] rows to disk ...
with open('em_result.pickle', 'wb') as out_file:
    pickle.dump(data_out, out_file, protocol=pickle.HIGHEST_PROTOCOL)

# ... and read them straight back into `data` for the cells that follow.
with open('em_result.pickle', 'rb') as in_file:
    data = pickle.load(in_file)

In [None]:
#categories are 0,1,2,3,4
# Number of distinct books in the training set. This was hard-coded (3561
# here, 3562 for the `books` range), which only fits one particular run and
# was off by one between the two uses; derive it from the id map instead.
number_of_books = len(book_map)
books = list(range(number_of_books))

# Split the saved [category, ratings] rows into one list of ratings dicts
# per category.
shoppers_by_category = [[] for _ in range(5)]
for category, user_ratings in data:
    if 0 <= category < 5:
        shoppers_by_category[category].append(user_ratings)
shopper_0, shopper_1, shopper_2, shopper_3, shopper_4 = shoppers_by_category
shopper_0_books_ratings = list()

## map book ids to book names
# book_mapping: dense book index -> title, for books that occur in book_map.
book_mapping = {}
df = pd.read_json('goodreads_books_poetry.json', lines=True)
book_ids = df['book_id'].tolist()
titles = df['title'].tolist()
title_map = [[raw_id, title] for (raw_id, title) in zip(book_ids, titles)]
# The loop variable is deliberately not called `book_id`: that name holds the
# list of interaction book ids from the first cell and would be overwritten.
for raw_id, title in title_map:
    if raw_id in book_map:
        book_mapping[book_map[raw_id]] = title
    

In [None]:
# epsilon greedy approach
# Size of the dense book-index space. Taken from the id map built during
# preprocessing rather than hard-coded to 3562, so it stays correct when the
# number of training users (and therefore distinct books) changes.
number_of_books = len(book_map)
def epsilon_greedy_train(shopper_list, num_books):
    """Compute a running mean rating per book over a group of shoppers.

    shopper_list is an iterable of {book_index: rating} dicts and num_books
    is the number of book slots to allocate. Returns a list with one
    [count, mean] pair per book index; books nobody rated stay at [0, 0].
    """
    averages = [[0, 0] for _ in range(num_books)]

    for ratings in shopper_list:
        for book, rating in ratings.items():
            entry = averages[book]
            count, mean = entry
            # Incremental mean: undo the previous division, add the new
            # rating, then divide by the updated count.
            entry[0] = count + 1
            entry[1] = (mean * count + rating) / (count + 1)
    return averages
def calculate_epsilon_greedy(current_averages, num_books, epsilon, book_mapping, seen_books):
    """Pick one book title to recommend using an epsilon-greedy rule.

    Parameters:
        current_averages: list of [count, mean] pairs indexed by book index,
            as returned by epsilon_greedy_train.
        num_books: number of book indices to draw from when exploring.
        epsilon: probability of exploring (random pick) instead of exploiting.
        book_mapping: dict mapping book index -> title.
        seen_books: collection of titles that must not be returned again.

    Returns:
        The title of the chosen book.

    Raises:
        ValueError: if no book with a known title remains unseen. Before,
            this case looped forever when exploring and failed with a
            KeyError on index -1 when exploiting.

    Book indices that have no entry in book_mapping are skipped instead of
    raising KeyError.
    """
    if random.random() < epsilon:
        # Explore: uniform choice among indices whose title is still unseen.
        candidates = [
            idx for idx in range(num_books)
            if idx in book_mapping and book_mapping[idx] not in seen_books
        ]
        if not candidates:
            raise ValueError("no unseen book with a known title is left to recommend")
        return book_mapping[random.choice(candidates)]

    # Exploit: highest mean rating among unseen titles. The strict `>` keeps
    # the lowest index on ties, and the -1 starting value admits any
    # non-negative mean.
    best = -1
    best_idx = None
    for idx, stats in enumerate(current_averages):
        if idx not in book_mapping or book_mapping[idx] in seen_books:
            continue
        if stats[1] > best:
            best = stats[1]
            best_idx = idx
    if best_idx is None:
        raise ValueError("no unseen book with a known title is left to recommend")
    return book_mapping[best_idx]

In [None]:
## sample 10 recomendations for each category
# recomendations[i] ends up as a set of 10 distinct titles for category i.
recomendations = []
epsilon = 0.1
shoppers = [shopper_0, shopper_1, shopper_2 ,shopper_3 , shopper_4]
for i in range(5):
    # Mean rating per book within this category's shoppers.
    averages = epsilon_greedy_train(shoppers[i], number_of_books)
    seen_books = set()
    picks = set()
    recomendations.append(picks)
    # Keep drawing until ten different titles have been collected; titles
    # already handed out are excluded through seen_books.
    while len(picks) < 10:
        rec = calculate_epsilon_greedy(averages, number_of_books, epsilon, book_mapping, seen_books)
        seen_books.add(rec)
        picks.add(rec)

In [None]:
# read in saved data
with open('poetry_full_users.pickle', 'rb') as handle:
    data_new = pickle.load(handle)

# Keep the users who rated more than 3 books from the training book set
# (the check is `> 3`, i.e. four or more), so there is something to test on.
test_user_list = [
    ratings_by_book
    for ratings_by_book in data_new
    if sum(book in book_map for book in ratings_by_book) > 3
]
new_users = test_user_list

In [None]:
# Save the filtered test users, then keep only the last 1000 of them.
with open('poetry_full_users_test.pickle', 'wb') as handle:
    pickle.dump(new_users, handle, protocol=pickle.HIGHEST_PROTOCOL)
new_users = new_users[-1000:]

# Re-key each test user's ratings by the dense training book index; books
# that are not in the training book set are dropped.
new_user_mapping = [
    {book_map[b]: r for b, r in user_ratings.items() if b in book_map}
    for user_ratings in new_users
]


In [None]:
# Calculate probability of being in each category for each user
#
# Fixes relative to the earlier version of this cell:
#  * the normalisation ran inside the category loop, dividing by partial
#    sums on every pass; it now runs once per user after all categories;
#  * the loop ran over range(n) (the training-set size) instead of the
#    actual test users, which breaks when there are fewer than n of them;
#  * a rating counted as "liked" from 2 upwards, whereas the M-step that
#    produced prjyi estimated it for ratings of 3 and above.
# NOTE(review): confirm that 3 is the intended "liked" cut-off.
like_threshold = 3

rhoit_new = []
for user_ratings in new_user_mapping:
    probs = []
    for i in range(ny):
        temp = 1.0
        # Only rated books contribute to the likelihood.
        for j, rating in user_ratings.items():
            if rating >= like_threshold:
                temp *= prjyi[j][i]
            else:
                temp *= (1 - prjyi[j][i])
        probs.append(temp * py[i])
    psum = sum(probs)
    rhoit_new.append([p / psum for p in probs])

In [None]:
# assign each of the test users a category
# Each row is [most likely category, {dense book index: rating}].
data_out_new = [
    [np.argmax(np.array(rhoit_new[i])), new_user_mapping[i]]
    for i in range(len(new_user_mapping))
]

# create data set of user categories and books they have reviewed
# Same rows, but with the ratings keyed by book title instead of index.
data_out_new_mapping = [
    [category, {book_mapping[b]: r for b, r in rated.items()}]
    for category, rated in data_out_new
]

In [None]:
# Users with no reviewed book among their category's recommendations add
# nothing to either counter.
# Share of good recommendations:
#   num   = recommended books the test user rated 3 or higher
#   denom = recommended books the test user rated at all
num = 0
denom = 0
for user_test_cat, user_test_books in data_out_new_mapping:
    recommended = recomendations[user_test_cat]
    for test_book_name, test_book_rating in user_test_books.items():
        if test_book_name not in recommended:
            continue
        denom += 1
        if test_book_rating >= 3:
            num += 1

In [None]:
# Calculate % of users who were recommended a book that they actually reviewed
#   num   = test users with at least one reviewed book among the
#           recommendations for their category
#   denom = all test users
# num is reset here: it used to continue from the count left behind by the
# previous cell, which inflated the result (and grew on every re-run).
num = 0
denom = 0
for user_test_cat, user_test_books in data_out_new_mapping:
    denom += 1
    recommended = recomendations[user_test_cat]
    if any(test_book_name in recommended for test_book_name in user_test_books):
        num += 1