In [1]:
import numpy as np
import pandas as pd
import os
import surprise
import pickle
from src.utils import recommend, simulate
from sklearn.metrics.pairwise import euclidean_distances
import itertools
import time

In [2]:
MOVIE_LENS_PATH = os.path.abspath(os.path.join(os.getcwd(), '..', 'data', 'raw', 'movie_lense'))
MOVIE_LENS_PATH

'/home/levin/escaping_echo_chambers/data/raw/movie_lense'

### Recommender part

In [3]:
#Prepare the data for recommender:
movielens = pd.read_csv(os.path.join(MOVIE_LENS_PATH, 'ratings.csv'))#.drop('timestamp', axis = 1)
genome_scores = pd.read_csv(os.path.join(MOVIE_LENS_PATH, 'genome-scores.csv'))
# movielens = movielens[movielens.timestamp > (max(movielens['timestamp']) - 10*365*24*60*60)]
movielens = movielens.drop('timestamp', axis = 1)
movielens = movielens.rename(columns = {'userId':'uid', 'movieId':'iid', 'rating':'rating'})
#Let's look at how many movies have genome tags
movies_with_genome = genome_scores['movieId'].unique()
print('# of movies with genome tags:', len(movies_with_genome))
#Total number
print('# of movies in total:', len(movielens['iid'].unique()))
movielens_short = movielens[movielens.iid.isin(movies_with_genome)]
movielens = movielens_short

# of movies with genome tags: 13176
# of movies in total: 53889


In [4]:
#Sample the data for the recommender
n = 1000
np.random.seed(1)
sample_users = np.random.choice(movielens.uid.unique(), n)

In [5]:
movielens = movielens_short[movielens_short.uid.isin(sample_users)]
movielens.shape

(94157, 3)

In [6]:
#Prepare the first iteration data:
lower_rating = movielens['rating'].min()
upper_rating = movielens['rating'].max()
print('Range:', lower_rating, upper_rating)

Range: 0.5 5.0


In [7]:
reader = surprise.Reader(rating_scale = (lower_rating, upper_rating))
DATA = surprise.Dataset.load_from_df(movielens, reader)

In [8]:
%%time
alg = surprise.SVD()
output = alg.fit(DATA.build_full_trainset())

CPU times: user 3.94 s, sys: 290 µs, total: 3.94 s
Wall time: 3.93 s


### Diversity Measurement Part

In [9]:
#Compute diversity part
genome_scores_df = pd.read_csv(os.path.join(MOVIE_LENS_PATH, 'genome-scores.csv'))
genome_tags_df = pd.read_csv(os.path.join(MOVIE_LENS_PATH, 'genome-tags.csv'))
tgsdf = genome_scores_df.pivot(index='movieId',columns='tagId',values='relevance')
genome_tags_dict = dict(zip(genome_tags_df['tagId'],genome_tags_df['tag']))
movie_tags_df = tgsdf.rename(genome_tags_dict,axis='columns')
dist = euclidean_distances(movie_tags_df,movie_tags_df)

In [10]:
def load_evolution_df(filename):
    evolution_df = pd.read_csv(filename)
    # Clean
    evolution_df.rename({'Unnamed: 0':'Epoch','Unnamed: 1':'userId'},axis='columns',inplace=True)
    evolution_df['userId'] = [int(s.split('_')[1]) for s in evolution_df['userId'].values]
    evolution_df['Epoch'] = [int(s.split('_')[1]) for s in evolution_df['Epoch'].values]
    return evolution_df

# def get_user_recommended_diversity(userID,epoch, evolution_df):
    
#     tdf = evolution_df[(evolution_df['Epoch']==epoch) & (evolution_df['userId']==userID)]
#     movies_list = tdf.drop(labels=['Epoch','userId'],axis=1).values[0]
#     print(movies_list)
#     # convert to list of indices into distance matrix
#     movies_list_idx = [movie_tags_df.index.get_loc(i) for i in movies_list]

#     return compute_average_diversity(movies_list_idx)

def get_user_recommended_diversity(movies_list):
    movies_list_idx = [movie_tags_df.index.get_loc(i) for i in movies_list]
    return compute_average_diversity(movies_list_idx)

def compute_average_diversity(movies_list_idx):
    tot_dist = 0
    count = 0
    for i in itertools.combinations(movies_list_idx,2):
        count += 1 
        x,y = i
        d = dist[x,y]
        tot_dist+=d
    return tot_dist/count

In [11]:
# evolution_df = load_evolution_df('evolution_200epochs_10users_sim_1000users_total_15mov.csv')
# evolution_df['Epoch'].max()

In [12]:
# get_user_recommended_diversity(128088, 0, evolution_df)

In [16]:
# len(get_unseen_movie_ids(128088)) #The function returns ids for the movies in the needed order,
# #the unseen_movies_possible_ratings are the corresponding ratings

In [17]:
#uid should be global
def get_unseen_movie_ids(uid, data):
    all_movie_ids = data.df['iid'].unique()
    uid_rated = data.df[data.df['uid'] == uid]['iid']
    movies_to_recommend = np.setdiff1d(all_movie_ids, uid_rated)
    return movies_to_recommend


def add_possible_ratings(unseen_movies_possible_ratings, data, UID, lower_rating = 0.5, upper_rating=5.0):    
    unseen_movie_ids = get_unseen_movie_ids(UID, data)
    uid_list = [UID]*len(unseen_movie_ids)
    update_df = pd.DataFrame(list(zip(uid_list, unseen_movie_ids, unseen_movies_possible_ratings)), 
               columns =['uid', 'iid', 'rating'])
    update_df = update_df[(update_df['rating'] >= 0.5) & (update_df['rating'] <= 5.0)]
#     print(update_df)
    print('Number of movies added this time: {}'.format(len(update_df)))
    data_new = data.df.append([update_df]).reset_index(drop = True)
    reader = surprise.Reader(rating_scale = (lower_rating, upper_rating))
    data_updated = surprise.Dataset.load_from_df(data_new, reader)
    return data_updated

In [110]:
def manipulate_ratings_to_increase_diversity(unseen_movies_possible_ratings):
    TOP_N = 50 #Possibly, increasing this simlifies optimization
    #Add ratings
    data = add_possible_ratings(unseen_movies_possible_ratings, DATA, UID)
    #Based on those ratings, fit and recommend
#     user_top_iids = [] #Recommendations
#     user_top_rating = [] #Corresponding ratings
    #Fit the recommender
    start_fit = time.time()
    print('Shape of the data:', data.df.shape)
    # Global ALG
    model = ALG.fit(data.build_full_trainset())
    end_fit = time.time()
    print('    Fit took {} seconds'.format(end_fit-start_fit))
    #Recommend for the UID, but don't assume any watching, use the appended data
    start_predict = time.time()
    iid_recommended, ratings_recommended, data_update = recommend(UID, data, model, top_n = TOP_N)
#     user_top_iids.append(iid_recommended)
#     user_top_rating.append(ratings_recommended)
    diversity = get_user_recommended_diversity(iid_recommended)
    end_predict = time.time()
    print('    Predict took {} seconds'.format(end_predict-start_predict))
    print('Current Diversity: ', diversity)
    print('Curren norm', np.linalg.norm(unseen_movies_possible_ratings, 1))
    return 1/(diversity + 1) #ALPHA*np.linalg.norm(unseen_movies_possible_ratings, 1)#

In [66]:
# def recommend(uid, data, model, top_n = 100):
#     """
#     Returns the mean and covariance matrix of the demeaned dataset X (e.g. for PCA)
    
#     Parameters
#     ----------
#     uid : int
#         user id
#     data : surprise object with data
#         The entire system, ratings of users (Constructed with reader from surprise)
#     model : susrprise object
#         Trained algorithm
#     top_n : int
#         The number of movies to recommend

#     Returns
#     -------
#     pd.DataFrame
#         recommended movies
#     pd.DataFram
#         predicted ratings for the recommended movies
#     data_update
#         predicted movies and ratings in the movielens format (uid, iid, rating)
    
#     """
#     all_movie_ids = data.df['iid'].unique()
#     uid_rated = data.df[data.df['uid'] == uid]['iid']
#     movies_to_recommend = np.setdiff1d(all_movie_ids, uid_rated)
#     prediction_set = [[uid, iid, 0] for iid in movies_to_recommend] #here 0 is arbitrary, ratings don't matter
#     predictions = model.test(prediction_set)
#     pred_ratings = np.array([pred.est for pred in predictions])
#     top = pred_ratings.argsort()[::-1][:top_n]
#     print(movies_to_recommend[top], pred_ratings[top])
#     data_update = pd.DataFrame([[uid, movies_to_recommend[top][i], pred_ratings[top][i]] for i in range(top_n)], columns = ['uid', 'iid', 'rating'])
#     return movies_to_recommend[top], pred_ratings[top], data_update#

In [33]:
UID = 128088
n = len(get_unseen_movie_ids(128088, DATA))
ALG = surprise.SVD()

# random_ratings = np.random.rand(n)
# random_indices = np.random.choice(np.arange(n), 100)
# random_ratings[random_indices] = 5.0
manipulate_ratings_to_increase_diversity(random_ratings)

Number of movies added this time: 3965
Shape of the data: (98122, 3)
    Fit took 4.244449138641357 seconds
    Predict took 0.027065277099609375 seconds


7.53067747917412

In [34]:
#Should be smooth so let's try constrained trust region
manipulate_ratings_to_increase_diversity(random_ratings+0.1)-manipulate_ratings_to_increase_diversity(random_ratings+0.1)

Number of movies added this time: 4631
Shape of the data: (98788, 3)
    Fit took 4.198207139968872 seconds
    Predict took 0.02157735824584961 seconds
Number of movies added this time: 4631
Shape of the data: (98788, 3)
    Fit took 4.172102928161621 seconds
    Predict took 0.02150273323059082 seconds


-0.14614622915236364

In [78]:
def callbackF(Xi):
    obj_val = manipulate_ratings_to_increase_diversity(Xi)
    objective_history.append(obj_val)
    
    N_iter = 1
    print('{0:4s}   {1:9s}   {2:9s}   {3:9s}'.format(N_iter, np.max(Xi), np.sum(Xi>=0.5), obj_val))
    N_iter += 1

In [94]:
def f(x):
    return np.linalg.norm(x, ord = 1)

In [106]:
import numpy as np
import scipy.optimize as scopt

UID = 128088
n = len(get_unseen_movie_ids(128088, DATA))
ALG = surprise.SVD()
ALPHA = 0.1 

objective_history = []
x0 = 0.5*np.random.randn(n) + 0.5
#bounds = scopt.Bounds([-2.0,-0.5,-2.0],[2.0,0.8,0.7])
bounds = scopt.Bounds([0.0]*n,[5.0]*n)#[(0.0,5.0)]*n 


# print('{0:4s}   {1:9s}   {2:9s}   {3:9s}'.format('Iter', 'Max Given', 'Num > .5', 'f(X)'))
# Res = scopt.minimize(manipulate_ratings_to_increase_diversity, x0, method = 'trust-constr', bounds = bounds, 
#                      tol = 1e-3, options={"maxiter":3})#"maxfun":6})
Res = scopt.minimize(manipulate_ratings_to_increase_diversity, x0, method = 'Powell', tol = 1e-3, options={"maxiter":10, 'disp':True})#"maxfun":6})
print(Res)

Number of movies added this time: 3986
Shape of the data: (98143, 3)
    Fit took 4.318876504898071 seconds
    Predict took 1.6540324687957764 seconds
Current Diversity:  6.083438036162226
Curren norm 4636.094202476711
Number of movies added this time: 3986
Shape of the data: (98143, 3)
    Fit took 4.106801271438599 seconds
    Predict took 1.6859323978424072 seconds
Current Diversity:  6.089641549922294
Curren norm 4636.094202476711
Number of movies added this time: 3986
Shape of the data: (98143, 3)
    Fit took 4.314179420471191 seconds
    Predict took 1.6022934913635254 seconds
Current Diversity:  6.0947255243949385
Curren norm 4637.094202476711
Number of movies added this time: 3986
Shape of the data: (98143, 3)
    Fit took 4.23164701461792 seconds
    Predict took 1.6432173252105713 seconds
Current Diversity:  6.08088870114714
Curren norm 4638.712236476711
Number of movies added this time: 3986
Shape of the data: (98143, 3)
    Fit took 4.450747489929199 seconds
    Predict t

    Fit took 4.138841390609741 seconds
    Predict took 1.5839815139770508 seconds
Current Diversity:  6.0900104075811985
Curren norm 4651.932149074364
Number of movies added this time: 3988
Shape of the data: (98145, 3)
    Fit took 4.153506278991699 seconds
    Predict took 1.6394422054290771 seconds
Current Diversity:  6.091256695038672
Curren norm 4654.333404753019
Number of movies added this time: 3988
Shape of the data: (98145, 3)
    Fit took 4.358616352081299 seconds
    Predict took 1.7184514999389648 seconds
Current Diversity:  6.08004168838408
Curren norm 4655.817462405121
Number of movies added this time: 3988
Shape of the data: (98145, 3)
    Fit took 4.498501777648926 seconds
    Predict took 1.69134521484375 seconds
Current Diversity:  6.090032199339197
Curren norm 4653.181141059326
Number of movies added this time: 3988
Shape of the data: (98145, 3)
    Fit took 4.476044654846191 seconds
    Predict took 1.6411867141723633 seconds
Current Diversity:  6.081157979593692
C

Shape of the data: (98144, 3)
    Fit took 4.1466124057769775 seconds
    Predict took 1.6733863353729248 seconds
Current Diversity:  6.0874303383679145
Curren norm 4654.688253569111
Number of movies added this time: 3987
Shape of the data: (98144, 3)
    Fit took 4.123229265213013 seconds
    Predict took 1.6016933917999268 seconds
Current Diversity:  6.083141350201396
Curren norm 4654.688253569111
Number of movies added this time: 3987
Shape of the data: (98144, 3)
    Fit took 4.240194797515869 seconds
    Predict took 1.6626560688018799 seconds
Current Diversity:  6.0783474234475054
Curren norm 4654.484577514839
Number of movies added this time: 3987
Shape of the data: (98144, 3)
    Fit took 4.179245948791504 seconds
    Predict took 1.6693894863128662 seconds
Current Diversity:  6.079178433038252
Curren norm 4656.30628756911
Number of movies added this time: 3987
Shape of the data: (98144, 3)
    Fit took 4.2058703899383545 seconds
    Predict took 1.778761386871338 seconds
Curre

    Predict took 1.7646427154541016 seconds
Current Diversity:  6.075909583144828
Curren norm 4660.688253553563
Number of movies added this time: 3988
Shape of the data: (98145, 3)
    Fit took 4.204951047897339 seconds
    Predict took 1.6233186721801758 seconds
Current Diversity:  6.072405755986191
Curren norm 4661.282038842138
Number of movies added this time: 3988
Shape of the data: (98145, 3)
    Fit took 4.344954490661621 seconds
    Predict took 1.697756052017212 seconds
Current Diversity:  6.085953914202629
Curren norm 4660.924321528408
Number of movies added this time: 3988
Shape of the data: (98145, 3)
    Fit took 4.434638977050781 seconds
    Predict took 1.7629215717315674 seconds
Current Diversity:  6.082272417999909
Curren norm 4661.170219553574
Number of movies added this time: 3988
Shape of the data: (98145, 3)
    Fit took 4.257676839828491 seconds
    Predict took 1.8131258487701416 seconds
Current Diversity:  6.071410530363366
Curren norm 4661.070219553563
Number of

KeyboardInterrupt: 

In [None]:
import blackbox as bb


# def fun(par):
#     return par[0]**2 + par[1]**2  # dummy example

UID = 128088
n = len(get_unseen_movie_ids(128088, DATA))
ALG = surprise.SVD()
ALPHA = 0.1 

best_params = bb.search_min(f = manipulate_ratings_to_increase_diversity,  # given function
                            domain = [  # ranges of each parameter
                                [0.0, 5.0],
                                ]*n,
                            budget = 20000,  # total number of function calls available
                            batch = 4,  # number of calls that will be evaluated in parallel
                            resfile = 'best_params.csv')  # text file where results will be saved

[blackbox] evaluating batch 1/5000 (samples 1..4/20000) @ 2020-10-03 22:04:19 ...
Number of movies added this time: 5865
Number of movies added this time: 7217
Number of movies added this time: 6767
Number of movies added this time: 7184
Shape of the data: (100022, 3)
Shape of the data: (101374, 3)
Shape of the data: (100924, 3)
Shape of the data: (101341, 3)
    Fit took 4.782639741897583 seconds
    Predict took 0.00904083251953125 seconds
Current Diversity:  6.645415282119791
Curren norm 20993.4424811981
    Fit took 4.837298154830933 seconds
    Fit took 4.777379989624023 seconds
    Predict took 0.022381067276000977 seconds
Current Diversity:  6.983232005734879
    Predict took 0.013930559158325195 seconds
Curren norm 8806.72124059905
Current Diversity:  6.787082748985031
Curren norm 15955.163721797147
    Fit took 4.841265678405762 seconds
    Predict took 0.00931859016418457 seconds
Current Diversity:  6.705869303571793
Curren norm 20476.884962396198
[blackbox] evaluating batch 

    Fit took 4.8645920753479 seconds
    Fit took 4.843224287033081 seconds
    Predict took 0.016425132751464844 seconds
Current Diversity:  6.795938893316216
Curren norm 19526.800939768633
    Predict took 0.01630425453186035 seconds
Current Diversity:  6.520838377554528
Curren norm 19953.52218036768
    Fit took 4.875979661941528 seconds
    Predict took 0.010249853134155273 seconds
Current Diversity:  6.503572620628584
Curren norm 19956.96466156578
    Fit took 4.988277435302734 seconds
    Predict took 0.010155200958251953 seconds
Current Diversity:  6.353263516343455
Curren norm 19565.243420966726
[blackbox] evaluating batch 10/5000 (samples 37..40/20000) @ 2020-10-03 22:05:08 ...
Number of movies added this time: 7132
Number of movies added this time: 7160
Number of movies added this time: 7134
Number of movies added this time: 7162
Shape of the data: (101289, 3)
Shape of the data: (101317, 3)
Shape of the data: (101319, 3)
Shape of the data: (101291, 3)
    Fit took 4.863867521

    Fit took 4.937357187271118 seconds
    Predict took 0.012985467910766602 seconds
Current Diversity:  6.516525597720413
Curren norm 19923.601879537266
    Predict took 0.011926889419555664 seconds
Current Diversity:  6.169145952802895
Curren norm 19710.32312013631
[blackbox] evaluating batch 18/5000 (samples 69..72/20000) @ 2020-10-03 22:05:52 ...
Number of movies added this time: 7150

Number of movies added this time: 7146Number of movies added this time: 7160
Number of movies added this time: 7163
Shape of the data: (101317, 3)
Shape of the data: (101307, 3)
Shape of the data: (101303, 3)
Shape of the data: (101320, 3)
    Fit took 4.819348335266113 seconds
    Predict took 0.010093927383422852 seconds
Current Diversity:  6.413028287430741
Curren norm 19728.76560133441
    Fit took 4.868372201919556 seconds
    Fit took 4.808038949966431 seconds
    Predict took 0.010963678359985352 seconds
Current Diversity:  6.853749861904119
    Predict took 0.01105189323425293 seconds
Curren 

Number of movies added this time: 7153
Number of movies added this time: 7162
Number of movies added this time: 7152
Number of movies added this time: 7164
Shape of the data: (101310, 3)
Shape of the data: (101319, 3)
Shape of the data: (101309, 3)
Shape of the data: (101321, 3)
    Fit took 4.740772008895874 seconds
    Predict took 0.015520095825195312 seconds
Current Diversity:  6.536916773779341
Curren norm 19900.566541103042
    Fit took 4.880470037460327 seconds    Fit took 4.846471786499023 seconds

    Predict took 0.013731002807617188 seconds
Current Diversity:      Predict took 0.01638650894165039 seconds6.6889015085702885

Current Diversity:  Curren norm6.162833998396087 
19778.845300503992
Curren norm 19762.287781702085
    Fit took 4.81395697593689 seconds
    Predict took 0.010112524032592773 seconds
Current Diversity:  6.415273839509485
Curren norm 19909.009022301143
[blackbox] evaluating batch 27/5000 (samples 105..108/20000) @ 2020-10-03 22:06:40 ...
Number of movies a

    Predict took 0.01618194580078125 seconds
Current Diversity:  6.028231381241817
Curren norm 19803.924999673574
    Fit took 4.947053670883179 seconds
    Fit took 4.879491090774536 seconds
    Predict took 0.016349315643310547 seconds    Fit took 4.900029897689819 seconds
    Predict took 0.011301040649414062 seconds
Current Diversity:  
6.636012256647068Current Diversity: 
 6.333792099824534Curren norm
Curren norm  19899.0887214707219910.646240272617

    Predict took 0.013590097427368164 seconds
Current Diversity:  6.317361688282389
Curren norm 19797.36748087167
[blackbox] evaluating batch 35/5000 (samples 137..140/20000) @ 2020-10-03 22:07:24 ...
Number of movies added this time: 7148
Number of movies added this time: 7163
Number of movies added this time: 7153
Number of movies added this time: 7164
Shape of the data:

In [109]:
best_params 128088

array([0.02201063, 0.00404368])

In [80]:
objective_history

[]

In [None]:



# x0 = np.array([1.1, 1.1, 1.1], dtype=np.double)
# [xopt, fopt, gopt, Bopt, func_calls, grad_calls, warnflg] = \
#     fmin_bfgs(rosen, 
#               x0, 
#               callback=callbackF, 
#               maxiter=2000, 
#               full_output=True, 
#               retall=False)

In [36]:
print('{0:4s}   {1:9s}   {2:9s}   {3:9s}   {4:9s}'.format('Iter', ' X1', ' X2', ' X3', 'f(X)'))

Iter    X1          X2          X3         f(X)     


Iter   Max Given   Num > .5    f(X)     


In [41]:
print ('{0:4s}   {1:9s}   {2:9s}   {4:9s}'.format('Iter', 'Max Given', 'Num > 0.5', 'f(X)'))  

IndexError: Replacement index 4 out of range for positional args tuple