In [1]:
import pandas as pd
import numpy as np
from lenskit import batch, topn, util
from lenskit.algorithms import Recommender, als, user_knn, item_knn, svd, user_knn
from lenskit.algorithms.ranking import TopN
from lenskit.algorithms.basic import UnratedItemCandidateSelector, Popular
from lenskit import crossfold as cf

from collections import deque
import random
import gc

In [2]:
pd.set_option('mode.chained_assignment', None)

In [3]:
ratings=pd.read_parquet("Data/Experiment_Data/full_training_subset_of_select_tracks_experiment.parquet")
ratings.columns=['user', 'item', 'rating']

In [4]:
user_age_index_df=pd.read_parquet("Data/Experiment_Data/user_age_index_basic_for_experiment_updated.parquet")
user_age_index_df=user_age_index_df.reset_index()
artists_index_df=pd.read_parquet("Data/Experiment_Data/tracks_artist_index_for_experiment.parquet")

#Convert the user age and artist data into a lookup dictonary
user_age_index=dict(zip(list(user_age_index_df['user']), list(user_age_index_df['user_age'])))
artists_index=dict(zip(list(artists_index_df['item']), list(artists_index_df['artist'])))

In [5]:
user_counts=ratings.groupby('user').count().reset_index()
users_admited=user_counts[user_counts['item']>20]
users_admited=pd.DataFrame({'user':users_admited.user.unique()})
ratings=ratings.merge(users_admited,how='inner',on='user')

In [6]:
def user_similarity(user1,user2,items_per_user):
    user1_item_set=items_per_user[user1]
    user2_item_set=items_per_user[user2]
    return len(user1_item_set.intersection(user2_item_set))/len(user1_item_set.union(user2_item_set))
def claculate_similarities():
    age_ranges=[(10,20),(16,26),(46,56),(49,59),(49,55),(55,61)]  

    ratings_w_ages=ratings.merge(user_age_index_df,how='inner',on='user')

    user_items=ratings.groupby('user').item.unique()

    for i in user_items.index:
        user_items[i]=set(user_items[i])

    for age_range_start,age_range_end in age_ranges:

        similarity_sum_all_in_group=0
        similarity_sum_with_similar_in_group=0

        sim_user_number_sum=0

        similarity_sum_all_outof_group=0
        similarity_sum_with_similar_outof_group=0

        users_with_no_similarity_in_group=0
        users_with_no_similarity_outof_group=0
        users_with_no_similarity_general=0

        users_in_age_range=ratings_w_ages[(ratings_w_ages['user_age']>=age_range_start) & (ratings_w_ages['user_age']<=age_range_end)]
        users_in_age_range=users_in_age_range.user.unique()

        print('Age Range: ',age_range_start,'-',age_range_end)
        print('Number of users in age range: ',len(users_in_age_range))
        other_users=set(ratings_w_ages.user.unique()).difference(set(users_in_age_range))
        for user1 in users_in_age_range:

            #Test Within age group
            current_user_similarity_sum=0
            users_with_similarity=0
            for user2 in users_in_age_range:

                if(user1==user2):
                    continue

                similarity=user_similarity(user1,user2,user_items)

                current_user_similarity_sum=current_user_similarity_sum+similarity

                if similarity>0:
                    users_with_similarity=users_with_similarity+1

            similarity_sum_all_in_group=similarity_sum_all_in_group+(current_user_similarity_sum/len(users_in_age_range))
            similarity_sum_with_similar_in_group=similarity_sum_with_similar_in_group+(current_user_similarity_sum/users_with_similarity)

            sim_user_number_sum=sim_user_number_sum+users_with_similarity

            if current_user_similarity_sum==0:
                users_with_no_similarity_in_group=users_with_no_similarity_in_group+1

            current_user_similarity_sum_passed=current_user_similarity_sum

            #Test out of age group
            current_user_similarity_sum=0
            users_with_similarity=0



    #         for user2 in other_users:

    #             if(user1==user2):
    #                 continue

    #             similarity=user_similarity(user1,user2,user_items)

    #             current_user_similarity_sum=current_user_similarity_sum+similarity

    #             if similarity>0:
    #                 users_with_similarity=users_with_similarity+1

    #         similarity_sum_all_outof_group=similarity_sum_all_outof_group+(current_user_similarity_sum/len(other_users))
    #         similarity_sum_with_similar_outof_group=similarity_sum_with_similar_outof_group+(current_user_similarity_sum/users_with_similarity)

    #         if current_user_similarity_sum==0:
    #             users_with_no_similarity_outof_group=users_with_no_similarity_outof_group+1

    #         if (current_user_similarity_sum==0) & (current_user_similarity_sum_passed==0):
    #             users_with_no_similarity_general=users_with_no_similarity_in_group+1






        print('Number of users in age range with no similarity to other users in their age group: ',users_with_no_similarity_in_group)
        print('Average similiarity of users between themselved within the age range: ',similarity_sum_all_in_group/len(users_in_age_range))
        print('Average similiarity of only similar users within the age range: ',similarity_sum_with_similar_in_group/(len(users_in_age_range)-users_with_no_similarity_in_group))
        print("The average number of users a users is similar with within the age group:",sim_user_number_sum/len(users_in_age_range))

    #     print('Number of users in age range with no similarity to other users out of their group: ',users_with_no_similarity_outof_group)
    #     print('Average similiarity of users with users out of their age range: ',similarity_sum_all_outof_group/len(other_users))
    #     print('Average similiarity of only similar users users out of their age range: ',similarity_sum_with_similar_outof_group/(len(other_users)-users_with_no_similarity_outof_group))


    #     print('Number of users in age range with no similarity to other users in general (in and out of their group): ',users_with_no_similarity_general)


In [8]:
claculate_similarities()
gc.collect()

Age Range:  10 - 20
Number of users in age range:  11069
Number of users in age range with no similarity to other users in their age group:  0
Average similiarity of users between themselved within the age range:  0.008431282034049657
Average similiarity of only similar users within the age range:  0.013672836159205488
The average number of users a users is similar with within the age group: 6489.645857801066
Age Range:  16 - 26
Number of users in age range:  26208
Number of users in age range with no similarity to other users in their age group:  0
Average similiarity of users between themselved within the age range:  0.00855732158513584
Average similiarity of only similar users within the age range:  0.01252244192007138
The average number of users a users is similar with within the age group: 16900.90544871795
Age Range:  46 - 56
Number of users in age range:  795
Number of users in age range with no similarity to other users in their age group:  0
Average similiarity of users betwee

0

In [9]:
Item_similarities={}

def item_dist(item1,item2):
    
    #key_pair_1=str(item1)+','+str(item2)
   # key_par_2=str(item1)+','+str(item2)
    
    #if(key_pair_1 in Item_similarities.keys()):
        #return Item_similarities[key_pair_1]
    #else:
    
    #    item1_users=set(ratings[ratings['item']==item1].user.unique())
    #    item2_users=set(ratings[ratings['item']==item2].user.unique())
    #    distance=len(item1_users.intersection(item2_users))/len(item1_users.union(item2_users))
        
    #    Item_similarities[key_pair_1]=distance
     #   Item_similarities[key_pair_2]=distance
        
        
    return i2i_sim_matrix[item1][item2]

def calculate_intralist_diversity(topN_list):
    
    items=topN_list.item.unique()
    list_length=len(topN_list)
    
    item_distances_sum=sum([sum([item_dist(i,j) for j in items]) for i in items])
    
    return item_distances_sum/(list_length*(list_length-1))

In [7]:

def ajdust_diversity_user(userID, diversity, topN_list, n=100,k=50):
    
    similar_track_removal_probability=int(np.floor(diversity*100))
    #print(similar_track_removal_probability)
    artist_already_pressent=[]
    removed_tracks_indices=[]
    
    for i in range(0,len(topN_list)):
        candidate=topN_list.iloc[i]
        item=int(candidate['item'])
        rank=int(candidate['rank'])
        item_artist=artists_index[item]
        
        if (item_artist in artist_already_pressent):
            removal_chance=random.randrange(1,100)
            
            if(removal_chance<similar_track_removal_probability):
                removed_tracks_indices.append(i)
                #print(removal_chance, " removed artist ",item_artist)
        else:
            artist_already_pressent.append(item_artist)
            #print("New artist ",item_artist)
        
        if i == k:
            break
    
    topN_list_final=topN_list.drop(removed_tracks_indices).reset_index(drop=True)
    topN_list_final['rank']=list(range(1,len(topN_list_final)+1))
    
    topN_list_final=topN_list_final.head(k)
    
    if len(topN_list_final)>0:
        return len(artist_already_pressent)/len(topN_list_final), topN_list_final
    else:
        return 0, topN_list_final



def adjust_divesities(users, diversity, TopN_lists_persanolized, n=100,k=50):
    final_list=None
    set_list=True
    
    diversities=[]
    
    #users_processed=0
    
    for user in users:
        
        user_list=TopN_lists_persanolized[TopN_lists_persanolized['user']==user]
        user_list=user_list.reset_index(drop=True)
        
        diversitiy, user_adujusted_list=ajdust_diversity_user(user, diversity, user_list, n=n,k=k)
        diversities.append(diversitiy)
        if set_list:
            final_list=user_adujusted_list
            set_list=False
        else:
            final_list=final_list.append(user_adujusted_list)
        
#         users_processed=users_processed+1
#         if users_processed % 1000 == 0:
#             print("Users Processed: ",users_processed/1000,"K")
    diversities=list(filter(lambda x: x != 0, diversities))       
    return sum(diversities)/len(diversities), final_list

In [8]:
def convert_to_DF(series):
    return pd.DataFrame(series).T

def generate_recommendations(user_set, algorithm, n=100):
    
    #Adjust Diversity of Recomendations

    recommendations = batch.recommend(algorithm, user_set, n ,n_jobs=25)
    
    #recommendations=adjust_divesities(user_set,recommendations, diversity, n=n)
    
    return recommendations

def evaluate_recomendations(recomendations, truth, k=100):
    
    analysis = topn.RecListAnalysis()
    analysis.add_metric(topn.ndcg,k=k)
    analysis.add_metric(topn.precision,k=k)
    analysis.add_metric(topn.recall,k=k)
    analysis.add_metric(topn.hit)
    results = analysis.compute(recomendations, truth)
    
    return results


def shanon_enthropy(rec_list):
    
    rec_list=rec_list.groupby('item').count().reset_index()
    rec_list=list(rec_list['user'])
    
    total_recs=sum(rec_list)
    probs=np.array(rec_list)/total_recs
    
    enthropy=sum(probs*np.log2(probs))*-1
    
    return enthropy



In [None]:
## Crossvalidate sliding window

#Evaluate Recomenders
age_ranges=[(10,64)]#[(10,20),(16,26),(46,56),(49,59),(49,55),(55,61)] # 5,7 already tested at this level !ADD IT FOR RETEST
#neighbours_set=[3,6,12,24,50]
neighbours_set=[6,12,24,50]
#diversities=[(0.2,0.8),(0.3,0.7),(0.4,0.6),(0.15,0.4)]
#diversities=[(0,0),(0.05,0.05),(0.1,0.1),(0.15,0.15),(0.2,0.2),(0.25,0.25),(0.30,0.30),(0.4,0.4),(0.5,0.5)]
diversities=[0,0.2,0.4,0.6] #,0.4,0.8]#,(0.1,0.1),(0.2,0.2),(0.4,0.4)
list_lengths=[10,5]
n=50


results=None
set_results=True
cv_iter=0

for train, test in cf.partition_users(ratings[['user', 'item', 'rating']], 5, cf.SampleFrac(0.2)):
    cv_iter=cv_iter+1
    print("Testing in CV iteration:",cv_iter)
    
    for neighbours in neighbours_set:
        print("Traning recommender with",neighbours,"neighbours") #, save_nbrs=100
        predictor = user_knn.UserUser(neighbours,min_nbrs=neighbours,center=False,feedback='implicit',use_ratings=False)#
        Unseen_item_selector = UnratedItemCandidateSelector()
        recommender = TopN(predictor, Unseen_item_selector)    
        predictor.fit(train)
        Unseen_item_selector.fit(train)
        
        recomendations_all=generate_recommendations(test.user.unique(),recommender,n=n)
        #recomendations_all=recomendations_all.merge(artists_index_df,on='item',how='inner')
        
        for k in list_lengths:
            print("Testing at",k,"recommendations")
            for diversity in diversities:  
            #younger_older_diversities=[(low_diversity, high_diversity) , (high_diversity, low_diversity)]
                for age_range_start,age_range_end in age_ranges:

                    
                    age_range_str=str(age_range_start)+'-'+str(age_range_end)

                    users_in_age_range=user_age_index_df[(user_age_index_df['user_age']>=(age_range_start)) & 
                                            (user_age_index_df['user_age']<=(age_range_end))]                

                    info= "age range "+age_range_str+" with "+str(neighbours)+" neighbours and " + str(diversity) + " diversity, " 
                    print('Processing',info)

                    truth_in_age_range=test.merge(users_in_age_range[['user']], how='inner', on='user')

                    recs_in_age_range = recomendations_all.merge(users_in_age_range[['user']], how='inner', on='user')

                    print("============")
                    print("WE ARE RECCOEMEDNING TO")
                    print("============")
                    
                    testing=recs_in_age_range.groupby('user').count().reset_index()
                    print(len(testing[testing['score']>0]))
                    
                    print("USERS")
                    print("============")
                    Intralist_diversity_avg, recs_in_age_range = adjust_divesities(users_in_age_range.user.unique(), diversity, 
                                                          recs_in_age_range, n=n,k=k)




                    results_i=evaluate_recomendations(recs_in_age_range, truth_in_age_range, k=k)
                    results_i=results_i[["ndcg","precision","recall","hit"]].mean()
                    results_i=convert_to_DF(results_i)

                    diversity_pressent=shanon_enthropy(recs_in_age_range)

                    print('Final Evaluation for',k, 'recomendations in', info, ", with Inta-user",Intralist_diversity_avg,'and',diversity_pressent,"measured Shannon diversity")
                    print(results_i)

                    #results_i=crossvalidate_ages(younger_users_recomender,older_users_recomender,
                    #                   Unseen_item_selector, ratings,
                    #                    younger_users, older_users,
                    #                   younger_diversity, older_diversity,
                    #                    info=info, n=100)


                    results_i['List_Len']=[k]*len(results_i)
                    results_i['Neighbours']=[neighbours]*len(results_i)
                    results_i['Diversity_adjustment']=[diversity]*len(results_i)
                    results_i['Intralist_Diversity_calculated']=[Intralist_diversity_avg]*len(results_i)
                    results_i['Shannon_diversity_pressent']=[diversity_pressent]*len(results_i)
                    results_i['Age_range']=[age_range_str]*len(results_i)
                    results_i['CV_iter']=[cv_iter]*len(results_i)
                    if set_results:
                        results=results_i
                        set_results=False
                    else:
                        results=results.append(results_i)
                #break
            #break
        #break
    #break
    sifix="_at_fold_"+str(cv_iter)
    results.to_parquet("Data/Experiment_Data/Results_Cross_validationNew_5_10_2"+sifix+".parquet")

results.to_parquet("Data/Experiment_Data/Results_Cross_validationNew_5_10_2_Final.parquet")    
    
    
results.head()                


Testing in CV iteration: 1
Traning recommender with 6 neighbours
Testing at 10 recommendations
Processing age range 10-64 with 6 neighbours and 0 diversity, 
WE ARE RECCOEMEDNING TO
7620
USERS
Final Evaluation for 10 recomendations in age range 10-64 with 6 neighbours and 0 diversity,  , with Inta-user 0.7740157480315023 and 11.548471815413743 measured Shannon diversity
       ndcg  precision    recall       hit
0  0.122642   0.194646  0.199296  0.790945
Processing age range 10-64 with 6 neighbours and 0.2 diversity, 
WE ARE RECCOEMEDNING TO
7620
USERS
Final Evaluation for 10 recomendations in age range 10-64 with 6 neighbours and 0.2 diversity,  , with Inta-user 0.7740157480315023 and 11.553913794289176 measured Shannon diversity
       ndcg  precision    recall       hit
0  0.121526    0.19269  0.197204  0.790157
Processing age range 10-64 with 6 neighbours and 0.4 diversity, 
WE ARE RECCOEMEDNING TO
7620
USERS
Final Evaluation for 10 recomendations in age range 10-64 with 6 neighbou

In [12]:

# results_means=results.groupby(['Age_range','Neighbours','Diversity_adjustment','List_Len']).mean().reset_index()
# print_l=False
# list_lengths=[5,10]
# for list_len in list_lengths:
#     results_means=results_means[results_means['List_Len']==list_len]
#     for age_r in results_means.Age_range.unique():

results=pd.read_parquet("Data/Experiment_Data/Results_Cross_validationNew_5_10_2_at_fold_1.parquet")
for f in range(2,6):
    r=pd.read_parquet("Data/Experiment_Data/Results_Cross_validationNew_5_10_2_at_fold_"+str(f)+".parquet")
    results=results.append(r)        
# age_range=5
# sifix="_at_age_range_"+str(age_range)

#results=pd.read_parquet("Data/Experiment_Data/Results_Cross_validation_final2.parquet")
results_means_all=results.groupby(['Age_range','Neighbours','Diversity_adjustment','List_Len']).mean().reset_index()

print_l=False
list_lengths=[5,10]

add=None
set_add=True
for list_len in list_lengths:
    
    results_means=results_means_all[results_means_all['List_Len']==list_len]
    for age_r in results.Age_range.unique():
        print("Age Range",age_r)
        results_age_r=results_means[results_means['Age_range']==age_r].sort_values(by='ndcg',ascending=False).head(4)
        print(results_age_r)

        if set_add:
            add=results_age_r
            set_add=False
        else:
            add=add.append(results_age_r)

filename="Data/Experiment_Data/Results_Cross_validationNew_5_10_neighbours2_Summarized.csv"
add.to_csv(filename)

Age Range 10-20
   Age_range  Neighbours  Diversity_adjustment  List_Len      ndcg  precision  \
36     10-20          50                   0.0         5  0.170478   0.319195   
39     10-20          50                   0.2         5  0.169035   0.315391   
24     10-20          24                   0.0         5  0.168965   0.318187   
27     10-20          24                   0.2         5  0.166682   0.313580   

      recall       hit  Intralist_Diversity_calculated  \
36  0.320937  0.774646                        0.853309   
39  0.317142  0.775940                        0.853309   
24  0.319982  0.773149                        0.849173   
27  0.315404  0.774825                        0.849173   

    Shannon_diversity_pressent   CV_iter  
36                    9.374158  2.333333  
39                    9.370489  2.333333  
24                    9.700070  2.333333  
27                    9.702684  2.333333  
Age Range 16-26
   Age_range  Neighbours  Diversity_adjustment  List_Len

In [15]:
results_means_all.to_csv("Data/Experiment_Data/Results_Cross_validation_5_10_neighbours_All.csv")

In [38]:
#Looking for age groups with sliding window





#Test with sliding window
#Evaluate Recomenders
age_ranges=[3,5,7] 

neighbours_set=[6,12,24,50]

diversities=[0,0.2,0.4,0.6,0.8]

train, test=single_split(ratings)

results=None
set_results=True


for age_range in age_ranges:
    print("Testing in age range:",age_range)
    for neighbours in neighbours_set:
        print("Traning recommender with",neighbours,"neighbours") #, save_nbrs=100
        predictor = user_knn.UserUser(neighbours,min_nbrs=neighbours,center=False,feedback='implicit',use_ratings=False)
        Unseen_item_selector = UnratedItemCandidateSelector()
        recommender = TopN(predictor, Unseen_item_selector)    
        predictor.fit(train)
        Unseen_item_selector.fit(train)
        
        recomendations_all=generate_recommendations(test.user.unique(),recommender,n=100)
        
        for diversity in diversities:  
        #younger_older_diversities=[(low_diversity, high_diversity) , (high_diversity, low_diversity)]
            for central_age in range(10+age_range,64-age_range,3):
                
                
                
                age_range_start= central_age-age_range
                age_range_end = central_age+age_range
                age_range_str=str(age_range_start)+'-'+str(age_range_end)
                
                users_in_age_range=user_age_index_df[(user_age_index_df['user_age']>=(age_range_start)) & 
                                        (user_age_index_df['user_age']<=(age_range_end))]                
                
                info= "age range "+age_range_str+" with "+str(neighbours)+" neighbours and " + str(diversity) + " diversity, " 
                print('Processing',info)
                
                truth_in_age_range=test.merge(users_in_age_range[['user']], how='inner', on='user')
                
                recs_in_age_range = recomendations_all.merge(users_in_age_range[['user']], how='inner', on='user')
                
                recs_in_age_range= adjust_divesities(users_in_age_range.user.unique(), diversity, 
                                                      recs_in_age_range, n=100)
                
                
                
                results_i=evaluate_recomendations(recs_in_age_range, truth_in_age_range, k=50)
                results_i=results_i[["ndcg","precision","recall","hit"]].mean()
                results_i=convert_to_DF(results_i)
                
                print('Final Evaluation for 50 recomendations in', info)
                print(results_i)
                
                #results_i=crossvalidate_ages(younger_users_recomender,older_users_recomender,
                #                   Unseen_item_selector, ratings,
                #                    younger_users, older_users,
                #                   younger_diversity, older_diversity,
                #                    info=info, n=100)
                
                results_i['Neighbours']=[neighbours]*len(results_i)
                results_i['Diversity']=[diversity]*len(results_i)
                results_i['Age_range']=[age_range_str]*len(results_i)
                
                if set_results:
                    results=results_i
                    set_results=False
                else:
                    results=results.append(results_i)
                #break
            #break
        #break
    #break
    sifix="_at_age_range_"+str(age_range)
    results.to_parquet("Data/Experiment_Data/Results_Sliding_Window"+sifix+".parquet")

    
    
    
results.head()                





Testing in age range: 5
Traning recommender with 6 neighbours
Processing age range 10-20 with 6 neighbours and 0 diversity, 
Final Evaluation for 50 recomendations in age range 10-20 with 6 neighbours and 0 diversity, 
       ndcg  precision    recall       hit
0  0.203897   0.115717  0.213307  0.975336
Processing age range 13-23 with 6 neighbours and 0 diversity, 
Final Evaluation for 50 recomendations in age range 13-23 with 6 neighbours and 0 diversity, 
       ndcg  precision    recall       hit
0  0.196866   0.122136  0.198031  0.973772
Processing age range 16-26 with 6 neighbours and 0 diversity, 
Final Evaluation for 50 recomendations in age range 16-26 with 6 neighbours and 0 diversity, 
       ndcg  precision    recall       hit
0  0.187057   0.122378  0.187503  0.972694
Processing age range 19-29 with 6 neighbours and 0 diversity, 
Final Evaluation for 50 recomendations in age range 19-29 with 6 neighbours and 0 diversity, 
       ndcg  precision    recall       hit
0  0.1784

Unnamed: 0,ndcg,precision,recall,hit,Neighbours,Diversity,Age_range
0,0.203897,0.115717,0.213307,0.975336,6,0.0,10-20
0,0.196866,0.122136,0.198031,0.973772,6,0.0,13-23
0,0.187057,0.122378,0.187503,0.972694,6,0.0,16-26
0,0.178472,0.123055,0.177837,0.973449,6,0.0,19-29
0,0.171679,0.123759,0.169178,0.973733,6,0.0,22-32


In [79]:
#Generate Results

age_range=5
sifix="_at_age_range_"+str(age_range)

results=pd.read_parquet("Data/Experiment_Data/Results_Cross_validation_final2.parquet")

add=None
set_add=True

for age_r in results.Age_range.unique():
    print("Age Range",age_r)
    results_age_r=results[results['Age_range']==age_r].sort_values(by='ndcg',ascending=False).head(4)
    print(results_age_r)
    
    if set_add:
        add=results_age_r
        set_add=False
    else:
        add=add.append(results_age_r)
add.to_csv("Data/Experiment_Data/Results_CV_2_summarized.csv")

Age Range 49-59
       ndcg  precision    recall       hit  Neighbours  Diversity_adjustment  \
0  0.166460   0.093846  0.160486  0.923077          12                   0.2   
0  0.166424   0.094154  0.161676  0.923077          12                   0.0   
0  0.160021   0.094769  0.162784  0.907692           6                   0.0   
0  0.158668   0.088000  0.146684  0.907692          12                   0.4   

   Shannon_diversity_pressent Age_range  CV_iter  
0                    9.703549     49-59        2  
0                    9.703978     49-59        2  
0                   10.053383     49-59        2  
0                    9.683972     49-59        2  
Age Range 49-55
       ndcg  precision    recall       hit  Neighbours  Diversity_adjustment  \
0  0.167653   0.098846  0.159472  0.923077          12                   0.0   
0  0.165956   0.095000  0.153926  0.923077          12                   0.2   
0  0.162281   0.093462  0.150064  0.903846          12                  