# import pandas as pd
import json
import matplotlib.pyplot as plt
import numpy as np

seed = 0

In [83]:
df = pd.read_csv('./ratings.dat', names=['user_id', 'movie_id', 'rating', 'timestamp'],
                        sep="::", engine='python')

# movie item lists in score data not equal to movies_extrainfos

Cold-warm transition with head \& short-tail items
1. Sort interactions by time
2. Plot transition ratio
3. Warm-cold user similarity
4. Popularity bias metrics: which to show? (item or user side?)

In [84]:
freq_thres = 20
movie_cnt = df.movie_id.value_counts()
movie_freq = movie_cnt[movie_cnt>=freq_thres]

In [85]:
print("Total items: {}\n Freq items: {}\n Freq ratio: {:.2f}".format(len(movie_cnt), len(movie_freq), len(movie_freq)/len(movie_cnt)))

Total items: 3704
 Freq items: 3043
 Freq ratio: 0.82


In [86]:
movie_freq_set = set(movie_freq.index)
score_data = df.loc[df.movie_id.apply(lambda movie: movie in movie_freq_set)]

# sanity check
assert len(set(score_data.movie_id.unique())) == len(movie_freq)
assert score_data.movie_id[0] in movie_freq_set

print("Total interactions: {}".format(len(score_data)))
print("Total users: {}".format(len(score_data.user_id.unique())))
print("Total items: {}".format(len(score_data.movie_id.unique())))

Total interactions: 995492
Total users: 6040
Total items: 3043


In [488]:
# Sort interactions by time
score_data.sort_values(by=['timestamp'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [489]:
# popular/tail/niche items (top/bottom 20% popularity)
i_sorted = score_data.movie_id.value_counts().index
total_items = i_sorted.values
pop_items = i_sorted[:int(len(i_sorted)*0.2)].values
tail_items = i_sorted[int(len(i_sorted)*0.2):].values
niche_items = i_sorted[int(len(i_sorted)*0.8):].values

# cold_users
n_users = len(score_data.user_id.unique())
heavy_users = score_data.user_id.value_counts().head(int(n_users*0.2)).index.values
warm_users = score_data.user_id.value_counts().head(int(n_users*0.8)).index.values
cold_users = score_data.user_id.value_counts().tail(int(n_users*0.2)).index.values

# Do users change after first k interactions?

In [490]:
score_data['pop'] = score_data['movie_id'].apply(lambda x: 1 if x in pop_items else 0)
score_data['tail'] = score_data['movie_id'].apply(lambda x: 1 if x in tail_items else 0)
score_data['niche'] = score_data['movie_id'].apply(lambda x: 1 if x in niche_items else 0)

user_pop_cnt = score_data.groupby(['user_id'])['pop']
user_tail_cnt = score_data.groupby(['user_id'])['tail']
user_niche_cnt = score_data.groupby(['user_id'])['niche']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_data['pop'] = score_data['movie_id'].apply(lambda x: 1 if x in pop_items else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_data['tail'] = score_data['movie_id'].apply(lambda x: 1 if x in tail_items else 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  score_data['niche'] = score_d

In [491]:
_ = score_data.groupby(['user_id'])

# Head ratio of first 10 interactions
init10 = _.head(10).groupby(['user_id'])
init_head_ratio = init10['pop'].sum()/init10['pop'].count()

# Head ratio of total interactions
total_head_ratio = _['pop'].sum()/_['pop'].count()

# Head ratio of first 10 & total interactions by 100+ interaction users
heavy100 = _.filter(lambda x: len(x) >=100).groupby(['user_id'])

heavy100_init10 = heavy100.head(10).groupby(['user_id'])
init_head_ratio_heavy100 = heavy100_init10['pop'].sum()/heavy100_init10['pop'].count()

total_head_ratio_heavy100 = heavy100['pop'].sum()/heavy100['pop'].count()

print("All user initial 10 interactions: {:.4f}".format(init_head_ratio.mean()))
print("All user total interactions: {:.4f}".format(total_head_ratio.mean()))
print("Heavy user initial 10 interactions: {:.4f}".format(init_head_ratio_heavy100.mean()))
print("Heavy user total interactions: {:.4f}".format(total_head_ratio_heavy100.mean()))

All user initial 10 interactions: 0.6795
All user total interactions: 0.6597
Heavy user initial 10 interactions: 0.6726
Heavy user total interactions: 0.6200


In [492]:
#### about half of users have more than 95(≈100) interactions
u_hist_len = score_data.groupby(['user_id']).count().sort_values(['pop']).reset_index()['movie_id']
for decile in range(0, 10):
    print("{}({}%)".format(u_hist_len[int(len(u_hist_len)*decile/10)], decile*10), end=' ')

16(0%) 27(10%) 37(20%) 51(30%) 70(40%) 95(50%) 126(60%) 173(70%) 253(80%) 398(90%) 

In [493]:
import pickle
movie_dict = pickle.load(open('./m_movie_dict.pkl', 'rb'))

Divide by users' init clicks and get self-similarity (feature)

In [494]:
all_user_num = len(score_data.user_id.unique())
all_user_init_tail_num = len(score_data.groupby(['user_id']).head(10).loc[score_data.groupby(['user_id']).head(10)['pop']==False].user_id.unique())
print("Total # of users: {}".format(all_user_num))
print("No initial tail interaction by all users: {}".format(all_user_init_tail_num))
print("Ratio of users with no initial tail interaction: {:.3f}".format((all_user_num-all_user_init_tail_num)/all_user_num))

Total # of users: 6040
No initial tail interaction by all users: 5785
Ratio of users with no initial tail interaction: 0.042


In [495]:
all_user_num = len(score_data.user_id.unique())
all_user_init_tail_num = len(score_data.groupby(['user_id']).head(10).loc[score_data.groupby(['user_id']).head(10)['pop']==True].user_id.unique())
print("Total # of users: {}".format(all_user_num))
print("No initial head interaction by all users: {}".format(all_user_init_tail_num))
print("Ratio of users with no initial head interaction: {:.3f}".format((all_user_num-all_user_init_tail_num)/all_user_num))

Total # of users: 6040
No initial head interaction by all users: 6040
Ratio of users with no initial head interaction: 0.000


In [496]:
freq_user_num = len(freq_users.user_id.unique())
freq_user_init_tail_num = len(freq_users.groupby(['user_id']).head(10).loc[freq_users.groupby(['user_id']).head(10)['pop']==False].user_id.unique())
print("frequent users ({}+ interaction): {}".format(freq_condition, freq_user_num))
print("No initial tail interaction by frequent users: {}".format(freq_user_init_tail_num))
print("Ratio of users with no initial tail interaction: {:.3f}".format((freq_user_num-freq_user_init_tail_num)/freq_user_num))

frequent users (100+ interaction): 2935
No initial tail interaction by frequent users: 2816
Ratio of users with no initial tail interaction: 0.041


for interaction 100+ users,
1. similarity between first 10 clicks (1/n) and the rest clicks
2. similarity between first 10 clicks (1/n, weighted) and the rest clicks

In [497]:
freq_condition = 100
freq_users = score_data.groupby(['user_id']).filter(lambda x: len(x) >=freq_condition)
fu_init = freq_users.groupby(['user_id']).head(10)
fu_later = freq_users.drop(fu_init.index)

# sanity check
assert set(fu_init.index.values).intersection(set(fu_later.index.values)) == set()

pop item이 없는 경우에 -> 그냥 pop item interaction similarity로 대체

In [498]:
from sklearn.metrics.pairwise import cosine_similarity

def get_feature(x, movie_dict):
    return movie_dict[x[0]]

def get_pair_group_similarity(l, r, movie_dict):
    """Get group similarity between array l and array r.
    Return average similarity of array l elements.
    Caution: first column is removed for Movielens data ([:,1:])
    """
    # substitute movie id with dictionary value
    l_feature = np.apply_along_axis(get_feature, 0, l, movie_dict).squeeze(0).transpose()[:,1:] # remove ratings
    r_feature = np.apply_along_axis(get_feature, 0, r, movie_dict).squeeze(0).transpose()[:,1:] # remove ratings

    # similarity between initial/later clicks
    lr_sim = cosine_similarity(l_feature, r_feature).mean(1)
    
    return lr_sim

In [504]:
def get_headtail_similarity(user_hist, init_idx=10):
    """Get unweighted cosine similarity between head and tail items
    """
    l = user_hist.iloc[:init_idx]
    l_h = l.loc[l['pop'] == True].movie_id.to_numpy().reshape(1,-1)
    l_t = l.loc[l['pop'] == False].movie_id.to_numpy().reshape(1,-1)
    #l = l.movie_id.to_numpy().reshape(1,-1)

    r = user_hist.iloc[init_idx:]
    #r_h = r.loc[r['pop'] == True].movie_id.to_numpy().reshape(1,-1)
    r_t = r.loc[r['pop'] == False].movie_id.to_numpy().reshape(1,-1)
    r = r.movie_id.to_numpy().reshape(1,-1)

    lh_rt_sim = get_pair_group_similarity(l_h, r_t, movie_dict).mean()
    lh_r_sim = get_pair_group_similarity(l_h, r, movie_dict).mean()

    if l_t.size > 0:
        lt_rt_sim = get_pair_group_similarity(l_t, r_t, movie_dict).mean()
        lt_r_sim = get_pair_group_similarity(l_t, r, movie_dict).mean()
    else:
        lt_rt_sim, lt_r_sim = 0, 0
        
    return lh_rt_sim, lh_r_sim, l_h.size, lt_rt_sim, lt_r_sim, l_t.size

In [505]:
def get_avg_sim(init_idx):
    lh_rt_sims, lh_r_sims, l_h_sizes, lt_rt_sims, lt_r_sims, l_t_sizes = [], [], [], [], [], []

    for freq_user_id in freq_users.user_id.unique():
        freq_user_hist = freq_users[freq_users.user_id==freq_user_id]
        lh_rt_sim, lh_r_sim, l_h_size, lt_rt_sim, lt_r_sim, l_t_size = get_headtail_similarity(freq_user_hist, init_idx=init_idx)
        if l_z_size == 0:
            continue

        lh_rt_sims.append(lh_rt_sim)
        lh_r_sims.append(lh_r_sim)
        l_h_sizes.append(l_h_size)
        lt_rt_sims.append(lt_rt_sim)
        lt_r_sims.append(lt_r_sim)
        l_t_sizes.append(l_z_size)

    lh_rt_sims = np.array(lh_rt_sims)
    lh_r_sims = np.array(lh_r_sims)
    l_h_sizes = np.array(l_h_sizes)
    lt_rt_sims = np.array(lt_rt_sims)
    lt_r_sims = np.array(lt_r_sims)
    l_t_sizes = np.array(l_t_sizes)

    print("Cosine similarity via {} initial interactions".format(init_idx)) 
    print('init total & future total: {:.4f}'.format(((lh_r_sims+lt_r_sims)/2).mean()))
    print('init total & future tails: {:.4f}'.format(((lh_rt_sims+lt_rt_sims)/2).mean()))
    print('init heads & future total: {:.4f}'.format(lh_r_sims.mean()))
    print('init heads & future tails: {:.4f}'.format(lh_rt_sims.mean()))
    print('init tails & future total: {:.4f}'.format(lt_r_sims.mean()))
    print('init tails & future tails: {:.4f}'.format(lt_rt_sims.mean()))

In [506]:
get_avg_sim(30)

Cosine similarity via 30 initial interactions
init total & future total: 0.0928
init total & future tails: 0.0955
init heads & future total: 0.0924
init heads & future tails: 0.0929
init tails & future total: 0.0931
init tails & future tails: 0.0982


In [507]:
get_avg_sim(20)

Cosine similarity via 20 initial interactions
init total & future total: 0.0917
init total & future tails: 0.0940
init heads & future total: 0.0915
init heads & future tails: 0.0914
init tails & future total: 0.0920
init tails & future tails: 0.0967


In [508]:
get_avg_sim(10)

Cosine similarity via 10 initial interactions
init total & future total: 0.0889
init total & future tails: 0.0901
init heads & future total: 0.0902
init heads & future tails: 0.0889
init tails & future total: 0.0876
init tails & future tails: 0.0913
