In [1]:
from fileinput import filename
import pandas as pd
import numpy as np
from scipy import linalg
from scipy.sparse.linalg import svds
import random 
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm
import scipy.stats as ss
import pickle

In [2]:
def ReadData(ratings_path='data/ratings.dat', min_interactions=30):
    """Load a ``uid::mid::rating::timestamp`` ratings file and split it by user activity.

    Raw user and movie ids are replaced by their rank among the sorted unique
    ids found in the file, so both columns become dense 0-based indices.

    Parameters
    ----------
    ratings_path : str, optional
        Location of the ratings file. Defaults to ``'data/ratings.dat'``.
    min_interactions : int, optional
        Users with strictly more than this many ratings go to the first
        returned frame, everybody else to the second one. Defaults to 30.

    Returns
    -------
    df : pandas.DataFrame
        Columns ``uid``, ``mid``, ``ratings``; rows of users with more than
        ``min_interactions`` ratings. Row labels are positions in the file.
    df_less_30 : pandas.DataFrame
        Same columns; rows of users with ``min_interactions`` ratings or fewer
        (i.e. "at most", not "less than", the threshold).
    n_items : int
        Number of distinct movie ids in the whole file.
    """
    ml1m_rating = pd.read_csv(
        ratings_path,
        sep='::',
        header=None,
        names=['uid', 'mid', 'rating', 'timestamp'],
        engine='python',
    )

    # return_inverse gives, for every row, the position of its id inside the
    # sorted unique array -- exactly the dense index we want, without a
    # Python-level loop over every rating.
    unique_uid, uid_codes = np.unique(ml1m_rating['uid'].to_numpy(), return_inverse=True)
    unique_mid, mid_codes = np.unique(ml1m_rating['mid'].to_numpy(), return_inverse=True)
    print('DICTIONARY PREPARED:')

    # Explicit ratings are kept as they are in the file.
    tmp_df = pd.DataFrame({
        "uid": uid_codes,
        "mid": mid_codes,
        "ratings": ml1m_rating['rating'].to_numpy(),
    })

    # Every uid appears in value_counts, so the complement of the "active"
    # mask is exactly the set of users with <= min_interactions ratings.
    v = tmp_df.uid.value_counts()
    is_active = tmp_df.uid.isin(v.index[v.gt(min_interactions)])
    df = tmp_df[is_active]
    df_less_30 = tmp_df[~is_active]
    return df, df_less_30, len(unique_mid)

In [3]:
# Load the ratings once; ReadData splits users at the 30-interaction mark.
# Note that the second frame holds users with *at most* 30 interactions,
# even though the label printed below says "LESS THAN 30".
df_gt_30, df_le_30, unique_mids = ReadData()
print("GREATER THAN 30:\n", df_gt_30)
print("LESS THAN 30: \n", df_le_30)
# Row counts of both splits, one per line.
print(len(df_gt_30), len(df_le_30), sep="\n")
print("UNIQUE MIDS: ", unique_mids)

DICTIONARY PREPARED:
GREATER THAN 30:
           uid   mid  ratings
0           0  1104        5
1           0   639        3
2           0   853        3
3           0  3177        4
4           0  2162        5
...       ...   ...      ...
1000204  6039  1019        1
1000205  6039  1022        5
1000206  6039   548        5
1000207  6039  1024        4
1000208  6039  1025        4

[980300 rows x 3 columns]
LESS THAN 30: 
          uid   mid  ratings
233        3  3235        5
234        3  1120        3
235        3  2743        4
236        3  1124        4
237        3   971        4
...      ...   ...      ...
999740  6037  1288        2
999741  6037  2495        1
999742  6037  2511        3
999743  6037  3165        3
999744  6037  1007        5

[19909 rows x 3 columns]
980300
19909
UNIQUE MIDS:  3706


In [4]:
# Leave-one-out split: the last row of every user (in frame order) is held
# out for testing, everything else is used for training.
test_df = df_gt_30.groupby("uid").tail(1)
train_df = df_gt_30.drop(test_df.index)
assert len(df_gt_30) == len(test_df) + len(train_df)

# Users present in the training split get compact ids 0..n-1 (forward map)
# and the reverse map takes a compact id back to the id used in train_df.
dic_train_df_uid_mapping = {uid: pos for pos, uid in enumerate(np.unique(train_df['uid']))}
dic_train_df_uid_rmapping = dict(enumerate(np.unique(train_df['uid'])))
### no need for mid mapping

uid_of_train_df = [dic_train_df_uid_mapping[uid] for uid in train_df['uid'].tolist()]

# 'mid' and 'ratings' are Series, so the new frame keeps train_df's row
# labels; the compact uid list is attached positionally.
core_user_ko_input_train_df = pd.DataFrame({
    'uid': uid_of_train_df,
    'mid': train_df['mid'],
    'ratings': train_df['ratings'],
})

In [5]:
# One (initially empty) history list per user id 0..6039, then append every
# training item to the history of the user who rated it, in frame order.
train_ui_dic = {user_id: [] for user_id in range(6040)}
for rated_by, rated_item in zip(train_df['uid'].to_numpy(), train_df['mid'].to_numpy()):
    train_ui_dic[rated_by].append(rated_item)

- Utility functions for CUR-based core-user selection.

In [6]:
# NOTE(review): not referenced by any code in the visible cells, and the value
# does not match the 3706 distinct movie ids reported for this run --
# presumably a leftover from another dataset; confirm before relying on it.
MAX_MID = 27277 + 1
def select_cols(mat, k, dup=False):
    """Draw ``k`` columns of ``mat`` with probability proportional to their squared norm.

    Sampling is done with replacement through the global ``np.random`` state,
    so the same column can be returned more than once. The sampled columns are
    returned unscaled.

    Parameters
    ----------
    mat : numpy.ndarray
        2-D array to sample from.
    k : int
        Number of draws.
    dup : bool, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    C : numpy.ndarray
        ``(mat.shape[0], k)`` float array; column ``i`` is the ``i``-th draw.
    c_ind : list
        Column indices of the draws, in draw order.

    Raises
    ------
    ValueError
        If ``k > 0`` and the squared norms do not sum to a positive number
        (e.g. an all-zero matrix), because no distribution can be formed.
    """
    # Squared column norms are the diagonal of mat.T @ mat; computing them
    # directly avoids building the full (n_cols x n_cols) Gram matrix.
    prob = np.array((mat * mat).sum(axis=0))
    denom = np.abs(prob).sum(axis=0)
    if k > 0 and not denom > 0:
        raise ValueError("select_cols: mat has no non-zero column to sample from")
    prob = prob / denom

    C = np.zeros((mat.shape[0], k))
    if k == 0:
        return C, []

    # A single call with size=k consumes the random stream exactly like k
    # successive size-1 calls, but validates/cumsums `prob` only once.
    c_ind = list(np.random.choice(prob.size, size=k, p=prob))
    C[:, :] = mat[:, c_ind]
    return C, c_ind

def select_rows(mat, k, dup=False):
    """Draw ``k`` rows of ``mat`` with probability proportional to their squared norm.

    Sampling is done with replacement through the global ``np.random`` state,
    so the same row can be returned more than once. The sampled rows are
    returned unscaled. The probability vector is printed as a side effect.

    Parameters
    ----------
    mat : numpy.ndarray
        2-D array to sample from.
    k : int
        Number of draws.
    dup : bool, optional
        Unused; kept so existing callers that pass it keep working.

    Returns
    -------
    r : numpy.ndarray
        ``(k, mat.shape[1])`` float array; row ``i`` is the ``i``-th draw.
    r_ind : numpy.ndarray
        Row indices of the draws, in draw order.

    Raises
    ------
    ValueError
        If ``k > 0`` and the squared norms do not sum to a positive number
        (e.g. an all-zero matrix), because no distribution can be formed.
    """
    # Squared row norms are the diagonal of mat @ mat.T; computing them
    # directly avoids building the full (n_rows x n_rows) Gram matrix.
    prob = np.array((mat * mat).sum(axis=1))
    denom = np.abs(prob).sum(axis=0)
    if k > 0 and not denom > 0:
        raise ValueError("select_rows: mat has no non-zero row to sample from")
    prob = prob / denom
    print(prob)

    r = np.zeros((k, mat.shape[1]))
    if k == 0:
        return r, np.array([])

    # A single call with size=k consumes the random stream exactly like k
    # successive size-1 calls, but validates/cumsums `prob` only once.
    r_ind = np.random.choice(prob.size, size=k, p=prob)
    r[:, :] = mat[r_ind, :]
    return r, r_ind

def matIntersection(mat, c_ind, r_ind):
    """Collect the entries of ``mat`` where the given rows and columns cross.

    Returns a float array ``W`` of shape ``(len(r_ind), len(c_ind))`` with
    ``W[i, j] == mat[r_ind[i], c_ind[j]]``. Repeated indices are allowed and
    simply produce repeated rows/columns.
    """
    n_rows, n_cols = len(r_ind), len(c_ind)
    W = np.zeros((n_rows, n_cols))
    # Each `target` is a view onto one row of W, so writing through it fills W.
    for target, source_row in zip(W, r_ind):
        target[:] = mat[source_row, c_ind]
    return W

def pseudoInverse(W):
    """Build the CUR middle matrix ``Y . (Z+)^2 . X^T`` from the SVD of ``W``.

    With ``W = X . diag(Z) . Y^T``, the reciprocals of the non-zero singular
    values are squared before the factors are multiplied back together.

    NOTE(review): despite the name this is not the Moore-Penrose
    pseudo-inverse (that would use ``Z+`` once, not squared). The squaring
    matches the existing behaviour of this function; confirm it is the
    intended CUR variant before reusing it as a generic pseudo-inverse.

    Parameters
    ----------
    W : array_like
        2-D matrix, square or rectangular.

    Returns
    -------
    numpy.ndarray
        Dense array of shape ``(W.shape[1], W.shape[0])``.
    """
    W = np.asarray(W)

    # Reduced SVD keeps the factor shapes compatible for non-square W.
    X, Z, YT = np.linalg.svd(W, full_matrices=False)

    # Z+ : invert only the singular values that are numerically non-zero;
    # the others stay 0 instead of turning into inf.
    ZP = np.zeros(Z.shape)
    if Z.size:
        cutoff = 1e-15 * Z.max()
        keep = Z > cutoff
        ZP[keep] = 1.0 / Z[keep]
    ZP = ZP * ZP

    # Y . diag(ZP) . X^T, with the diagonal applied as a column scaling of Y.
    return (YT.T * ZP) @ X.T

In [7]:
def CUR_ExtractCoreUsers(dataframe, unique_user_len, unique_item_len, core_ratio=0.20):
    """Pick core users by norm-weighted row sampling of the user-item matrix.

    A dense ``unique_user_len x unique_item_len`` rating matrix is built from
    ``dataframe`` and ``int(unique_user_len * core_ratio)`` rows are drawn via
    ``select_rows``. Because that sampling is with replacement, the number of
    distinct users selected is usually smaller than the number of draws.
    Several intermediate arrays are printed along the way.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Needs columns ``uid``, ``mid`` and ``ratings``; ``uid`` and ``mid``
        must be valid 0-based indices into the matrix dimensions below.
    unique_user_len : int
        Number of rows (users) of the rating matrix.
    unique_item_len : int
        Number of columns (items) of the rating matrix.
    core_ratio : float, optional
        Fraction of ``unique_user_len`` used as the number of draws.
        Defaults to 0.20.

    Returns
    -------
    pandas.DataFrame
        The rows of ``dataframe`` (original row labels kept) whose ``uid`` was
        drawn at least once.
    """
    u_len = unique_user_len
    print("USER LEN:", u_len)

    m_len = unique_item_len
    print("MOVIE LEN:", m_len)

    # Fill the matrix in one vectorised assignment; for a repeated
    # (uid, mid) pair the last rating in frame order wins.
    userItemMatrix = np.zeros(shape=(u_len, m_len))
    userItemMatrix[dataframe['uid'].to_numpy(), dataframe['mid'].to_numpy()] = \
        dataframe['ratings'].to_numpy()
    print("USER ITEM MATRIX: \n", userItemMatrix)

    mat = userItemMatrix
    print("MAT:", mat)

    # Only rows (users) need to be sampled to decide who is a core user.
    r, r_ind = select_rows(mat, int(u_len * core_ratio))
    print("r", r)
    print("r_ind", r_ind)

    cur_coreusers = dataframe[dataframe.uid.isin(r_ind)]
    return cur_coreusers

In [8]:
# Sample core users on the compact-id training frame; the matrix needs one
# row per distinct training user and one column per distinct movie id.
core_users = CUR_ExtractCoreUsers(
    core_user_ko_input_train_df,
    len(np.unique(uid_of_train_df)),
    unique_mids,
)
# Distinct (compact) ids of the users that were selected.
support_user_list = np.unique(core_users['uid'])
print("CORE USERS:", core_users)

USER LEN: 5231
MOVIE LEN: 3706
USER ITEM MATRIX: 
 [[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [3. 0. 0. ... 0. 0. 0.]]
MAT: [[5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [3. 0. 0. ... 0. 0. 0.]]
[6.85044107e-05 1.37447016e-04 5.90832284e-05 ... 2.13400733e-04
 1.38688567e-04 3.52162333e-04]
r [[0. 0. 0. ... 0. 0. 4.]
 [3. 5. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 5.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [5. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
r_ind [2472  681 3036 ... 4219 4213 2983]
CORE USERS:          uid   mid  ratings
4263      27  1087        2
4264      27  1157        2
4265      27  3189        2
4266      27  2354        4
4267      27  2357        3
...      ...   ...      ...
996786  5208  1022        4
996787  5208   548        4
996788  5208  1024        4
996789  5208  1025        3
996790  5208  1

In [9]:
# core_users carries train_df's row labels, so those labels select the very
# same ratings from train_df; every remaining label is a non-core rating.
core_users_index_list = core_users.index.tolist()
non_core_user_index = train_df.index.difference(core_users.index).tolist()

core_users_df = train_df.loc[core_users_index_list]
non_core_user_df = train_df.loc[non_core_user_index]
print("NON CORE USERS:", non_core_user_df)
# Note: this prints the compact-id frame (core_users), whose uid values are
# in a different id space than the uid column of non_core_user_df above.
print("CORE USERS:", core_users)

NON CORE USERS:           uid   mid  ratings
0           0  1104        5
1           0   639        3
2           0   853        3
3           0  3177        4
4           0  2162        5
...       ...   ...      ...
1000203  6039  1018        3
1000204  6039  1019        1
1000205  6039  1022        5
1000206  6039   548        5
1000207  6039  1024        4

[683611 rows x 3 columns]
CORE USERS:          uid   mid  ratings
4263      27  1087        2
4264      27  1157        2
4265      27  3189        2
4266      27  2354        4
4267      27  2357        3
...      ...   ...      ...
996786  5208  1022        4
996787  5208   548        4
996788  5208  1024        4
996789  5208  1025        3
996790  5208  1862        2

[291458 rows x 3 columns]


In [10]:
print("TEST DF CONTAINS TEST FOR CORE AND NON CORE ENTITIES:\n", test_df)

# Sorted distinct user ids found in each training partition.
unique_uids_in_support_trian = np.unique(core_users_df['uid'].to_numpy())
unique_uids_in_query_trian = np.unique(non_core_user_df['uid'])
print(len(unique_uids_in_support_trian))

# Each held-out row goes to the partition its user belongs to in training.
support_test_df = test_df[test_df['uid'].isin(unique_uids_in_support_trian)]
print("SUPPORT TEST DF:", support_test_df)
query_test_df = test_df[test_df['uid'].isin(unique_uids_in_query_trian)]
print("QUERY TEST DF:\n", query_test_df)

TEST DF CONTAINS TEST FOR CORE AND NON CORE ENTITIES:
           uid   mid  ratings
52          0  1154        4
181         1  1155        5
232         2  1900        4
451         4   683        4
522         5    33        4
...       ...   ...      ...
998634   6034  1865        4
999522   6035  1867        3
999724   6036  1025        5
999867   6038  1025        4
1000208  6039  1025        4

[5231 rows x 3 columns]
894
SUPPORT TEST DF:          uid   mid  ratings
4653      32  1155        3
8908      58  1155        5
9512      61  1155        4
11015     80  1155        5
11133     81  1020        5
...      ...   ...      ...
991314  5986   548        4
994110  6001  1025        4
995572  6010  1865        5
995882  6014  1007        5
996791  6015  3540        4

[894 rows x 3 columns]
QUERY TEST DF:
           uid   mid  ratings
52          0  1154        4
181         1  1155        5
232         2  1900        4
451         4   683        4
522         5    33        4
.

In [11]:
def _rows_as_triples(frame):
    """Return ``[[uid, mid, ratings], ...]`` for every row of ``frame``, in row order."""
    # Iterating the 2-D array yields one row at a time; list(row) keeps the
    # NumPy scalar elements, matching what row-wise iterrows() access yields.
    return [list(triple) for triple in frame[['uid', 'mid', 'ratings']].to_numpy()]


# Training / test interactions of the core (support) and non-core (query) users.
support_train = _rows_as_triples(core_users_df)
query_train = _rows_as_triples(non_core_user_df)
support_test = _rows_as_triples(support_test_df)
query_test = _rows_as_triples(query_test_df)

# Shallow copy: same keys, and the very same history lists as train_ui_dic.
user_his_dic = dict(train_ui_dic)

# Sorted distinct ids of the support users, as a plain Python list.
user_supp_list = np.unique(core_users_df['uid']).tolist()

In [13]:
import pickle
# The six objects are written back-to-back into one file; a reader has to
# call pickle.load() six times, in this same order, to get them back.
with open("cur_20_support_as_core.pkl", "wb") as f:
    for payload in (
        support_train,
        query_train,
        support_test,
        query_test,
        user_supp_list,
        user_his_dic,
    ):
        pickle.dump(payload, f)

- Run IDCF with the 20% CUR core users as the support set.

In [1]:
# Run the pretrain, train and test scripts one after another via the shell.
# NOTE(review): presumably these scripts read the cur_20_support_as_core.pkl
# file written by an earlier cell -- confirm inside the scripts, which are not
# part of this notebook.
!python pretrain-1m.py
!python train-1m.py
!python test-1m.py

-------Dataset Info--------
split way [threshold] with threshold 30 training_ratio 1.0
train set size: support/query 291458/683611
test set size: support/query 894/4337
Epoch 0 Step 271: Train 2.6694 Reg: 0.5657
Test: 0.8934 MAE: 0.7671 RMSE: 0.9452
Val: 0.8126 MAE: 0.7110 RMSE: 0.9015
Epoch 1 Step 542: Train 0.8031 Reg: 0.4971
Test: 0.8746 MAE: 0.7540 RMSE: 0.9352
Val: 0.8017 MAE: 0.7072 RMSE: 0.8954
Epoch 2 Step 813: Train 0.7963 Reg: 0.4269
Test: 0.8596 MAE: 0.7478 RMSE: 0.9272
Val: 0.8049 MAE: 0.7073 RMSE: 0.8972
Epoch 3 Step 1084: Train 0.7943 Reg: 0.3723
Test: 0.8621 MAE: 0.7481 RMSE: 0.9285
Val: 0.8012 MAE: 0.7062 RMSE: 0.8951
Epoch 4 Step 1355: Train 0.7916 Reg: 0.3328
Test: 0.8694 MAE: 0.7483 RMSE: 0.9324
Val: 0.7938 MAE: 0.7020 RMSE: 0.8910
Epoch 5 Step 1626: Train 0.7896 Reg: 0.3046
Test: 0.8575 MAE: 0.7467 RMSE: 0.9260
Val: 0.7939 MAE: 0.7039 RMSE: 0.8910
Epoch 6 Step 1897: Train 0.7869 Reg: 0.2843
Test: 0.8519 MAE: 0.7474 RMSE: 0.9230
Val: 0.7944 MAE: 0.7043 RMSE: 0.8913
E