### Item-Item Collaborative Filtering

In [2]:
import pandas as pd
import numpy as np
import random
from sklearn.model_selection import KFold
from sklearn.model_selection import TimeSeriesSplit
from recsys_utils import recsys_load_training_df, recsys_evaluate, recsys_cv_split_single, recsys_cv_split_userid, recsys_cv_split_single_tweetid
from sklearn.model_selection import train_test_split
import scipy.stats
import scipy.spatial
from sklearn.metrics import precision_recall_curve, auc, log_loss
from math import sqrt
import math
import warnings
import sys
from scipy.sparse.linalg import norm
from scipy import sparse as sp


We tried this method with two samples: the first one has 10000 unique tweets and 223470 unique users('engaging_user_id' column) in 226241 ratings; the second one has 78795 unique tweets and 10000 unique users in 80425 ratings. 
For item-item collaborative filtering we need to have both tweets and users repeated multiple times in our ratings, because for computing similarities between items, we need the target item to be rated by multiple users (i.e we need that tweet id to appear many times in our ratings); and for prediction we need those items in the neighborhood to be rated by the target user (i.e we need the target user to appear many times in our ratings). The first sample covered the condition of a tweet appearing multiple times, the second sample covered the condition of a user appearing multiple times. We did not have any samples that covered both conditions, that is why we cannot expect this method to perform reliably for our case. 

In [3]:
df = recsys_load_training_df('../Data/training_sample_by_tweet.tsv')
#df = recsys_load_training_df('../Data/training_sample.tsv')

In [4]:
unique_user_ids = df['engaging_user_id'].append(df['engaged_with_user_id']).unique()
unique_user_ids.sort()

unique_tweet_ids = df['tweet_id'].unique()
unique_tweet_ids.sort()

m = len(unique_user_ids)
n = len(unique_tweet_ids)
print(m)
print(n)
print(len(df))
print(len(df['engaging_user_id'].unique()))

userId_to_userIDX = dict(zip(unique_user_ids, range(m)))
userIDX_to_userId = dict(zip(range(m), unique_user_ids))

tweetId_to_tweetIDX = dict(zip(unique_tweet_ids, range(n)))
tweetIDX_to_tweetId = dict(zip(range(n), unique_tweet_ids))

233000
10000
226241
223470


In [5]:
df.head()

Unnamed: 0,text_tokens,hashtags,tweet_id,present_media,present_links,present_domains,tweet_type,language,tweet_timestamp,engaged_with_user_id,...,engaging_user_id,engaging_user_follower_count,engaging_user_following_count,engaging_user_is_verified,engaging_user_account_creation,engaged_follows_engaging,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,"[101, 46242, 40751, 161, 100062, 10107, 10114,...",,4EC1C41302B820B2459337FE31859B16,,[BB79CD318A68247B64F0E0BE7AFD5A92],[3896E26D12C903F0A00B6B1BE9A9BEA3],TopLevel,D3164C7FBCF2565DDF915B1B3AEFB1DC,2020-02-11 13:14:01,959F94011E65010724E85D1B5CBAE18F,...,00008C78BF1DDCC9FDCBAFFAF318A175,34,36,False,2018-10-07 00:40:24,False,NaT,NaT,NaT,NaT
1,"[101, 56898, 137, 10741, 24493, 13538, 131, 12...",,CDA46E9A8E12E9FBAD060A8DE875D02C,,,,Retweet,D3164C7FBCF2565DDF915B1B3AEFB1DC,2020-02-10 00:03:46,716ACF42D9D35BE7374C590C525B0440,...,000116FBD3C83C9C457041C99E371B0B,52,150,False,2018-12-21 01:26:23,False,NaT,NaT,NaT,NaT
2,"[101, 56898, 137, 21540, 35826, 10263, 10575, ...",,4589FDB470A435F92AD5023FACB451A2,,,,Retweet,06D61DCBBE938971E1EA0C38BD9B5446,2020-02-12 02:35:53,83C175E35C3244F0F0250571A128EDBA,...,000231681C7F3716348F35C577591DF3,1354,203,False,2012-03-20 22:11:00,True,NaT,NaT,NaT,NaT
3,"[101, 11723, 112, 187, 169, 19826, 10114, 1011...",,427F0CA21434D0E1258165D42F86F3AB,,,,Quote,D3164C7FBCF2565DDF915B1B3AEFB1DC,2020-02-11 04:06:36,C94E03277B7D08A8AE0E7DBCAACEC879,...,00080417F22C1223FFBE7B6622C55CB6,117,2763,False,2014-08-08 06:10:35,False,NaT,NaT,NaT,2020-02-11 06:41:36
4,"[101, 56898, 137, 58442, 15490, 10716, 45389, ...",,853BB89C90CB3AE1462F3F8203EBB644,[Photo],,,Retweet,022EC308651FACB02794A8147AEE1B78,2020-02-07 14:19:27,8EC11BE3F8DEACEF096ED6AB760A409E,...,0008C05833E31FA1C1102AA71E9E0452,130,260,False,2019-01-22 19:54:08,True,NaT,2020-02-07 14:53:05,NaT,NaT


In [6]:
r_df = pd.concat([df['engaging_user_id'].map(userId_to_userIDX),
                 df['tweet_id'].map(tweetId_to_tweetIDX),
                 df['reply_timestamp'],
                 df['retweet_timestamp'],
                 df['retweet_with_comment_timestamp'],
                 df['like_timestamp']], axis = 1)

cols_to_values = ['reply_timestamp', 'retweet_timestamp', 'retweet_with_comment_timestamp', 'like_timestamp']
for col in cols_to_values:
    r_df.loc[~r_df[col].isnull(), col] = 1
    r_df.loc[r_df[col].isnull(), col] = -1


In [7]:
r_df.head()

Unnamed: 0,engaging_user_id,tweet_id,reply_timestamp,retweet_timestamp,retweet_with_comment_timestamp,like_timestamp
0,3,3091,-1,-1,-1,-1
1,5,8110,-1,-1,-1,-1
2,10,2769,-1,-1,-1,-1
3,30,2646,-1,-1,-1,1
4,37,5212,-1,1,-1,-1


In [8]:
res = recsys_cv_split_single_tweetid(r_df)
ratings_train = res[0][0]
ratings_test = res[0][1]

In [9]:
print(ratings_train.shape)
print(ratings_test.shape)

(210898, 6)
(11100, 6)


### Create the Ratings Matrix

In [10]:
R_reply = sp.csr_matrix((ratings_train.reply_timestamp.astype('int8'), (ratings_train.engaging_user_id, 
                                                                     ratings_train.tweet_id)))

R_retweet = sp.csr_matrix((ratings_train.retweet_timestamp.astype('int8'), (ratings_train.engaging_user_id, 
                                                                         ratings_train.tweet_id)))

R_retweet_wc = sp.csr_matrix((ratings_train.retweet_with_comment_timestamp.astype('int8'), (ratings_train.engaging_user_id, 
                                                                               ratings_train.tweet_id))) 

R_like = sp.csr_matrix((ratings_train.like_timestamp.astype('int8'), (ratings_train.engaging_user_id, 
                                                                   ratings_train.tweet_id))) 

### Item to Item Similarities

In [11]:
def compute_item_similarities(i_id, R_matrix):
    item_sums = R_matrix.sum(axis=0).A1
    item_cnts = (R_matrix != 0).sum(axis=0).A1
    item_avgs = item_sums / item_cnts
    iI = np.empty((n,))
    np.seterr(divide='ignore', invalid='ignore')
    R_copy = R_matrix.copy()
    nz = R_copy.T.nonzero()
    R_copy.data = R_copy.data - item_avgs[nz[0]]
    u = R_copy[:, i_id].copy()
    numerator = R_copy.T.dot(u).A
    denominator = norm(u) * np.sqrt(R_copy.T.multiply(R_copy.T).sum(1)).A1
    iI = np.ravel(numerator)/denominator
    where_are_NaNs = np.isnan(iI)
    iI[where_are_NaNs] = 0.0
    return iI

### Create User Neighborhood

In [12]:
R_like_dok = R_like.todok()
R_retweet_dok = R_retweet.todok()
R_retweet_wc_dok = R_retweet_wc.todok()
R_reply_dok = R_reply.todok()

In [13]:
k = 5
def create_item_neighborhood(u_id, i_id, R_matrix_dok, R_matrix):
    nh = {}
    iI = compute_item_similarities(i_id, R_matrix)
    iI_copy = iI.copy()

    iI_sort = np.argsort(iI_copy)[::-1]
    i = 0
    while len(nh) < k and i < len(iI_sort):
        if iI_sort[i] != i_id and (iI_sort[i], u_id) in R_matrix_dok:
            nh[iI_sort[i]] = iI[iI_sort[i]].item(0)
        i += 1

    return nh

In [14]:
with_deviations = True

def predict_rating_internal(u_id, i_id, R_matrix_dok, R_matrix):
    nh = create_item_neighborhood(u_id, i_id, R_matrix_dok, R_matrix)
    neighborhood_weighted_avg = 0.
    numerator = 0.
    denominator = 0.
    for v in nh:
        numerator += nh[v] * R_matrix[u_id,v]
        denominator += np.absolute(nh[v])
    if denominator == 0:
        neighborhood_weighted_avg = 0.;
    else:
        neighborhood_weighted_avg = numerator/denominator
    prediction = neighborhood_weighted_avg
    return prediction

In [15]:
matrices = [(R_like, R_like_dok,0), (R_reply, R_reply_dok,1), (R_retweet, R_retweet_dok,2), 
            (R_retweet_wc, R_retweet_wc_dok,3)]

In [1]:
#this takes too much time; won't complete

# def recommender_train_predict(df_train, df_test):
#     R_reply = sp.csr_matrix((df_train.reply_timestamp.astype('int8'), (df_train.engaging_user_id, 
#                                                                      df_train.tweet_id)))

#     R_retweet = sp.csr_matrix((df_train.retweet_timestamp.astype('int8'), (df_train.engaging_user_id, 
#                                                                          df_train.tweet_id)))

#     R_retweet_wc = sp.csr_matrix((df_train.retweet_with_comment_timestamp.astype('int8'), (df_train.engaging_user_id, 
#                                                                                df_train.tweet_id)))

#     R_like = sp.csr_matrix((df_train.like_timestamp.astype('int8'), (df_train.engaging_user_id, 
#                                                                    df_train.tweet_id)))
    
#     print(len(df_train), len(df_test))
    
#     R_like_dok = R_like.todok()
#     R_retweet_dok = R_retweet.todok()
#     R_retweet_wc_dok = R_retweet_wc.todok()
#     R_reply_dok = R_reply.todok()
#     pred_reply = []
#     pred_retweet = []
#     pred_retweet_wc = []
#     pred_like = []
#     for idx, row in df_test.iterrows():
#         pred_reply += [predict_rating_internal(row['engaging_user_id'], row['tweet_id'], R_reply_dok, R_reply)]
#         pred_retweet += [predict_rating_internal(row['engaging_user_id'], row['tweet_id'], R_retweet_dok, R_retweet)]
#         pred_retweet_wc += [predict_rating_internal(row['engaging_user_id'], row['tweet_id'] ,R_retweet_wc_dok, R_retweet_wc)]
#         pred_like += [predict_rating_internal(row['engaging_user_id'], row['tweet_id'], R_like_dok, R_like)]
# #     pred_reply = [predict_rating_internal(row[0],row[1], R_reply_dok, R_reply) for row in df_test.to_numpy().astype(int)]
# #     pred_retweet = [predict_rating_internal(row['engaging_user_id'],row['tweet_id'], R_retweet_dok, R_retweet) for row in df_test.to_numpy().astype(int)]
# #     pred_retweet_wc = [predict_rating_internal(row['engaging_user_id'],row['tweet_id'], R_retweet_wc_dok, R_retweet_wc) for row in df_test.to_numpy().astype(int)]
# #     pred_like = [predict_rating_internal(row['engaging_user_id'],row['tweet_id'], R_like_dok, R_like) for row in df_test.to_numpy().astype(int)]
#     print(pred_reply, type(pred_reply))
#     return pred_reply, pred_retweet, pred_retweet_wc, pred_like

# df_results = recsys_evaluate(r_df, recommender_train_predict, 'mf_random')
# df_results

In [17]:
for R_sparse, R_dok, index in matrices:
    if index == 0:
        pred_like = [predict_rating_internal(row[0],row[1], R_dok, R_sparse) for row in ratings_test.to_numpy().astype(int)]
    elif index == 1:
        pred_reply = [predict_rating_internal(row[0],row[1], R_dok, R_sparse) for row in ratings_test.to_numpy().astype(int)]
    elif index == 2:
        pred_retweet = [predict_rating_internal(row[0],row[1], R_dok, R_sparse) for row in ratings_test.to_numpy().astype(int)]
    elif index == 3:
         pred_retweet_wc = [predict_rating_internal(row[0],row[1], R_dok, R_sparse) for row in ratings_test.to_numpy().astype(int)]

In [27]:
def compute_prauc(pred, gt):
    prec, recall, thresh = precision_recall_curve(gt, pred)
    prauc = auc(recall, prec)
    return prauc

def calculate_ctr(gt):
    positive = len([x for x in gt if x == 1])
    ctr = positive/float(len(gt))
    return ctr

def compute_rce(pred, gt):
    cross_entropy = log_loss(gt, pred)
    data_ctr = calculate_ctr(gt)
    strawman_cross_entropy = log_loss(gt, [data_ctr for _ in range(len(gt))])
    return (1.0 - cross_entropy/strawman_cross_entropy)*100.0

In [28]:
prauc_like = compute_prauc(pred_like, ratings_test['like_timestamp'].astype('int8'))
prauc_reply = compute_prauc(pred_reply, ratings_test['reply_timestamp'].astype('int8'))
prauc_retweet = compute_prauc(pred_retweet, ratings_test['retweet_timestamp'].astype('int8'))
prauc_retweet_wc = compute_prauc(pred_retweet_wc, ratings_test['retweet_with_comment_timestamp'].astype('int8'))

In [29]:
rce_like = compute_rce(pred_like, ratings_test['like_timestamp'].astype('int8'))
rce_reply = compute_rce(pred_reply, ratings_test['reply_timestamp'].astype('int8'))
rce_retweet = compute_rce(pred_retweet, ratings_test['retweet_timestamp'].astype('int8'))
rce_retweet_wc = compute_rce(pred_retweet_wc, ratings_test['retweet_with_comment_timestamp'].astype('int8'))

In [30]:
print(prauc_like)
print(prauc_reply)
print(prauc_retweet)
print(prauc_retweet_wc)

0.7362612612612612
0.5077027027027027
0.5477477477477477
0.503018018018018


In [31]:
print(rce_like)
print(rce_reply)
print(rce_retweet)
print(rce_retweet_wc)

-2259.671908572321
-568.6700624439948
-946.8416397393288
-465.56195860819827


In [32]:
set(pred_retweet_wc)

{0.0}

In [33]:
set(pred_like)

{0.0}

In [34]:
set(pred_reply)

{0.0}

In [35]:
set(pred_retweet)

{0.0}