In [1]:
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split

# Preprocess Data

In [2]:
# Load the tab-separated user-to-user trust statements; the file has no header row,
# so the column names are supplied here.
user_rating = pd.read_csv(
    'raw_data/user_rating.txt',
    sep='\t',
    names=['user_id', 'other_id', 'trust', 'CREATION'],
)
user_rating.head()


Unnamed: 0,user_id,other_id,trust,CREATION
0,3287060356,232085,-1,2001/01/10
1,3288305540,709420,1,2001/01/10
2,3290337156,204418,-1,2001/01/10
3,3294138244,269243,-1,2001/01/10
4,3294138244,170692484,-1,2001/01/10


In [3]:
# The creation date is not used downstream, so remove it.
user_rating = user_rating.drop(columns=['CREATION'])

In [4]:
# Load the pipe-separated item table; column names are supplied because the file has no header row.
mc = pd.read_csv(
    'raw_data/mc.txt',
    sep='|',
    names=['item_id', 'creater_id', 'subject'],
)
mc.head()

Unnamed: 0,item_id,creater_id,subject
0,1445594,718357,149002400000.0
1,1445595,220568,149003600000.0
2,1445596,717325,5303145000.0
3,1445597,360156,192620900000.0
4,1445598,718857,149002200000.0


In [5]:
# Only item_id and subject are needed from the item table.
mc = mc.drop(columns=['creater_id'])

In [6]:
# Load the tab-separated rating records; column names are supplied because the file has no header row.
rating = pd.read_csv(
    'raw_data/rating.txt',
    sep='\t',
    names=[
        'item_id', 'user_id', 'rating', 'status',
        'CREATION', 'LAST_MODIFIED', 'TYPE', 'VERTICAL_ID',
    ],
)
rating

Unnamed: 0,item_id,user_id,rating,status,CREATION,LAST_MODIFIED,TYPE,VERTICAL_ID
0,139431556,591156,5,0,2001/01/10,,1,2518365.0
1,139431556,1312460676,5,0,2001/01/10,,1,2518365.0
2,139431556,204358,5,0,2001/01/10,,1,2518365.0
3,139431556,368725,5,0,2001/01/10,,1,2518365.0
4,139431556,277629,5,0,2001/01/10,,1,2518365.0
...,...,...,...,...,...,...,...,...
13668315,891503,311238,5,0,2001/01/10,2001/12/27,1,2522499.0
13668316,891503,210412,5,0,2001/01/10,2001/12/27,1,2522499.0
13668317,891503,351471,5,0,2001/01/10,2001/12/27,1,2522499.0
13668318,891503,394639,5,0,2001/01/10,2001/12/27,1,2522499.0


In [7]:
# Remove the date/type/vertical columns, leaving item_id, user_id, rating and status.
rating = rating.drop(columns=['CREATION', 'LAST_MODIFIED', 'TYPE', 'VERTICAL_ID'])

In [8]:
# Keep only the interactions whose rating is strictly greater than 3.
is_positive = rating['rating'] > 3
filtered_rating = rating.loc[is_positive]


In [9]:
# Attach each item's subject from mc via a left join on item_id
# (ratings whose item is missing from mc are kept, with subject = NaN).
joined = filtered_rating.merge(mc, how='left', on='item_id')
joined

Unnamed: 0,item_id,user_id,rating,status,subject
0,139431556,591156,5,0,3.357800e+04
1,139431556,1312460676,5,0,3.357800e+04
2,139431556,204358,5,0,3.357800e+04
3,139431556,368725,5,0,3.357800e+04
4,139431556,277629,5,0,3.357800e+04
...,...,...,...,...,...
12581743,891503,311238,5,0,3.423066e+11
12581744,891503,210412,5,0,3.423066e+11
12581745,891503,351471,5,0,3.423066e+11
12581746,891503,394639,5,0,3.423066e+11


In [10]:
# Number of joined rows per distinct subject value, most frequent first.
joined['subject'].value_counts()

6.854272e+06    229086
5.262271e+08    164992
4.623950e+08    128444
1.490041e+11     77245
7.640704e+06     57072
                 ...  
1.624170e+05         1
7.636400e+04         1
1.625060e+05         1
1.589020e+05         1
6.691788e+09         1
Name: subject, Length: 109758, dtype: int64

In [None]:
#use kmeans to cluster subjects into 6 topics
# random_state pins the centroid initialisation so the topic labels are reproducible
# after a kernel restart; n_init is given explicitly because its default differs
# between scikit-learn releases.
kmeans = KMeans(n_clusters=6, n_init=10, random_state=42)
# Rows with any missing value (e.g. no subject after the left join) cannot be clustered.
# .copy() gives an independent frame so adding the `topic` column below is unambiguous.
joined = joined.dropna().copy()
kmeans.fit(joined[['subject']])
# labels_ is aligned with the rows passed to fit, i.e. with `joined` after dropna.
joined['topic'] = kmeans.labels_
joined.head()



In [None]:
#get random sample according to the ratio of each topic
TOTAL_SAMPLES = 1170000  # overall number of rows to draw across all topics
SAMPLE_SEED = 42         # fixed seed so the drawn sample is reproducible

class_counts = joined['topic'].value_counts()
class_ratios = class_counts / len(joined)
# Per-topic quota proportional to the topic's share of `joined` (fractions are truncated).
target_samples_per_class = (class_ratios * TOTAL_SAMPLES).astype(int)
# NOTE(review): replace=True samples with replacement, so the same interaction can be
# drawn more than once and sampled_data may contain duplicate rows - confirm this is intended.
# reset_index() keeps the original row label as an `index` column.
sampled_data = pd.concat([
    joined[joined['topic'] == topic].sample(
        target_samples_per_class[topic], replace=True, random_state=SAMPLE_SEED
    )
    for topic in joined['topic'].unique()
]).reset_index()
sampled_data

In [None]:
#data who adopt the 10-core filtering setting(items emerge more than 10 times and user emerges more than 10 times)
# Step 1: keep only items that occur in more than 10 rows.
item_id_counts = sampled_data['item_id'].value_counts()
item_id_gt_10 = item_id_counts[item_id_counts > 10].index.tolist()
sampled_data = sampled_data[sampled_data['item_id'].isin(item_id_gt_10)]

# Step 2: among the remaining rows, keep only users that occur in more than 10 rows.
# NOTE(review): this is a single pass - removing users in step 2 can push some items
# back to 10 or fewer rows; iterate both steps until stable if a strict k-core is required.
user_id_counts = sampled_data['user_id'].value_counts()
user_id_gt_10 = user_id_counts[user_id_counts > 10].index.tolist()
# .copy() detaches the result from the frame it was filtered from, so the column
# assignments in later cells operate on an independent frame (no SettingWithCopyWarning).
sampled_data = sampled_data[sampled_data['user_id'].isin(user_id_gt_10)].copy()
sampled_data['user_id'].unique().shape

In [None]:
# Count, per user_id, the rows of user_rating with trust == 1.
# NOTE(review): this groups by `user_id` (the first column of user_rating); if "fans"
# means users who trust someone, the grouping key may need to be `other_id` - confirm.
fan = user_rating[user_rating['trust'] == 1]
fan_count = fan.groupby('user_id')['trust'].count()
# Vectorised lookup instead of a per-user Python loop: every row gets the count of its
# user_id, and users absent from fan_count get NaN. The column is always created,
# even when no user matches.
sampled_data = sampled_data.assign(fan_count=sampled_data['user_id'].map(fan_count))

In [None]:
# Rows whose fan_count is missing are given a count of 0.
sampled_data['fan_count']=sampled_data['fan_count'].fillna(0)

In [None]:
#generate new id for users
# Replace the raw user ids with consecutive integers 0..n-1, numbered in order of first appearance.
unique_ids = sampled_data['user_id'].unique()
id_mapping = dict(zip(unique_ids, range(len(unique_ids))))
sampled_data['user_id'] = sampled_data['user_id'].map(id_mapping)
sampled_data['user_id'].unique()

In [None]:
#generate new id for items
# Replace the raw item ids with consecutive integers 0..n-1, numbered in order of first appearance.
unique_ids = sampled_data['item_id'].unique()
id_mapping = dict(zip(unique_ids, range(len(unique_ids))))
sampled_data['item_id'] = sampled_data['item_id'].map(id_mapping)
sampled_data['item_id'].unique()

In [None]:
# split into train set, validation set and test set according to the ratio 6:2:2
def sample_data(df, n_additional=120000, random_state=42):
    """Build a training set that contains at least one row for every user.

    One row is drawn per ``user_id``; then ``n_additional`` further rows are drawn
    at random from the rows of ``df`` that do not equal any of the per-user picks.

    Parameters
    ----------
    df : pandas.DataFrame
        Interaction table with a ``user_id`` column.
    n_additional : int, default 120000
        Number of extra rows drawn on top of the one-per-user rows.
    random_state : int, default 42
        Seed used for both the per-user draw and the extra draw, so the result
        is reproducible.

    Returns
    -------
    pandas.DataFrame
        The selected rows with a fresh 0..n-1 index. An ``index`` column (left
        behind by an earlier ``reset_index()``) is removed if present.

    Raises
    ------
    ValueError
        If fewer than ``n_additional`` rows remain after the per-user draw.
    """
    # One interaction per user guarantees every user shows up in the training set.
    per_user = (
        df.groupby('user_id')
        .sample(n=1, random_state=random_state)
        .reset_index(drop=True)
    )
    # Anti-join on all shared columns: rows of df equal to a picked row are tagged
    # 'both' and excluded; everything else is 'left_only'.
    merged = df.merge(per_user, indicator=True, how='outer')
    remaining_data = merged.loc[merged['_merge'] == 'left_only'].drop(columns='_merge')
    additional_samples = remaining_data.sample(n=n_additional, random_state=random_state)
    combined = pd.concat([per_user, additional_samples], ignore_index=True)
    # errors='ignore' so the function also works on frames without an `index` column.
    return combined.drop(columns='index', errors='ignore')
def remove_train_data(sampled_data, train_data):
    """Return the rows of ``sampled_data`` that do not appear in ``train_data``.

    Rows are matched on every column the two frames share. An ``index`` column
    is removed from the result if present.

    Parameters
    ----------
    sampled_data : pandas.DataFrame
        Full interaction table.
    train_data : pandas.DataFrame
        Rows already assigned to the training set.

    Returns
    -------
    pandas.DataFrame
        Rows found only in ``sampled_data``.
    """
    # Anti-join: the indicator column marks rows present only on the left side.
    merged = sampled_data.merge(train_data, indicator=True, how='outer')
    remaining_data = merged.loc[merged['_merge'] == 'left_only'].drop(columns='_merge')
    # errors='ignore' so frames without an `index` column do not raise KeyError.
    return remaining_data.drop(columns='index', errors='ignore')

# Training set: one row per user plus the extra random rows drawn inside sample_data.
train_data = sample_data(sampled_data)
# Rows of sampled_data that were not selected for training.
temp_data = remove_train_data(sampled_data, train_data)
# Split the held-out rows evenly into validation and test sets (fixed seed for reproducibility).
val_data, test_data = train_test_split(temp_data, test_size=0.5, random_state=42)

## Extract features of Recommender System

In [None]:
# Group the item ids by topic. Each item is assigned the topic of its first row
# in sampled_data.
unique_item = sampled_data['item_id'].unique()
# drop_duplicates keeps the first row of every item, so one pass over this frame
# replaces filtering the whole table once per item.
first_row_per_item = sampled_data.drop_duplicates('item_id')
category_item_dict = {}
for item, topic in zip(first_row_per_item['item_id'], first_row_per_item['topic']):
    category_item_dict.setdefault(topic, []).append(item)
len(category_item_dict)


In [None]:
# Number of items listed under each topic.
category_num_dict = {topic: len(items) for topic, items in category_item_dict.items()}
category_num_dict



With this preprocessing, our dataset turns out to be noticeably more imbalanced across items than the dataset described in the paper.

In [None]:
# Topic label of every row in sampled_data, in row order.
# NOTE(review): this produces one entry per interaction row, not one per unique item -
# confirm that the consumer of item_categories_list expects this layout.
item_categories_list=sampled_data['topic'].tolist()

In [None]:
# Map every user in the test split to the list of item ids in their test rows.
grouped_test = test_data.groupby('user_id')['item_id'].apply(list)
test_dict = dict(grouped_test.items())
len(test_dict)

In [None]:
# Map every user in the training split to the list of item ids in their training rows.
grouped_train = train_data.groupby('user_id')['item_id'].apply(list)
train_dict = dict(grouped_train.items())


In [None]:
# Length of the longest per-user item list in train_dict.
# Users are looked up by the integer keys 0..len(train_dict)-1.
a = [len(train_dict[user]) for user in range(len(train_dict))]
max(a)

In [None]:
# Map every user in the validation split to the list of item ids in their validation rows.
grouped_val = val_data.groupby('user_id')['item_id'].apply(list)
val_dict = dict(grouped_val.items())
val_dict

In [None]:
#use number of other_id's trust relationship to count the social utility
# Integer fan count of every row in train_data, in row order.
# NOTE(review): this yields one entry per training row, not one per unique user -
# confirm that the consumer of user_num_fans_list expects this layout.
user_num_fans_list = [int(count) for count in train_data['fan_count']]
user_num_fans_list
        

In [None]:
import pickle
from pathlib import Path

# Output directory for the pickled artifacts; created if it does not exist yet so
# the cell does not fail with FileNotFoundError on a fresh checkout.
# NOTE(review): the loading cells further down read from 'data/Epinions/...',
# a different relative path - confirm both point at the same directory.
output_dir = Path('nips23_social_igf/data/Epinions')
output_dir.mkdir(parents=True, exist_ok=True)

# File stem -> object; each entry is written to '<stem>.pkl'.
artifacts = {
    'category_item_dict': category_item_dict,
    'category_num_dict': category_num_dict,
    'item_categories_list': item_categories_list,
    'test_dict': test_dict,
    'train_dict': train_dict,
    'val_dict': val_dict,
    'user_num_fans_list': user_num_fans_list,
}
for name, obj in artifacts.items():
    with open(output_dir / f'{name}.pkl', 'wb') as f:
        pickle.dump(obj, f)

In [3]:
import pickle
import pandas as pd

# Load the object stored in category_item_dict.pkl (assumes the file exists at this path).
with open('data/Epinions/category_item_dict.pkl', 'rb') as file:
    data = pickle.load(file)

data.keys()

dict_keys([3, 5, 4, 10, 19, 18])

In [2]:
import pickle
import pandas as pd

# Load the object stored in category_num_dict.pkl (assumes the file exists at this path).
with open('data/Epinions/category_num_dict.pkl', 'rb') as file:
    data1 = pickle.load(file)

data1

defaultdict(int, {3: 3994, 5: 2635, 4: 2023, 10: 1613, 19: 1987, 18: 1430})