# U-I interactions extraction, 5-core filtering, re-indexing for Amazon Sports_and_Outdoors dataset
- Extracting U-I interactions and performing 5-core, re-indexing
- dataset located at: http://jmcauley.ucsd.edu/data/amazon/links.html, rating only file in "Small" subsets for experimentation

In [1]:
import os, csv
import pandas as pd

In [None]:
os.chdir('your dataset folder path')
os.getcwd()

## 先5-core过滤
## 5-core filtering

In [3]:
df = pd.read_csv('ratings_Sports_and_Outdoors.csv', names=['userID', 'itemID', 'rating', 'timestamp'], header=None)
print(f'shape: {df.shape}')
df[:5]

shape: (3268695, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,A3PMSRCL80KSA1,31852,4.0,1388275200
1,A1SNLWGLFXD70K,31852,4.0,1392940800
2,A1KJ4CVG87QW09,31852,4.0,1389657600
3,AA9ITO6ZLZW6,31852,5.0,1399507200
4,APJ5ULJ1RMZ4,31852,1.0,1398556800


In [4]:
k_core = 5
learner_id, course_id, tmstmp_str = 'userID', 'itemID', 'timestamp'

df.dropna(subset=[learner_id, course_id, tmstmp_str], inplace=True)
df.drop_duplicates(subset=[learner_id, course_id, tmstmp_str], inplace=True)
print(f'After dropped: {df.shape}')
df[:3]

After dropped: (3268695, 4)


Unnamed: 0,userID,itemID,rating,timestamp
0,A3PMSRCL80KSA1,31852,4.0,1388275200
1,A1SNLWGLFXD70K,31852,4.0,1392940800
2,A1KJ4CVG87QW09,31852,4.0,1389657600


In [5]:
from collections import Counter
import numpy as np

min_u_num, min_i_num = 5, 5

def get_illegal_ids_by_inter_num(df, field, max_num=None, min_num=None):
    if field is None:
        return set()
    if max_num is None and min_num is None:
        return set()

    max_num = max_num or np.inf
    min_num = min_num or -1

    ids = df[field].values
    inter_num = Counter(ids)
    ids = {id_ for id_ in inter_num if inter_num[id_] < min_num or inter_num[id_] > max_num}
    print(f'{len(ids)} illegal_ids_by_inter_num, field={field}')

    return ids


def filter_by_k_core(df):
    while True:
        ban_users = get_illegal_ids_by_inter_num(df, field=learner_id, max_num=None, min_num=min_u_num)
        ban_items = get_illegal_ids_by_inter_num(df, field=course_id, max_num=None, min_num=min_i_num)
        if len(ban_users) == 0 and len(ban_items) == 0:
            return

        dropped_inter = pd.Series(False, index=df.index)
        if learner_id:
            dropped_inter |= df[learner_id].isin(ban_users)
        if course_id:
            dropped_inter |= df[course_id].isin(ban_items)
        print(f'{len(dropped_inter)} dropped interactions')
        df.drop(df.index[dropped_inter], inplace=True)



## k-core

In [6]:
filter_by_k_core(df)
print(f'k-core shape: {df.shape}')
print(f'shape after k-core: {df.shape}')
df[:2]

1906153 illegal_ids_by_inter_num, field=userID
376127 illegal_ids_by_inter_num, field=itemID
3268695 dropped interactions
22213 illegal_ids_by_inter_num, field=userID
54919 illegal_ids_by_inter_num, field=itemID
589029 dropped interactions
18323 illegal_ids_by_inter_num, field=userID
3743 illegal_ids_by_inter_num, field=itemID
422478 dropped interactions
2298 illegal_ids_by_inter_num, field=userID
4388 illegal_ids_by_inter_num, field=itemID
349749 dropped interactions
3331 illegal_ids_by_inter_num, field=userID
639 illegal_ids_by_inter_num, field=itemID
326238 dropped interactions
579 illegal_ids_by_inter_num, field=userID
1012 illegal_ids_by_inter_num, field=itemID
311188 dropped interactions
897 illegal_ids_by_inter_num, field=userID
169 illegal_ids_by_inter_num, field=itemID
305054 dropped interactions
155 illegal_ids_by_inter_num, field=userID
308 illegal_ids_by_inter_num, field=itemID
300866 dropped interactions
301 illegal_ids_by_inter_num, field=userID
47 illegal_ids_by_inter_nu

Unnamed: 0,userID,itemID,rating,timestamp
564,AIXZKN4ACSKI,1881509818,5.0,1390694400
565,A1L5P841VIO02V,1881509818,5.0,1328140800


## Re-index

In [7]:
df.reset_index(drop=True, inplace=True)

In [8]:

i_mapping_file = 'i_id_mapping.csv'
u_mapping_file = 'u_id_mapping.csv'

splitting = [0.8, 0.1, 0.1]
uid_field, iid_field = learner_id, course_id

uni_users = pd.unique(df[uid_field])
uni_items = pd.unique(df[iid_field])

# start from 0
u_id_map = {k: i for i, k in enumerate(uni_users)}
i_id_map = {k: i for i, k in enumerate(uni_items)}

df[uid_field] = df[uid_field].map(u_id_map)
df[iid_field] = df[iid_field].map(i_id_map)
df[uid_field] = df[uid_field].astype(int)
df[iid_field] = df[iid_field].astype(int)

# dump
rslt_dir = './'
u_df = pd.DataFrame(list(u_id_map.items()), columns=['user_id', 'userID'])
i_df = pd.DataFrame(list(i_id_map.items()), columns=['asin', 'itemID'])

u_df.to_csv(os.path.join(rslt_dir, u_mapping_file), sep='\t', index=False)
i_df.to_csv(os.path.join(rslt_dir, i_mapping_file), sep='\t', index=False)
print(f'mapping dumped...')

mapping dumped...


In [None]:

# =========2. splitting
print(f'splitting ...')
tot_ratio = sum(splitting)
# remove 0.0 in ratios
ratios = [i for i in splitting if i > .0]
ratios = [_ / tot_ratio for _ in ratios]
split_ratios = np.cumsum(ratios)[:-1]

#df[tmstmp_str] = df[tmstmp_str].map(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ"))
split_ratios

In [10]:
ts_id = 'timestamp'

split_timestamps = list(np.quantile(df[ts_id], split_ratios))
# get df training dataset unique users/items
df_train = df.loc[df[ts_id] < split_timestamps[0]].copy()
df_val = df.loc[(split_timestamps[0] <= df[ts_id]) & (df[ts_id] < split_timestamps[1])].copy()
df_test = df.loc[(split_timestamps[1] <= df[ts_id])].copy()

x_label, rslt_file = 'x_label', 'sports-indexed.inter'
df_train[x_label] = 0
df_val[x_label] = 1
df_test[x_label] = 2
temp_df = pd.concat([df_train, df_val, df_test])
temp_df = temp_df[[learner_id, course_id, 'rating', ts_id, x_label]]
print(f'columns: {temp_df.columns}')

temp_df.columns = [learner_id, course_id, 'rating', ts_id, x_label]

temp_df.to_csv(os.path.join(rslt_dir, rslt_file), sep='\t', index=False)
temp_df[:5]
#print('done!')

columns: Index(['userID', 'itemID', 'rating', 'timestamp', 'x_label'], dtype='object')


Unnamed: 0,userID,itemID,rating,timestamp,x_label
1,1,0,5.0,1328140800,0
2,2,0,4.0,1330387200,0
3,3,0,4.0,1328400000,0
4,4,0,4.0,1366675200,0
5,5,0,5.0,1351814400,0


## Reload

In [11]:
indexed_df = pd.read_csv(rslt_file, sep='\t')
print(f'shape: {indexed_df.shape}')
indexed_df[:4]

shape: (296337, 5)


Unnamed: 0,userID,itemID,rating,timestamp,x_label
0,1,0,5.0,1328140800,0
1,2,0,4.0,1330387200,0
2,3,0,4.0,1328400000,0
3,4,0,4.0,1366675200,0


In [12]:
u_uni = indexed_df[learner_id].unique()
c_uni = indexed_df[course_id].unique()

print(f'# of unique learners: {len(u_uni)}')
print(f'# of unique courses: {len(c_uni)}')

print('min/max of unique learners: {0}/{1}'.format(min(u_uni), max(u_uni)))
print('min/max of unique courses: {0}/{1}'.format(min(c_uni), max(c_uni)))


# of unique learners: 35598
# of unique courses: 18357
min/max of unique learners: 0/35597
min/max of unique courses: 0/18356
