# 从ratings_Sports_and_Outdoors.csv文件中提取U-I交互图, 5-core后重新编号
- Extract U-I interactions, apply 5-core filtering, and re-index the user/item IDs
- Dataset source: http://jmcauley.ucsd.edu/data/amazon/links.html (ratings-only files from the "Small" subsets, used for experimentation)

In [1]:
import os, csv
import pandas as pd

In [2]:
# Switch the working directory to the dataset folder; the relative paths used
# in later cells are resolved from here.
os.chdir('../data/Inha')
# Bare expression: displays the resulting working directory as the cell output.
os.getcwd()

'/home/inhamath/competition/MMRec/data/Inha'

## 先5-core过滤
## 5-core filtering

In [3]:
# Load the raw interactions. header=0 combined with names= replaces the file's
# own header row with the three column names given here.
df = pd.read_csv('../../../data/raw/train.csv', names=['userID', 'itemID', 'rating'], header = 0)
print(f'shape: {df.shape}')
# Preview the first five rows.
df[:5]

shape: (1254441, 3)


Unnamed: 0,userID,itemID,rating
0,114341,9124,5.0
1,114341,32109,4.0
2,114341,44195,5.0
3,114341,24427,5.0
4,114341,10994,5.0


In [4]:
# NOTE(review): k_core is assigned here but not referenced by any other visible
# cell; the thresholds the filter actually reads are min_u_num / min_i_num.
k_core = 5
# Names of the columns holding the user and item IDs.
learner_id, course_id = 'userID', 'itemID'

# Remove rows missing either ID, then keep a single row per (user, item) pair.
df.dropna(subset=[learner_id, course_id], inplace=True)
df.drop_duplicates(subset=[learner_id, course_id], inplace=True)
print(f'After dropped: {df.shape}')
# Preview the first three rows.
df[:3]

After dropped: (1254441, 3)


Unnamed: 0,userID,itemID,rating
0,114341,9124,5.0
1,114341,32109,4.0
2,114341,44195,5.0


In [5]:
from collections import Counter
import numpy as np

# Minimum interaction counts per user / per item, read by filter_by_k_core.
# NOTE(review): with both set to 0 no ID can fall below the threshold, so the
# k-core pass removes nothing — confirm this is intended.
min_u_num, min_i_num = 0, 0

def get_illegal_ids_by_inter_num(df, field, max_num=None, min_num=None):
    """Collect the IDs in ``df[field]`` whose occurrence count is out of bounds.

    Args:
        df: DataFrame holding one interaction per row.
        field: Name of the ID column to count; ``None`` disables the check.
        max_num: Largest allowed number of occurrences; a falsy value
            (``None`` or 0) means no upper bound.
        min_num: Smallest allowed number of occurrences; a falsy value
            (``None`` or 0) means no lower bound.

    Returns:
        The set of IDs occurring fewer than ``min_num`` or more than
        ``max_num`` times. Empty when ``field`` is ``None`` or when both
        bounds are ``None``.
    """
    if field is None or (max_num is None and min_num is None):
        return set()

    # Falsy bounds fall back to values that no occurrence count can violate.
    upper = max_num or np.inf
    lower = min_num or -1

    occurrences = Counter(df[field].values)
    illegal = set()
    for key, freq in occurrences.items():
        if not (lower <= freq <= upper):
            illegal.add(key)
    print(f'{len(illegal)} illegal_ids_by_inter_num, field={field}')

    return illegal


def filter_by_k_core(df):
    """Repeatedly drop interactions of under-represented users/items, in place.

    Each pass asks get_illegal_ids_by_inter_num for the users with fewer than
    ``min_u_num`` interactions and the items with fewer than ``min_i_num``
    interactions (both read from module scope, as are ``learner_id`` and
    ``course_id``), then removes every row that involves one of them. Removing
    rows can push other IDs below the threshold, so the process repeats until
    a pass flags nothing.

    Args:
        df: Interaction DataFrame; modified in place.

    Returns:
        None.
    """
    while True:
        ban_users = get_illegal_ids_by_inter_num(df, field=learner_id, max_num=None, min_num=min_u_num)
        ban_items = get_illegal_ids_by_inter_num(df, field=course_id, max_num=None, min_num=min_i_num)
        if not ban_users and not ban_items:
            return

        # Boolean mask over the current rows: True marks a row to remove.
        dropped_inter = pd.Series(False, index=df.index)
        if learner_id:
            dropped_inter |= df[learner_id].isin(ban_users)
        if course_id:
            dropped_inter |= df[course_id].isin(ban_items)
        # Count the True entries; the mask's length is the total number of
        # rows, not the number being dropped.
        print(f'{int(dropped_inter.sum())} dropped interactions')
        df.drop(df.index[dropped_inter], inplace=True)



## k-core

In [6]:
# Apply the in-place k-core filter, then report the resulting shape
# (both print calls below output the same df.shape).
filter_by_k_core(df)
print(f'k-core shape: {df.shape}')
print(f'shape after k-core: {df.shape}')
# Preview the first ten rows.
df[:10]

0 illegal_ids_by_inter_num, field=userID
0 illegal_ids_by_inter_num, field=itemID
k-core shape: (1254441, 3)
shape after k-core: (1254441, 3)


Unnamed: 0,userID,itemID,rating
0,114341,9124,5.0
1,114341,32109,4.0
2,114341,44195,5.0
3,114341,24427,5.0
4,114341,10994,5.0
5,88818,50223,3.0
6,88818,14862,1.0
7,88818,44195,1.0
8,85622,26388,5.0
9,85622,44195,3.0


## Re-index

In [7]:
# Renumber the row index from 0, discarding the previous index labels.
df.reset_index(drop=True, inplace=True)

In [8]:
# Output location and file names for the ID-mapping tables.
rslt_dir = './'
u_mapping_file = 'u_id_mapping.csv'
i_mapping_file = 'i_id_mapping.csv'

# Split ratios, read by the splitting cells further down.
splitting = [0.8, 0.2]
uid_field = learner_id
iid_field = course_id

# Distinct raw IDs, in order of first appearance.
uni_users = pd.unique(df[uid_field])
uni_items = pd.unique(df[iid_field])

# Raw ID -> dense index, counting up from 0.
u_id_map = dict(zip(uni_users, range(len(uni_users))))
i_id_map = dict(zip(uni_items, range(len(uni_items))))

# Replace the raw IDs in df with their integer indices.
df[uid_field] = df[uid_field].map(u_id_map).astype(int)
df[iid_field] = df[iid_field].map(i_id_map).astype(int)

# Write both mapping tables as tab-separated files.
u_df = pd.DataFrame(list(u_id_map.items()), columns=['user_id', 'userID'])
i_df = pd.DataFrame(list(i_id_map.items()), columns=['asin', 'itemID'])
u_df.to_csv(os.path.join(rslt_dir, u_mapping_file), sep='\t', index=False)
i_df.to_csv(os.path.join(rslt_dir, i_mapping_file), sep='\t', index=False)
print(f'mapping dumped...')

mapping dumped...


In [9]:
# =========2. splitting
print(f'splitting ...')
tot_ratio = sum(splitting)
# remove 0.0 in ratios
ratios = [i for i in splitting if i > .0]
ratios = [_ / tot_ratio for _ in ratios]
split_ratios = np.cumsum(ratios)[:-1]

#df[tmstmp_str] = df[tmstmp_str].map(lambda x: datetime.strptime(x, "%Y-%m-%dT%H:%M:%SZ"))
split_ratios

splitting ...


array([0.8])

In [10]:
# get df training dataset unique users/items
df_train = df.sample(frac = splitting[0], random_state = 42)
df_val = df.drop(df_train.index)
df_test = df.drop(df_train.index).drop(df_val.index)

x_label, rslt_file = 'x_label', 'Inha.inter'
df_train[x_label] = 0
df_val[x_label] = 1
df_test[x_label] = 2
temp_df = pd.concat([df_train, df_val, df_test])
temp_df = temp_df[[learner_id, course_id, 'rating', x_label]]
print(f'columns: {temp_df.columns}')

temp_df.columns = [learner_id, course_id, 'rating', x_label]

temp_df.to_csv(os.path.join(rslt_dir, rslt_file), sep='\t', index=False)
temp_df[:5]

#print('done!')

columns: Index(['userID', 'itemID', 'rating', 'x_label'], dtype='object')


Unnamed: 0,userID,itemID,rating,x_label
304196,30606,7659,5.0,0
368392,38338,5631,2.0,0
111968,10107,7026,5.0,0
677958,82008,17995,5.0,0
967403,131088,1136,5.0,0


## Reload

In [11]:
# Read the tab-separated interaction file back in to check what was written.
indexed_df = pd.read_csv(rslt_file, sep='\t')
print(f'shape: {indexed_df.shape}')
# Preview the first four rows.
indexed_df[:4]

shape: (1254441, 4)


Unnamed: 0,userID,itemID,rating,x_label
0,30606,7659,5.0,0
1,38338,5631,2.0,0
2,10107,7026,5.0,0
3,82008,17995,5.0,0


In [12]:
# Distinct re-indexed user and item IDs found in the reloaded file.
u_uni = indexed_df[learner_id].unique()
c_uni = indexed_df[course_id].unique()

print(f'# of unique learners: {len(u_uni)}')
print(f'# of unique courses: {len(c_uni)}')

# Smallest and largest ID per column, printed next to the counts above.
print('min/max of unique learners: {0}/{1}'.format(min(u_uni), max(u_uni)))
print('min/max of unique courses: {0}/{1}'.format(min(c_uni), max(c_uni)))


# of unique learners: 192403
# of unique courses: 62989
min/max of unique learners: 0/192402
min/max of unique courses: 0/62988


In [13]:
# Bare expression: displays the name of the interaction file as the cell output.
rslt_file

'Inha.inter'