In [3]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import json
import networkx as nx
from community import best_partition

In [4]:
# Load the raw check-in log and order rows by the "datetime" column.
# The column is only parsed with pd.to_datetime in a later cell, so this
# sort operates on the values exactly as read from the CSV.
raw = pd.read_csv("./data/gowalla_checkins.csv").sort_values("datetime")

In [5]:
# Parse the timestamp column into pandas datetimes.
raw["datetime"] = pd.to_datetime(raw["datetime"])
# `Series.dt.week` was deprecated in pandas 1.1 and removed in pandas 2.0;
# `dt.isocalendar().week` is the supported replacement. Cast to int64 so the
# column has a plain integer dtype rather than the nullable UInt32 it returns.
# NOTE(review): a bare week-of-year number is shared by the same week of
# different years -- confirm that is intended where "week" is used as a
# grouping key further down.
raw["week"] = raw["datetime"].dt.isocalendar().week.astype("int64")

In [6]:
def get_month(x):
    """Return the year and month of ``x`` concatenated into one string.

    ``x`` must expose integer ``year`` and ``month`` attributes. The month is
    not zero-padded, so January 2010 yields ``"20101"`` and December 2009
    yields ``"200912"``.
    """
    return "{}{}".format(x.year, x.month)

In [7]:
# Derive a "<year><month>" string key for every check-in via get_month.
raw["month"] = raw["datetime"].apply(get_month)

In [8]:
# 80/20 positional split: the first 80% of rows (in the current row order of
# `raw`) form the training set, the remaining rows the test set.
train_length = int(0.8 * len(raw))
train_set = raw.iloc[:train_length]
test_set = raw.iloc[train_length:]

In [9]:
# Keep only places with more than 100 training check-ins.
place_counts = train_set.placeid.value_counts()
train_item = place_counts[place_counts > 100].index.tolist()
train_set = train_set[train_set.placeid.isin(train_item)]

In [10]:
len(train_set.placeid.unique())

32316

## Find connected users

In [122]:
# Friendship edge list; later cells read its `userid1` / `userid2` columns.
edges = pd.read_csv("./data/gowalla_friendship.csv")

In [12]:
# Undirected friendship graph: every row of `edges` is added as one edge.
g_test = nx.Graph()
g_test.add_edges_from(edges.values)

In [13]:
# Restrict the graph to users that appear in the training set.
g_test = nx.subgraph(g_test,list(train_set.userid.unique()))

In [14]:
print(g_test.number_of_nodes(), len(train_set.userid.unique()))

197288 197288


In [15]:
# Load a previously saved community assignment. The same path is written by
# the `par_df.to_csv(...)` cell below, which recomputes the partition.
par_df = pd.read_csv("./data/partition.csv")

In [103]:
# Community detection on the friendship subgraph using `best_partition` from
# the `community` package. The result is consumed below as a mapping of
# node id -> community id.
par = best_partition(g_test)

In [104]:
# Two-column table of (user id, community id), one row per node in `par`.
par_df = pd.DataFrame(list(par.items()), columns=["uid", "cluster"])

In [105]:
# Cache the partition on disk (row index omitted) so it can be reloaded.
par_df.to_csv("./data/partition.csv", index=False)

In [16]:
par_df.cluster.value_counts()[par_df.cluster.value_counts()>2500]

0     39547
13    31181
3     15409
8     10196
6      8958
32     8477
17     8273
2      7756
1      6709
5      4983
11     4447
29     4059
10     3375
7      3247
50     3131
Name: cluster, dtype: int64

In [155]:
# Sanity check: number of connected components of the friendship graph
# restricted to `connect_user`.
# NOTE(review): `connect_user` is assigned in the next cell, so on a fresh
# kernel this cell only works if that cell has been run first.
len(list(nx.connected_components(nx.subgraph(g_test,connect_user))))

1

In [153]:
# Users belonging to community 6 of the partition; this list is what
# filter_useritem uses to select users.
# NOTE(review): the cluster id 6 is hard-coded -- the id assigned to a given
# community may differ if the partition is recomputed; confirm.
connect_user = list(par_df[par_df.cluster==6].uid.values)

In [154]:
len(train_set[train_set.userid.isin(connect_user)].userid.unique())

8958

In [95]:
def get_id(data):
    """Build lookup tables between raw ids and contiguous 0-based indices.

    Indices follow the order of first appearance of each id in
    ``data.userid`` / ``data.placeid``.

    Returns:
        (user_map, user_inv, item_map, item_inv) where the ``*_map`` dicts go
        from raw id to index and the ``*_inv`` dicts from index to raw id.
    """
    user_inv = dict(enumerate(data.userid.unique()))
    item_inv = dict(enumerate(data.placeid.unique()))
    user_map = {raw_id: idx for idx, raw_id in user_inv.items()}
    item_map = {raw_id: idx for idx, raw_id in item_inv.items()}
    return user_map, user_inv, item_map, item_inv
def normalize_id(data,user_map, item_map):
    """Return a copy of ``data`` with raw ids replaced by mapped indices.

    Args:
        data: DataFrame with ``userid`` and ``placeid`` columns.
        user_map: dict from raw user id to index.
        item_map: dict from raw place id to index.

    Returns:
        A new DataFrame with the same columns in the same order; ``data``
        itself is left untouched. Writing into the caller's frame would
        modify a filtered slice (SettingWithCopyWarning) and make re-running
        the calling cell remap already-remapped ids.

    Raises:
        KeyError: if an id in ``data`` is missing from the corresponding map.
    """
    return data.assign(
        userid=data["userid"].apply(lambda x: user_map[x]),
        placeid=data["placeid"].apply(lambda x: item_map[x]),
    )

In [96]:
def filter_useritem(train_set, test_set, users=None, min_checkins=100):
    """Restrict both splits to the selected users and to popular places.

    Args:
        train_set: training check-ins with ``userid`` and ``placeid`` columns.
        test_set: test check-ins with the same columns.
        users: ids of the users to keep. When omitted, the notebook-level
            ``connect_user`` list is used (the original behaviour).
        min_checkins: a place is kept only if it has strictly more than this
            many check-ins in ``train_set`` (counted before user filtering).

    Returns:
        ``(train_set, test_set)`` filtered to rows whose user is in ``users``
        and whose place passed the popularity threshold.
    """
    if users is None:
        # Fall back to the global defined in an earlier cell.
        users = connect_user
    place_counts = train_set.placeid.value_counts()
    popular_items = list(place_counts[place_counts > min_checkins].index)

    train_mask = train_set.userid.isin(users) & train_set.placeid.isin(popular_items)
    test_mask = test_set.userid.isin(users) & test_set.placeid.isin(popular_items)
    return train_set[train_mask], test_set[test_mask]

In [156]:
# a / b: training and test check-ins restricted to the selected users and
# popular places.
a, b = filter_useritem(train_set, test_set)

In [157]:
print(a.shape,b.shape)

(276740, 5) (60797, 5)


In [158]:
b[b.userid.isin(a.userid.unique())].shape

(60797, 5)

In [159]:
# Keep only test rows whose user AND place both occur in the filtered
# training data, so every test id has a mapped index.
seen_users = a.userid.unique()
seen_places = a.placeid.unique()
b = b[b.userid.isin(seen_users) & b.placeid.isin(seen_places)]

In [160]:
# Build the id lookup tables from the filtered training rows, then remap
# both splits to contiguous indices with the same tables.
user_map, user_inv, item_map, item_inv = get_id(a)
train, test = normalize_id(a,user_map,item_map), normalize_id(b, user_map, item_map)

In [161]:
print(len(train_set.userid.unique()),len(user_map),len(a.userid.unique()))

197288 8958 8958


In [162]:
print(train.shape,test.shape)

(276740, 5) (59932, 5)


In [163]:
print((len(train.placeid.unique())))

7230


## Create training set

In [105]:
# Distinct (normalised) user and place ids present in the training data;
# used as the sampling pools for negative pairs.
train_user, train_item = list(train.userid.unique()), list(train.placeid.unique())

In [106]:
# Every distinct (user, place) pair observed in the training check-ins.
positive_pair = set(map(tuple, train[["userid", "placeid"]].values))

In [107]:
# Draw as many random (user, place) candidates as there are positive pairs.
# A locally seeded generator makes the sampled negatives reproducible across
# kernel restarts without touching NumPy's global random state.
neg_rng = np.random.RandomState(0)
n_candidates = len(positive_pair)
rd_user = neg_rng.choice(train_user, size=n_candidates)
rd_item = neg_rng.choice(train_item, size=n_candidates)
# Duplicated draws collapse in the set, so it may hold fewer than
# n_candidates pairs.
negative_pair = set(zip(rd_user, rd_item))

In [108]:
# Drop sampled candidates that are actually observed (positive) pairs.
negative_pair = negative_pair - positive_pair

In [109]:
# Labelled training table: columns 0 and 1 hold the (user, place) pair and
# "label" is 1 for observed pairs, 0 for sampled negatives.
positive_df = pd.DataFrame(list(positive_pair)).assign(label=1)
negative_df = pd.DataFrame(list(negative_pair)).assign(label=0)
train_df = pd.concat([positive_df, negative_df])

In [110]:
len(train_df)

36538

## Create testing set 

In [111]:
# user id -> list of distinct places that user visited in the training data.
train_dict = {
    u: list(df.placeid.unique())
    for u, df in train.groupby("userid")
}

In [112]:
def check_neighbor_adopt(item,user,train_dict,graph):
    """Tell whether any neighbour of ``user`` has ``item`` in its history.

    Args:
        item: place id to look for.
        user: node whose neighbours are inspected.
        train_dict: mapping of user id to the collection of places visited.
        graph: object exposing ``neighbors(node)``.

    Returns:
        True if at least one neighbour's entry in ``train_dict`` contains
        ``item``, otherwise False. Neighbours without an entry in
        ``train_dict`` are treated as having visited nothing instead of
        raising ``KeyError``.
    """
    # Stop at the first matching friend rather than materialising the union
    # of all friends' places.
    return any(item in train_dict.get(friend, ()) for friend in graph.neighbors(user))

In [147]:
# For each test place, keep the users who (a) had not visited it in training
# and (b) have at least one friend who did.
# NOTE(review): this cell needs `g` (the remapped friendship graph), which is
# only built in the "Graph" section further down the notebook.
test_social = dict()
for item, df in test.groupby("placeid"):
    # .tolist() yields built-in ints (as done for test_ans), so json.dump
    # writes the ids as numbers instead of falling back to strings.
    test_users = set(df.userid.values.tolist())
    kept_users = {
        u for u in test_users
        if item not in train_dict[u]
        and check_neighbor_adopt(item, u, train_dict, g)
    }
    if kept_users:
        test_social[item] = list(kept_users)

In [135]:
# place id -> distinct users who checked in there in the test split.
test_ans = {
    item: list(set(df.userid.values.tolist()))
    for item, df in test.groupby("placeid")
}

In [140]:
len(test_social)

214

In [134]:
print(np.mean([len(i) for i in test_ans.values()]))
print(np.mean([len(i) for i in test_social.values()]))

7.9398505451427175
7.303738317757009


## Create user features

In [148]:
from collections import defaultdict

In [149]:
# One group per (user, week) combination of the training check-ins.
gp = train.groupby(["userid","week"])

In [150]:
# d: user id -> list of weekly sessions, each session being the list of
# place ids of one (user, week) group, in groupby iteration order.
d = defaultdict(list)
i = 0
for i, (_, df) in enumerate(gp, start=1):
    d[df.userid.values[0]].append(df.placeid.values.tolist())
    # Progress marker: one dot per 10000 processed groups.
    if i % 10000 == 0:
        print(".",end="")

.........

In [151]:
# Sizes of the index spaces, taken as largest id + 1 (ids in `train` are the
# 0-based indices assigned by get_id / normalize_id).
num_item = train.placeid.max()+1
num_user = train.userid.max()+1

In [152]:
# user_feature = np.zeros(num_user).tolist()
# for u,week_sess in d.items():
#     user_feature[u] = week_sess[-1]

In [153]:
# Dense (num_item, num_user) matrix: entry [place, user] is the number of
# training check-ins of that user at that place.
item_embedding = np.zeros((num_item, num_user))
pair_counts = train.groupby(["placeid", "userid"]).size()
place_idx = pair_counts.index.get_level_values("placeid").values
user_idx = pair_counts.index.get_level_values("userid").values
item_embedding[place_idx, user_idx] = pair_counts.values

In [154]:
def _week_to_vector(items, size):
    """Count occurrences of each place id in ``items`` as a length-``size`` list of floats."""
    counts = np.zeros(size)
    np.add.at(counts, np.asarray(items, dtype=np.intp), 1)
    return counts.tolist()


# user_feature[u] is the list of weekly count vectors for user u; users with
# no entry in `d` keep the 0.0 placeholder from the initial list.
user_feature = np.zeros(num_user).tolist()
for u, week_sess in d.items():
    user_feature[u] = [_week_to_vector(each_week, num_item) for each_week in week_sess]

## Graph

In [114]:
len(user_map)

3131

In [123]:
# Keep only friendships whose two endpoints are both keys of user_map.
# Note that this overwrites `edges` as loaded from the friendship CSV.
edges = edges[(edges.userid1.isin(user_map.keys())) & (edges.userid2.isin(user_map.keys()))]

In [124]:
# Translate raw user ids to their indices. `assign` builds a new frame
# instead of writing into the filtered slice (avoids SettingWithCopyWarning).
# An id missing from user_map becomes NaN under `map`, and the int64 cast
# then raises, so an unmapped id still fails loudly.
# NOTE: `edges` is overwritten here, so this cell must run exactly once after
# the friendship CSV has been (re)loaded and filtered.
edges = edges.assign(
    userid1=edges["userid1"].map(user_map).astype("int64"),
    userid2=edges["userid2"].map(user_map).astype("int64"),
)

In [125]:
# Undirected friendship graph over normalised user indices (filled below).
g = nx.Graph()

In [126]:
# Nodes are created implicitly from edge endpoints; with the explicit
# add_nodes_from below commented out, a user without any retained friendship
# would not appear in `g`.
g.add_edges_from(edges.values)
# g.add_nodes_from(user_map.values())

In [127]:
g.number_of_nodes()

3131

In [128]:
len(list(nx.connected_components(g)))

1

In [77]:
# Without `nodelist`, adjacency_matrix orders rows/columns by g.nodes(), i.e.
# the order in which nodes were first inserted via the edge list. Sorting the
# nodes makes row/column k correspond to the k-th smallest user index (and to
# user index k itself whenever every index 0..N-1 is present in `g`).
node_order = sorted(g.nodes())
adj = nx.adjacency_matrix(g, nodelist=node_order).todense()

In [78]:
# Edge list of `g` as an array, one row per edge reported by g.edges().
adj_list = np.array(list(g.edges()))

## Save data 

In [171]:
# np.save("./data/adj",adj)
# `user_feature` is a nested list whose per-user length (number of weekly
# sessions) varies. NumPy >= 1.24 raises ValueError when asked to turn such a
# ragged sequence into an array implicitly, so build the 1-D object array
# explicitly: one element per user, each holding that user's list.
# The resulting .npy file must be read with np.load(..., allow_pickle=True).
feature_array = np.empty(len(user_feature), dtype=object)
for idx, history in enumerate(user_feature):
    feature_array[idx] = history
np.save("./data/feature", feature_array)
np.save("./data/item_embedding",item_embedding)

In [172]:
# Persist the labelled training pairs as a plain array (pair columns, then label).
np.save("./data/train",train_df.values)

In [173]:
# Persist the friendship edge list built from `g`.
np.save("./data/adj_list",adj_list)

In [144]:
# `json` is already imported in the first cell; this re-import is harmless.
import json
# Write the place -> test users mapping; JSON stores the dict keys as strings.
with open('./data/test_ans.json', 'w') as fp:
    json.dump(test_ans, fp)

In [150]:
# Write the place -> socially-influenced test users mapping.
# `default=str` makes json serialise any value it cannot encode natively
# (e.g. a numpy integer) as its string form rather than raising TypeError.
with open('./data/test_social.json', 'w') as fp:
    json.dump(test_social, fp,default=str)

In [51]:
# String-keyed, string-valued copy of item_inv (index -> raw place id) so it
# can be dumped as JSON.
d2 = {str(i):str(v) for i,v in item_inv.items()}

In [52]:
# NOTE(review): unlike the other outputs, this file is written to the current
# working directory rather than ./data/ -- confirm that is intended.
with open('item_inv.json', 'w') as fp:
    json.dump(d2, fp)

In [41]:
# for i,v in zip(["user_map", "user_inv","item_map","item_inv"],[user_map, user_inv, item_map, item_inv ]):
#     with open('%s.json'%i, 'w') as fp:
#         json.dump({str(q):w for q,w in v.items()}, fp)

In [151]:
sum([len(i) for i in test_social.values()])

1563