In [9]:
import dgl
import numpy as np
from numpy import genfromtxt
import pandas as pd
import dgl.function as fn
from dgl import DGLGraph
import torch.nn as nn
import torch.nn.functional as F
import torch
from torch.utils.data import DataLoader
from dgl.nn.pytorch import GraphConv
import pickle

In [10]:
cuda=True

# Generate Big Graph

In [11]:
curr_phase=6

In [12]:
click_data=genfromtxt(f'./underexpose_train/underexpose_train_click-{0}.csv', delimiter=',',dtype=np.int32)
for i in range(1,curr_phase+1):
    click_data=np.concatenate((click_data, genfromtxt(f'./underexpose_train/underexpose_train_click-{i}.csv', delimiter=',',dtype=np.int32)))
click_data=click_data[:,:2]
users=click_data[:,0]
items=click_data[:,1]

In [13]:
total_vertices=len(np.unique(users))+len(np.unique(items))

In [14]:
uid2vid={}
vid2uid={}
iid2vid={}
vid2iid={}
inc=0
for user in np.unique(users):
    uid2vid[user]=inc
    vid2uid[inc]=user
    inc+=1
for item in np.unique(items):
    iid2vid[item]=inc
    vid2iid[inc]=item
    inc+=1

In [15]:
assert((len(iid2vid)+len(uid2vid))==total_vertices)

In [16]:
g=dgl.DGLGraph()
g.add_nodes(total_vertices)

In [17]:
for i in range(len(users)):
    src=uid2vid[users[i]]
    dst=iid2vid[items[i]]
    g.add_edge(src,dst)
    g.add_edge(dst,src)

In [18]:
g=dgl.transform.add_self_loop(g)

In [19]:
 n_edges = g.number_of_edges()
# normalization
degs = g.in_degrees().float()
norm = torch.pow(degs, -0.5)
norm[torch.isinf(norm)] = 0
if cuda:
    norm = norm.cuda()
g.ndata['norm'] = norm.unsqueeze(1)

In [12]:
torch.sum(g.out_degrees(np.arange(0,g.number_of_nodes())))/g.number_of_nodes()#average degree

tensor(29)

In [13]:
sorted(g.in_degrees(np.arange(0,g.number_of_nodes())))[int(g.number_of_nodes()*0.2)] 
#what is the degree of 20 percentile of the lowest cliked items

tensor(5)

In [20]:
g

DGLGraph(num_nodes=129813, num_edges=3843087,
         ndata_schemes={'norm': Scheme(shape=(1,), dtype=torch.float32)}
         edata_schemes={})

# Get User Features

In [14]:
with open("./underexpose_train/user_generate_feat.txt","r")as f:
    lines=f.readlines()
    usr_feat=np.zeros((len(lines),5))
    for i in range(len(lines)):
        if lines[i].split(",")[2]=="0":
            usr_feat[i][3]=1
        if lines[i].split(",")[2]=="1":
            usr_feat[i][4]=1
    del lines

In [197]:
# with open("./underexpose_train/underexpose_user_feat.csv","r")as f:
#     lines=f.readlines()
#     usr_feat=np.zeros((len(lines),5))
#     for i in range(len(lines)):
#         if lines[i].split(",")[2]=="M":
#             usr_feat[i][3]=1
#         if lines[i].split(",")[2]=="F":
#             usr_feat[i][4]=1
#     del lines
# fn="./underexpose_train/underexpose_user_feat.csv"
# usr_data=genfromtxt(fn, delimiter=',',dtype=np.int16)

In [15]:
fn="./underexpose_train/user_generate_feat.txt"
usr_data=genfromtxt(fn, delimiter=',',dtype=np.int16)

In [16]:
usr_feat[:,0:2]=usr_data[:,0:2]
usr_feat[:,2]=usr_data[:,3]

In [17]:
uid2feat={}
for i in range(len(usr_feat)):
    uid2feat[int(usr_feat[i][0])]=usr_feat[i][1:]

In [18]:
del usr_data

# Get Item Features

In [202]:
item_data=genfromtxt("./underexpose_train/underexpose_item_feat_clean.csv", delimiter=',',dtype=np.float32)

In [203]:
item_ids=item_data[:,0].astype(np.int)
item_feats=item_data[:,1:].astype(np.float16)

In [204]:
iid2feat={}
for i in range(len(item_ids)):
    iid2feat[item_ids[i]]=item_feats[i]

In [205]:
del item_data


# Generate Entire Features

For each vertex (either USER or ITEM)

In [283]:
ufeatdim=uid2feat[17].shape[0]
ifeatdim=iid2feat[2000].shape[0]

In [284]:
features=np.zeros((total_vertices,ufeatdim+ifeatdim))

In [285]:
featureless_cnt=0
for i in range(total_vertices):
    if i in vid2uid and vid2uid[i] in uid2feat:
        features[i,0:ufeatdim] = uid2feat[vid2uid[i]]
        continue
    if i in vid2iid and vid2iid[i] in iid2feat:
        features[i,ufeatdim:] = iid2feat[vid2iid[i]]
        continue
    featureless_cnt+=1

In [286]:
(features/np.max(features,axis=0)).shape

(129813, 260)

In [287]:
featureless_cnt/total_vertices #~25% node are feature less ||should I do an average of neighors of neighbors

0.09240985109349603

In [288]:
features = torch.FloatTensor(features/np.max(features,axis=0)) #normalize each feature
if cuda:
    features = features.cuda()

# Create Model GCN +BCELoss

In [289]:
class GCN(nn.Module):
    def __init__(self,
                 g,
                 in_feats,
                 n_hidden,
                 n_classes,
                 n_layers,
                 activation,
                 dropout):
        super(GCN, self).__init__()
        self.g = g
        self.layers = nn.ModuleList()
        # input layer
        self.layers.append(GraphConv(in_feats, n_hidden, activation=activation))
        # hidden layers
        for i in range(n_layers - 1):
            self.layers.append(GraphConv(n_hidden, n_hidden, activation=activation))
        # output layer
        self.layers.append(GraphConv(n_hidden, n_classes))
        self.dropout = nn.Dropout(p=dropout)
        self.loss_fn = nn.BCELoss() 
        
    def forward(self, features):
        h = features
        for i, layer in enumerate(self.layers):
            if i != 0:
                h = self.dropout(h)
            h = layer(self.g, h)
        return h;
    
    def loss(self, h, pos_usrs,pos_itms,neg_itms):
        #h_i dot h_j

        pos_usr_embedding=h[pos_usrs,:]
        pos_itm_embedding=h[pos_itms,:]
        pos_pred=pos_itm_embedding*pos_usr_embedding
        pos_pred= torch.sum(pos_pred,dim=1)
        pos_pred= torch.sigmoid(pos_pred)
        correct = torch.sum(pos_pred>0.5)
        pred = pos_pred
        for i in range(len(neg_itms)):
            neg_itm_embedding=h[neg_itms[i],:]
            neg_pred = torch.mul(neg_itm_embedding,pos_usr_embedding[i])
            neg_pred= torch.sum(neg_pred,dim=1)
            neg_pred = torch.sigmoid(neg_pred)
            pred = torch.cat((pred, neg_pred),0)
            correct += torch.sum(neg_pred<=.5)
        
        print("Training Accuracy: ", end ="")
        print(float(correct/float(pos_usrs.shape[0]+neg_itms.shape[0]*neg_itms.shape[1])))
            
        label = torch.cat((torch.ones(pos_usrs.shape[0]),torch.zeros(neg_itms.shape[0]*neg_itms.shape[1])),0).cuda();
        return self.loss_fn(pred,label)

In [290]:
model=GCN(g,features.shape[1],512,256,1,F.relu,0.2)
if cuda:
    model.cuda()

In [291]:
optimizer = torch.optim.Adam(model.parameters())

In [292]:
degrees=g.in_degrees(np.arange(0,g.number_of_nodes()))
prob=np.power(degrees,0.75)/float(torch.sum(np.power(degrees,0.75)))

def sample_neg(pos_usr, neg_size):
    """
        For generating batches of data.
    """
    samples = np.random.choice(len(degrees),(len(pos_usr),neg_size),p=prob)
    for i in range(samples.shape[0]):
        for j in range(samples.shape[1]):
            while g.has_edge_between(pos_usr[i],samples[i][j]):
                samples[i][j]=np.random.choice(len(degrees), 1 ,p=prob)
    return samples

def check_accuracy(h,  pos_usrs,pos_itms,neg_itms):#check random n links
    with torch.no_grad():
        pos_usr_embedding=h[pos_usrs,:]
        pos_itm_embedding=h[pos_itms,:]
        pos_pred=pos_itm_embedding*pos_usr_embedding
        pos_pred= torch.sum(pos_pred,dim=1)
        pos_pred= torch.sigmoid(pos_pred)
        correct = torch.sum(pos_pred>0.5)
        
        for i in range(len(neg_itms)):
            neg_itm_embedding=h[neg_itms[i],:]
            neg_pred = torch.mul(neg_itm_embedding,pos_usr_embedding[i])
            neg_pred= torch.sum(neg_pred,dim=1)
            neg_pred = torch.sigmoid(neg_pred)
            correct += torch.sum(neg_pred<=.5)
        return correct/float(pos_usrs.shape[0]+neg_itms.shape[0]*neg_itms.shape[1])
       
        
    
    

In [293]:
neg_size=1
epoch=300
mini_batch=g.number_of_nodes()
for i in range(epoch):
    loss = 0
    h=model(features)
    optimizer.zero_grad()
#     for j in range(10):
    pos_idx=torch.randint(0,len(click_data),(mini_batch,));
    pos_itms=np.array(list(map(lambda x: iid2vid[click_data[x][1]],pos_idx)))
    pos_usrs=np.array(list(map(lambda x: uid2vid[click_data[x][0]],pos_idx)))
    neg_itms=sample_neg(pos_usrs,neg_size)
    loss+=model.loss(h,pos_itms,pos_usrs,neg_itms)
#     loss/=10
    print("Loss: ", end ="")
    print(float(loss))
    if i %10==0:
        pos_idx_test=torch.randint(0,len(click_data),(8000,));
        pos_itms_test=np.array(list(map(lambda x: iid2vid[click_data[x][1]],pos_idx_test)))
        pos_usrs_test=np.array(list(map(lambda x: uid2vid[click_data[x][0]],pos_idx_test)))
        neg_itms_test=sample_neg(pos_usrs_test,neg_size)
        print("Accuracy: ", end ="")
        print(float(check_accuracy(h,pos_itms_test,pos_usrs_test,neg_itms_test)))
        pickle.dump(model, open( "./model_saved", "wb" ))
    loss.backward()
    optimizer.step()
    


Training Accuracy: 0.5
Loss: 3.0833377838134766
Accuracy: 0.5
Training Accuracy: 0.5
Loss: 1.5526411533355713
Training Accuracy: 0.4998333156108856
Loss: 0.9208746552467346
Training Accuracy: 0.500166654586792
Loss: 0.7669858336448669
Training Accuracy: 0.5
Loss: 0.7807765007019043
Training Accuracy: 0.5
Loss: 0.8290231227874756
Training Accuracy: 0.5
Loss: 0.822573721408844
Training Accuracy: 0.5
Loss: 0.789300799369812
Training Accuracy: 0.5
Loss: 0.7554960250854492
Training Accuracy: 0.5
Loss: 0.7358836531639099
Training Accuracy: 0.5
Loss: 0.720391035079956
Accuracy: 0.5
Training Accuracy: 0.500166654586792
Loss: 0.7122898697853088
Training Accuracy: 0.500333309173584
Loss: 0.7012906670570374
Training Accuracy: 0.5005000233650208
Loss: 0.6975200176239014
Training Accuracy: 0.5006666779518127
Loss: 0.6923235654830933
Training Accuracy: 0.5011666417121887
Loss: 0.6871898770332336
Training Accuracy: 0.5013333559036255
Loss: 0.6831796765327454
Training Accuracy: 0.5021666884422302
Loss

Training Accuracy: 0.6588333249092102
Loss: 0.591298520565033
Training Accuracy: 0.6553333401679993
Loss: 0.5929868817329407
Training Accuracy: 0.6641666889190674
Loss: 0.591706395149231
Accuracy: 0.6594375371932983
Training Accuracy: 0.6678333282470703
Loss: 0.5916755199432373
Training Accuracy: 0.6704999804496765
Loss: 0.5852190852165222
Training Accuracy: 0.6671666502952576
Loss: 0.58699631690979
Training Accuracy: 0.6520000100135803
Loss: 0.5939818024635315
Training Accuracy: 0.659333348274231
Loss: 0.5898462533950806
Training Accuracy: 0.6598333120346069
Loss: 0.593115508556366
Training Accuracy: 0.668999969959259
Loss: 0.5865436792373657
Training Accuracy: 0.6696666479110718
Loss: 0.5834317803382874
Training Accuracy: 0.6623333096504211
Loss: 0.5806114077568054
Training Accuracy: 0.6679999828338623
Loss: 0.5841295719146729
Accuracy: 0.6598750352859497
Training Accuracy: 0.6714999675750732
Loss: 0.5823725461959839
Training Accuracy: 0.6781666874885559
Loss: 0.5802643299102783
Trai

Training Accuracy: 0.70333331823349
Loss: 0.5521110892295837
Training Accuracy: 0.6978332996368408
Loss: 0.5528357028961182
Training Accuracy: 0.699833333492279
Loss: 0.5468459725379944
Training Accuracy: 0.6996666789054871
Loss: 0.5545580387115479
Training Accuracy: 0.7081666588783264
Loss: 0.551224946975708
Training Accuracy: 0.699999988079071
Loss: 0.5461563467979431
Training Accuracy: 0.7133333086967468
Loss: 0.5389381647109985
Training Accuracy: 0.7120000123977661
Loss: 0.5446487665176392
Accuracy: 0.7069375514984131
Training Accuracy: 0.7066666483879089
Loss: 0.5504110455513
Training Accuracy: 0.702833354473114
Loss: 0.5476747751235962
Training Accuracy: 0.7133333086967468
Loss: 0.5357396006584167
Training Accuracy: 0.6990000009536743
Loss: 0.5505207777023315
Training Accuracy: 0.7023333311080933
Loss: 0.5470067262649536
Training Accuracy: 0.706166684627533
Loss: 0.5408061742782593
Training Accuracy: 0.7109999656677246
Loss: 0.5473122596740723
Training Accuracy: 0.710333347320556

# Getting Predicton

In [1]:
embedding=h

NameError: name 'h' is not defined

In [6]:
test_clicks=genfromtxt("/home/cocoa/247Proj/underexpose_test/underexpose_test_qtime-0.csv", delimiter=',',dtype=np.int32)
for i in range(1,curr_phase+1):
    test_clicks=np.concatenate((test_clicks, genfromtxt(f'/home/cocoa/247Proj/underexpose_test/underexpose_test_qtime-{i}.csv', delimiter=',',dtype=np.int32)))

In [7]:
target_usr=test_clicks[:,0]

In [8]:
target_usr

array([   11,    22,    44, ..., 35415, 35426, 35437], dtype=int32)

In [298]:
h[list(iid2vid.values()),:].shape

torch.Size([98764, 256])

In [299]:
top_idx=g.out_degrees(np.arange(g.number_of_nodes())).topk(g.number_of_nodes()).indices
popular=list()
for i in top_idx:
    if i in iid2vid.values():
        popular.append(vid2iid[int(i)])
        if len(popular)==50:
            break

In [300]:
def getTopRec(usr_id):
    #get item embedding
    if usr_id in uid2vid:
        dist=torch.norm (h[uid2vid[usr_id]]-h[list(iid2vid.values()),:],dim=1,p=None)
        knn = dist.topk(50, largest=False)
        return knn.indices+min(list(iid2vid.values()))
    else:
        return torch.Tensor(popular) #这里可以联系

In [301]:
result=dict()
for i in target_usr:
    result[i]=getTopRec(i)

In [304]:
#output file

In [305]:
pickle.dump(h, open( "./embedding", "wb" ))

In [306]:
with open( "underexpose_submit-2.csv","w") as f:
    for k,v in result.items():
        f.write(str(k)+",")
        f.write(",".join(list(map(lambda x:str(int(x)),v))))
        f.write("\n")

In [307]:
result[11]

tensor([ 47964,  77562,  50833,  92883,  63136, 100131,  96051,  72854,  64510,
        104093,  44525,  55819,  66826,  86299, 120342,  44448,  92482,  94600,
         42929, 126815,  66149,  36293,  39146,  37589,  58736,  43477,  39048,
        125936, 103643,  40332,  87739,  67759,  58011, 111058,  36901,  97987,
         39031,  40034,  88942,  98864,  52018,  78061,  66971,  66864,  42322,
         39967,  48693,  38566,  62069,  52286], device='cuda:0')