In [None]:
# imports
import re
import time
import pickle
import logging
import gc
import os

import pandas as pd
import numpy as np
import math as m
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.nn.functional as fnc
import torch.optim as optim
from torch.autograd import Variable

from scipy import stats


# Notebook-wide logger: DEBUG and above, timestamped, written through a StreamHandler.
log = logging.getLogger('log')
log.setLevel(logging.DEBUG)

formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')

lhnd = logging.StreamHandler()
lhnd.setLevel(logging.DEBUG)
lhnd.setFormatter(formatter)

# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack one more handler and emit each message twice.
if not log.handlers:
    log.addHandler(lhnd)

In [None]:
def lmap(f, arr):
    """Apply ``f`` to every element of ``arr`` and return the results as a list."""
    return [f(item) for item in arr]

def lfilter(f, arr):
    """Return a list of the elements of ``arr`` for which ``f`` is truthy.

    As with the built-in ``filter``, passing ``f=None`` keeps the truthy elements.
    """
    keep = bool if f is None else f
    return [item for item in arr if keep(item)]

def foreach(it, f):
    """Call ``f`` on each element of ``it`` in order; the results are discarded."""
    for element in iter(it):
        f(element)

In [None]:
# Denominator of the progress log below.
# NOTE(review): presumably the line count of the full sessions file - confirm.
total = 947528

def raw_data_filter(file):
    """Parse raw session lines into lists of integer event ids.

    Each line is stripped of trailing whitespace and split on ','. For every
    field, the text after the last ';' (the whole field when it has no ';')
    is parsed with ``int``; fields that do not parse are skipped.

    Args:
        file: iterable of text lines, e.g. an open text file.

    Returns:
        A list with one entry per input line, each a list of ints in field order.
    """
    res = []

    for line_no, line in enumerate(file, start=1):
        session = []

        for field in line.rstrip().split(','):
            # Only the part after the last ';' is ever used.
            raw_id = field.rpartition(';')[2]
            try:
                session.append(int(raw_id))
            except ValueError:
                # Not an integer, so not an event id: skipped on purpose.
                continue

        res.append(session)

        # Periodic housekeeping and progress report for very large inputs.
        if line_no % 100000 == 0:
            gc.collect()
            log.debug("%d %% of mapping is done.", line_no / total * 100)

    return res

In [None]:
# Parse the raw sessions file; the context manager closes the handle as soon
# as parsing finishes instead of leaving it open for the life of the kernel.
with open("data/sessions_public.txt", "r") as sessions_file:
    raw_data = raw_data_filter(sessions_file)
    


In [None]:
def group_count(data):
    total = dict()

    for i in data:
        for j in i:
            if (j in total.keys()):
                total[j] = total[j] + 1
            else:
                total[j] = 1
                
    return total

In [None]:
def set_map(lst, cnt, min_allowed, min_session_size):
    """Reduce each session to the set of events still active at its end.

    Events are processed in order. A negative event ``-g`` cancels ``g``: it
    is removed from the session's active set and any later ``g`` in the same
    session is ignored. A non-negative event is kept unless it was cancelled
    earlier in the session or, when ``cnt`` is given, its count is below
    ``min_allowed``.

    Args:
        lst: iterable of sessions, each an iterable of int events.
        cnt: mapping event -> count, or None to skip the frequency filter.
        min_allowed: smallest count an event may have and still be kept.
        min_session_size: sessions with fewer kept events are dropped.

    Returns:
        (result, groups): the kept sessions as a list of sets, and the set of
        all events occurring in them.

    Raises:
        KeyError: if ``cnt`` is given and has no entry for an event being tested.
    """
    result = []
    groups = set()

    for session in lst:
        cancelled = set()
        active = set()

        for event in session:
            if event < 0:
                group = -event
                active.discard(group)
                cancelled.add(group)
            elif event not in cancelled:
                # `>=`, not `>`: a count equal to min_allowed is allowed, which
                # agrees with drop_uncommon stopping once the smallest count
                # has reached min_allowed.
                if cnt is None or cnt[event] >= min_allowed:
                    active.add(event)

        if len(active) >= min_session_size:
            groups.update(active)
            result.append(active)

    return result, groups

def drop_uncommon(raw_data, min_allowed=10, min_session_size=4):
    """Filter sessions repeatedly until every remaining event is common enough.

    The first pass (no counts yet) only resolves cancellations and drops short
    sessions. Each later pass removes events that set_map rejects for the
    current counts, which can shorten or drop sessions and lower other counts,
    so the process repeats until the smallest count is at least
    ``min_allowed`` or nothing is left.

    Args:
        raw_data: iterable of sessions, each an iterable of int events.
        min_allowed: stop once the rarest remaining event has this many occurrences.
        min_session_size: minimum number of events a session must keep.

    Returns:
        (data, groups): the filtered sessions as a list of sets and the set of
        events they contain. Both are empty if filtering removes everything.
    """
    data, groups = set_map(raw_data, None, min_allowed, min_session_size)

    while True:
        cnt = group_count(data)
        # default=0 covers the case where no session survived the filtering.
        min_count = min(cnt.values(), default=0)

        log.info("Length of data:   %d", len(data))
        log.info("Number of groups: %d", len(groups))
        log.info("Minimum count:    %d", min_count)

        if not cnt or min_count >= min_allowed:
            return data, groups

        data, groups = set_map(data, cnt, min_allowed, min_session_size)

In [None]:
# Filter only the first 100000 parsed sessions, with min_allowed=50.
data, groups = drop_uncommon(raw_data[:100000], min_allowed=50)

In [None]:
class SkipGram(nn.Module):
    """Scores a (focus, context) pair from a single shared embedding table.

    The score is the dot product of the two embeddings; ``forward`` returns
    its log-sigmoid as a 1x1 tensor.
    """

    def __init__(self, vocab_size, embd_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)

    def forward(self, focus, context):
        """Return ``logsigmoid(<emb[focus], emb[context]>)`` with shape (1, 1)."""
        # Each lookup is flattened to one row vector.
        focus_vec = self.embeddings(focus).view((1, -1))
        context_vec = self.embeddings(context).view((1, -1))
        score = focus_vec.mm(context_vec.t())
        return fnc.logsigmoid(score)


In [None]:
from random import shuffle

# Hyper-parameters read by train_skipgram.
n_epoch = 1            # number of passes over the training pairs
learning_rate = 1e-3   # step size handed to optim.SGD
embd_size = 50         # width of the embedding vectors

def get_train(session, all_groups=None):
    """Build the (focus, context, label) training triples for one session.

    Positives: every ordered pair of distinct items of ``session``, label 1.
    Negatives: one group that is not in ``session`` is drawn at random and
    paired with every item of the session in both directions, label 0. When
    every candidate group is already in the session, no negatives are added.

    The draw uses the global ``random`` state; call ``random.seed`` beforehand
    for reproducible output.

    Args:
        session: re-iterable collection of group ids that supports ``in``.
        all_groups: candidate groups for the negative sample; defaults to the
            module-level ``groups``.

    Returns:
        list of ``(focus, context, label)`` tuples, positives first.
    """
    import random  # local import keeps the function usable on its own

    if all_groups is None:
        all_groups = groups

    train = [(a, b, 1) for a in session for b in session if a != b]

    # Choose the negative group uniformly among the groups absent from the
    # session, so different sessions get different negatives.
    candidates = [g for g in all_groups if g not in session]
    if candidates:
        negative = random.choice(candidates)
        for member in session:
            train.append((negative, member, 0))
            train.append((member, negative, 0))

    return train
    
# Dense embedding index for every group id, numbered in iteration order of `groups`.
w2i = dict(zip(groups, range(len(groups))))

def train_skipgram(train):
    """Train a SkipGram model with one SGD step per labelled pair.

    Reads the module-level ``groups``, ``w2i``, ``embd_size``,
    ``learning_rate`` and ``n_epoch``.

    Args:
        train: iterable of sessions, each an iterable of
            ``(focus, context, target)`` triples with target 1 or 0.
            It is iterated once per epoch.

    Returns:
        (model, losses): the trained SkipGram and the mean loss of each epoch
        (``nan`` for an epoch that saw no pairs).
    """
    # The model returns log(sigmoid(score)), which is never positive. Fitting
    # that to 0/1 targets with MSE is minimised by a large score for BOTH
    # labels, so negatives would not be learned. Binary cross-entropy on the
    # probability sigmoid(score) = exp(log_prob) gives each label its meaning.
    loss_fn = nn.BCELoss()
    model = SkipGram(len(groups), embd_size)
    print(model)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    losses = []
    for epoch in range(n_epoch):
        total_loss = 0.0
        total_cnt = 0

        for session in train:
            for in_w, out_w, target in session:
                focus = torch.tensor([w2i[in_w]], dtype=torch.long)
                context = torch.tensor([w2i[out_w]], dtype=torch.long)
                label = torch.tensor([float(target)])

                optimizer.zero_grad()
                log_probs = model(focus, context)
                loss = loss_fn(log_probs[0].exp(), label)
                loss.backward()
                optimizer.step()

                total_loss += loss.item()
                total_cnt += 1

        # Guard against an empty training set instead of dividing by zero.
        avg_loss = total_loss / total_cnt if total_cnt else float('nan')
        losses.append(avg_loss)
        print('Epoch ', epoch, ' passed with avg loss ', avg_loss)

    return model, losses




In [None]:
# One list of training triples per filtered session.
train = lmap(get_train, data)

In [None]:
# Train once. The evaluation code refers to the model as `sg_model`, so bind
# that name; `sg_models` is kept as an alias for anything using the old name.
sg_model, sg_losses = train_skipgram(train)
sg_models = sg_model

In [None]:
# Score one probe group against every other group with the trained model.
lg = list(groups)
tg = list(data[0])[0]   # probe: whichever member of the first session comes first
print(data[0])

res = []   # TODO: nothing is collected yet; the scores are only printed

# The training cell binds the trained model to `sg_models`.
sg_model = sg_models

mse = nn.MSELoss()
zero = torch.tensor([0.0])
focus = torch.tensor([w2i[tg]], dtype=torch.long)

# Inference only: no gradients are needed.
with torch.no_grad():
    for group in lg:
        if group == tg:
            continue

        context = torch.tensor([w2i[group]], dtype=torch.long)
        log_prob = sg_model(focus, context)[0]

        print(log_prob, ' ', group)
        print(mse(log_prob, zero))


In [None]:
# One entry per training epoch, as returned by train_skipgram.
print(sg_losses)