In [None]:
# imports
import re
import time
import pickle
import logging
import gc
import os
import math
import functools
import requests

import pandas as pd
import numpy as np
import math as m
import matplotlib.pyplot as plt

from scipy import stats

from six.moves import xrange 
from pathlib import Path

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


# Notebook-wide logger that writes timestamped messages to the default stream.
log = logging.getLogger('log')
log.setLevel(logging.DEBUG)

# logging.getLogger returns the same object for the same name, so re-running this cell
# used to stack one more StreamHandler each time and every message was printed repeatedly.
# Attach the handler only when the logger does not have one yet.
if not log.handlers:
    lhnd = logging.StreamHandler()
    lhnd.setLevel(logging.DEBUG)
    formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
    lhnd.setFormatter(formatter)

    log.addHandler(lhnd)

# NOTE(review): `%autonotify` does not appear to be a built-in IPython magic; presumably it is
# provided by an extension that must be loaded first (no `%load_ext` is visible here) — confirm.
%autonotify -a 30

In [None]:
# Selects the pipeline: cells guarded by `mode == 0` load the session data, cells guarded by
# `mode == 1` load the text8 word corpus instead.
mode = 0

In [None]:
# When True, load_or_dump ignores an existing pickle and recomputes (then overwrites) it.
ignore_dumps = False

def lmap(f, arr):
    """Apply ``f`` to every element of ``arr`` and return the results as a list."""
    return [f(item) for item in arr]

def lfilter(f, arr):
    """Return a list of the elements of ``arr`` accepted by ``f`` (same rules as ``filter``)."""
    kept = []
    kept.extend(filter(f, arr))
    return kept

def foreach(it, f):
    """Call ``f`` on every element of ``it`` for its side effects; returns nothing."""
    for _ in map(f, it):
        pass
        
def dump(data, name):
    """Pickle ``data`` to the file ``'data/' + name``.

    Missing parent directories are created first, so a fresh checkout (no ``data/``
    directory) or a name with a sub-directory prefix no longer fails with
    FileNotFoundError.
    """
    path = 'data/' + name
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    with open(path, 'wb') as f:
        pickle.dump(data, f, pickle.HIGHEST_PROTOCOL)
        
def load(name):
    """Unpickle and return the object stored in the file ``'data/' + name``."""
    # NOTE(review): pickle.load executes arbitrary code from the file; only use on trusted dumps.
    handle = open('data/' + name, 'rb')
    try:
        return pickle.load(handle)
    finally:
        handle.close()
    
def load_or_dump(path, func):
    """Return the object cached under ``data/<path>``, computing and caching it if needed.

    ``func`` is called (and its result dumped) when the cache file does not exist or when
    the module-level ``ignore_dumps`` flag is set; otherwise the cached pickle is loaded.
    """
    cached = Path('data/' + path).exists()

    if cached and not ignore_dumps:
        return load(path)

    fresh = func()
    dump(fresh, path)
    return fresh


In [None]:
from time import sleep

# The API access token is read from the first line of a local file rather than hardcoded.
with open('auth/token') as f:
    token = f.readline().strip()

def get_info(ids):
    """Look up display information for groups via the VK ``groups.getById`` method.

    Args:
        ids: iterable of group ids (anything ``str()`` can render).

    Returns:
        A list of ``(name, screen_name, members)`` tuples in the order returned by the
        API, where ``members`` is the member count formatted with thousands separators,
        or ``-1`` when the API did not return a count. An empty ``ids`` yields ``[]``
        without touching the network.

    Raises:
        KeyError: when the API reply has no ``'response'`` entry (the reply is printed
            first so the API error is visible).
    """
    ids = list(ids)

    # The original reduce() had no initial value and crashed on an empty id list.
    if not ids:
        return []

    # Crude rate limiting between consecutive API calls.
    sleep(0.2)
    mc = 'members_count'
    payload = {'v': '5.92', 'access_token': token, 'fields': mc}

    str_ids = ','.join(str(x) for x in ids)

    print(str_ids)

    payload['group_ids'] = str_ids

    r = requests.get('https://api.vk.com/method/groups.getById',
                     params=payload)

    # Parse the body once instead of re-decoding it for every access.
    body = r.json()

    if 'response' not in body:
        print(body)
        raise KeyError('response')

    return [
        (x['name'], x['screen_name'], "{:,}".format(x[mc]) if mc in x else -1)
        for x in body['response']
    ]

In [None]:
# Hard-coded denominator for the progress percentage that raw_data_filter logs —
# presumably the number of lines in the sessions file; confirm.
total = 1015925

def raw_data_filter(file):
    """Convert each line of ``file`` into a list of integer events.

    A line is split on ``,``; for every piece only the text after its last ``;`` is kept
    and parsed as an int. Pieces that are not valid integers are skipped. Every 100000
    lines the garbage collector is run and the progress (relative to the module-level
    ``total``) is logged.

    Returns:
        A list with one list of ints per input line.
    """
    sessions = []

    for line_no, line in enumerate(file, start=1):
        events = []

        for piece in line.rstrip().split(','):
            tail = re.sub('.*;', '', piece)

            try:
                events.append(int(tail))
            except ValueError:
                # Best effort: non-numeric pieces are ignored.
                continue

        sessions.append(events)

        if line_no % 100000 == 0:
            gc.collect()

            log.debug("%d %% of mapping is done.", line_no / total * 100)

    return sessions

In [None]:
if (mode == 0):
    def _read_raw_sessions():
        """Parse the raw sessions file, closing it afterwards (the old lambda leaked the handle)."""
        with open("data/public_sessions_2.txt", "r") as sessions_file:
            return raw_data_filter(sessions_file)

    # The file is only opened when load_or_dump actually has to recompute the cache.
    raw_data = load_or_dump('raw', _read_raw_sessions)

    log.info("Data loaded")


In [None]:
def group_count(data):
    """Count how often each item occurs in the first element of every entry of ``data``.

    Returns:
        A dict mapping item -> number of occurrences, in first-seen order.
    """
    counts = {}

    for entry in data:
        for item in entry[0]:
            counts[item] = counts.get(item, 0) + 1

    return counts

In [None]:
def load_words_data(file):
    """Return all whitespace-separated words from the lines of ``file`` as one flat list."""
    collected = []

    for row in file:
        collected.extend(row.split())

    return collected

In [None]:
if (mode == 1):
    # Size of the vocabulary kept; every other word is replaced by the '-1' placeholder.
    words_size = 50000

    # Force load_or_dump to rebuild the cache (the flag is global and stays set afterwards).
    ignore_dumps = True

    def _read_words():
        """Read the word corpus, closing the file afterwards (the old lambda leaked the handle)."""
        with open("data/text8.txt", "r") as words_file:
            return load_words_data(words_file)

    data = load_or_dump('raw_txt', _read_words)
    groups = group_count([[data]])

    # Keep the `words_size` most frequent words.
    dictlist = list(groups.items())
    dictlist.sort(key = lambda x: x[1])
    allowed = set(lmap(lambda x: x[0], dictlist[-words_size:]))

    for position, word in enumerate(data):
        if word not in allowed:
            data[position] = '-1'

    # Recount after the replacement so the placeholder gets its own entry.
    groups = group_count([[data]])

    # Same shape as the session data: one "session" of all words with no unsubscriptions.
    data = [[data, []]]

    print(len(groups))
    

In [None]:
# Bounds on the number of subscribed groups per session: initiail_mapping enforces both,
# set_map only the minimum, and generate_batch asserts that the minimum is above 1.
min_session_size = 2
max_session_size = 20

def initiail_mapping(lst, min_allowed):
    """Split every session of signed events into subscribed and unsubscribed group sets.

    A negative event ``-g`` marks group ``g`` as unsubscribed, any other event as
    subscribed. A group that shows up on both sides within one session is treated as
    malformed and removed from both sets. Sessions are kept only when the number of
    subscribed groups lies within ``[min_session_size, max_session_size]``.

    Args:
        lst: iterable of sessions, each an iterable of ints.
        min_allowed: not used by this function.

    Returns:
        ``(result, groups)``: the list of ``(sub, unsub)`` set pairs for the kept
        sessions and the set of all groups appearing in them.
    """
    result = []
    groups = set()

    for session in lst:
        sub, unsub, malformed = set(), set(), set()

        for event in session:
            if event < 0:
                group, own_side, other_side = -event, unsub, sub
            else:
                group, own_side, other_side = event, sub, unsub

            if group in other_side or group in malformed:
                # Contradictory signals: drop the group from both sides for this session.
                sub.discard(group)
                unsub.discard(group)
                malformed.add(group)
            else:
                own_side.add(group)

        if not (min_session_size <= len(sub) <= max_session_size):
            continue

        # Added one by one (not via set.update) so the insertion history of `groups`,
        # and with it its iteration order, matches element-wise insertion.
        for group in sub:
            groups.add(group)
        for group in unsub:
            groups.add(group)

        result.append((sub, unsub))

    return result, groups
    

def set_map(lst, cnt, min_allowed):
    """Drop groups whose count is not above ``min_allowed`` and sessions that become too small.

    Args:
        lst: sessions as ``(sub, unsub)`` pairs.
        cnt: mapping group -> count; must contain every subscribed group, while
            unsubscribed groups missing from it are treated as having count -1.
        min_allowed: a group is kept only when its count is strictly greater than this.

    Returns:
        ``(result, groups)``: the filtered ``(sub, unsub)`` pairs that still have at
        least ``min_session_size`` subscribed groups, and the set of groups they contain.
    """
    result = []
    groups = set()

    for session in lst:
        sub = {event for event in session[0] if cnt[event] > min_allowed}
        unsub = {event for event in session[1] if cnt.get(event, -1) > min_allowed}

        if len(sub) < min_session_size:
            continue

        # Element-wise adds keep the insertion history of `groups` unchanged.
        for event in sub:
            groups.add(event)
        for event in unsub:
            groups.add(event)

        result.append((sub, unsub))

    return result, groups

def drop_uncommon(raw_data, min_allowed = 10):
    """Iteratively remove rare groups until every remaining group is common enough.

    The raw sessions are first converted with initiail_mapping. Then set_map is applied
    repeatedly (removing a group can shrink sessions below the minimum size, which in
    turn lowers other counts) until the smallest subscription count reaches
    ``min_allowed`` or no data is left.

    Args:
        raw_data: list of sessions of signed integer events.
        min_allowed: filtering stops once the rarest group count is at least this value.

    Returns:
        ``(data, groups)``: the surviving ``(sub, unsub)`` sessions and the set of groups.
    """
    data, groups = initiail_mapping(raw_data, min_allowed)
    cnt = group_count(data)

    # `cnt` becomes empty when no session survives; the original indexed sorted_cnt[0]
    # unconditionally and raised IndexError in that case.
    while cnt and min(cnt.values()) < min_allowed:
        data, groups = set_map(data, cnt, min_allowed)

        cnt = group_count(data)

        log.info("Length of data:   %d", len(data))
        # Sum the subscribed-set sizes; the original summed len() of each (sub, unsub)
        # tuple, which is always 2.
        log.info("Total length:     %d", sum(len(session[0]) for session in data))
        log.info("Number of groups: %d", len(groups))
        log.info("Minimum count:    %d\n", min(cnt.values()) if cnt else 0)

    return data, groups

In [None]:
if (mode == 0):
    # Forces load_or_dump to recompute and overwrite its cache from this point on
    # (the flag is global, so it also affects any later load_or_dump call).
    ignore_dumps = True
    # drop_uncommon returns (data, groups); both are cached together under 'final_data'.
    data, groups = load_or_dump('final_data', lambda: drop_uncommon(raw_data, 50))

    # (group, count) pairs sorted by descending count.
    most_common = sorted(group_count(data).items(), key=lambda x: x[1], reverse=True)

In [None]:
# Vocabulary lookups built from one enumeration of `groups`:
# i2w maps index -> group, w2i is its inverse (group -> index).
i2w = dict(enumerate(groups))
w2i = {group: index for index, group in i2w.items()}

In [None]:
# Sanity check: show the group that was assigned index 0.
print(i2w[0])

In [None]:
# Drop the reference to the raw sessions, then run a collection pass.
raw_data = None

gc.collect()

In [None]:
# Cursor into `data`: index of the current session and of the current event inside it.
# generate_batch advances both, so consecutive calls walk through the whole data set.
session_dex = 0
event_dex = 0

def generate_batch(batch_size, negative_size, window_size = 1):
    """Produce the next training batch of (target, contexts, negatives) index arrays.

    For each sample the target is the event under the cursor and the contexts are the
    next ``window_size`` events of the same session (wrapping around its end). Negatives
    are taken from the session's unsubscribed groups first and padded with uniformly
    random vocabulary indices.

    Args:
        batch_size: number of samples.
        negative_size: number of negative indices per sample.
        window_size: number of context indices per sample.

    Returns:
        ``(batch, labels, negative)`` int32 arrays of shapes ``(batch_size,)``,
        ``(batch_size, window_size)`` and ``(batch_size, negative_size)``.
    """
    assert min_session_size > 1

    global session_dex
    global event_dex

    labels = np.ndarray(shape=(batch_size, window_size), dtype=np.int32)
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    negative = np.ndarray(shape=(batch_size, negative_size), dtype=np.int32)

    # The cursors are module-level state and may be stale (e.g. `data` was rebuilt or only
    # one of them was reset by hand); clamp them instead of indexing out of range.
    if (session_dex >= len(data)):
        session_dex = 0
        event_dex = 0

    session = list(data[session_dex][0])

    if (event_dex >= len(session)):
        event_dex = 0

    for i in range(0, batch_size):
        batch[i] = w2i[session[event_dex]]

        for j in range(1, window_size + 1):
            labels[i][j - 1] = w2i[session[(event_dex + j) % len(session)]]

            # A context equal to the target (wrap-around or a repeated element) is replaced
            # by the previous context. For the first context there is no previous one: the
            # original read labels[i][-1] here, i.e. an uninitialised slot, so the value is
            # now left as is.
            if (labels[i][j - 1] == batch[i] and j > 1):
                labels[i][j - 1] = labels[i][j - 2]

        neg = 0

        # Prefer the session's own unsubscribed groups as negatives. Checking the limit
        # before writing also makes negative_size == 0 safe.
        for j in data[session_dex][1]:
            if (neg == negative_size):
                break
            negative[i][neg] = w2i[j]
            neg += 1

        # Fill the remaining slots with random vocabulary indices.
        rand_neg = np.random.randint(len(groups), size=negative_size - neg)

        for j in range(0, negative_size - neg):
            negative[i][neg + j] = rand_neg[j]

        event_dex += 1

        if (event_dex == len(session)):
            event_dex = 0
            session_dex = session_dex + 1
            if (session_dex >= len(data)):
                session_dex = 0
            session = list(data[session_dex][0])

    return batch, labels, negative


In [None]:
# Restart from the very first event. generate_batch resumes from BOTH cursors, so
# resetting only session_dex could leave event_dex pointing past the end of session 0.
session_dex = 0
event_dex = 0

print(data[session_dex])
# With a single session there is no "next" session to show.
if (len(data) > session_dex + 1):
    print(data[session_dex + 1])

batch, labels, negative = generate_batch(16, 2, 2)

# Show at most the first 10 samples of the batch in terms of the original group ids.
for i in range(min(10, len(batch))):
    print(i2w[batch[i]], '->', lmap(lambda x: i2w[x], labels[i]), '-> (negative)', lmap(lambda x: i2w[x], negative[i]))

print(negative)

In [None]:
#raw_data = None
# Probe ids used by test(); only those present in the vocabulary (w2i) are kept.
test_ids=lfilter(lambda x: x in w2i, [129440544, 28261334, 92876084, 51016572, 91933860])

if (mode == 1):
    # In word mode the probes are words instead of numeric ids.
    test_ids = ['term', 'first', 'used', 'early', 'against', 'working']

# Step size of the SGD optimizer created in train_skipgram.
learning_rate = 0.1
# Number of embedding rows of the models built below.
vocab_size = len(groups)

# Arguments that train_skipgram passes to generate_batch, plus the embedding dimension.
window_size = 4
embedding_size = 32
negative_size = 10
batch_size = 1

In [None]:
def get_closest(emb, index, f = None):
    """Print the (up to) 10 rows of ``emb`` with the largest dot product to row ``index``.

    The original implementation called ``tf.constant`` / ``tf.matmul``, but ``tf`` is not
    imported anywhere in this notebook, so it could only fail with NameError. The same
    dot products are computed with numpy here.

    Args:
        emb: embedding matrix; assumed to be array-like of shape (vocab, embedding) that
            ``np.asarray`` can convert — TODO confirm against the caller.
        index: row whose neighbours are listed.
        f: optional writable text file; every printed line is also written to it.
    """
    emb = np.asarray(emb)

    # Dot product of the query row with every row of the matrix.
    d = emb @ emb[index]

    dxs = np.argsort(d)

    # Slicing (instead of range(len - 10, len)) avoids wrapped negative indices when the
    # matrix has fewer than 10 rows. Order is ascending by score, as before.
    top = dxs[-10:]

    ids = [i2w[k] for k in top]
    res = [d[k] for k in top]

    if (mode == 0):
        info = get_info(ids)
    else:
        info = ids

    for k in range(len(res)):
        print(ids[k], ' ', res[k], ' ', info[k])

        if (f is not None):
            f.write(str(ids[k]) + ' ' + str(res[k]) + ' ' + str(info[k]) + '\n')
            
def test(model):
    """Print the 10 highest-scoring vocabulary entries for every probe in ``test_ids``.

    Scores come from ``model.score``. In ``mode == 0`` the winners are additionally
    resolved through get_info; otherwise the entries themselves are printed, mirroring
    get_closest (the original always called get_info, even for word probes).

    Args:
        model: object exposing ``score(focus, context)`` that returns a one-element tensor.
    """
    for test_id in test_ids:
        fst = torch.LongTensor([w2i[test_id]])
        scored = []

        # Evaluation only: no autograd graph is needed, and plain floats keep the
        # vocab-sized list cheap to sort.
        with torch.no_grad():
            for j in range(vocab_size):
                snd = torch.LongTensor([j])
                scored.append([model.score(fst, snd).item(), i2w[j]])

        scored.sort(key = lambda x: -x[0])

        res = scored[:10]
        ids = [entry[1] for entry in res]

        if (mode == 0):
            info = get_info(ids)
        else:
            info = ids

        print(test_id)
        # zip copes with fewer than 10 results; the original indexed 0..9 unconditionally
        # and reused the outer loop variable `i` for this inner loop.
        for entry, entry_info in zip(res, info):
            print(entry, ' ', entry_info)


In [None]:
# Number of optimisation steps performed by train_skipgram.
n_iterations = 1350000

def loss_sampled(scores):
    """Return the negated sum of ``scores`` (a non-empty sequence of tensors)."""
    accumulated = scores[0]

    for term in scores[1:]:
        accumulated = accumulated + term

    negate = Variable(torch.Tensor([-1]))
    return accumulated * negate

class SkipGram(nn.Module):
    """Skip-gram model with separate input (focus) and output (context) embedding tables."""

    def __init__(self, vocab_size, embd_size):
        super().__init__()
        self.in_embeddings = nn.Embedding(vocab_size, embd_size)
        self.out_embeddings = nn.Embedding(vocab_size, embd_size)

    def forward(self, focus, context):
        """Return the (1, 1) dot product of the focus input embedding and the context output embedding."""
        focus_row = self.in_embeddings(focus).view((1, -1))
        ctx_column = self.out_embeddings(context).view((1, -1)).t()

        return torch.mm(focus_row, ctx_column)

    def score(self, focus, context):
        """Return the cosine similarity of the two items, both looked up in the input table."""
        focus_row, ctx_row = (
            self.in_embeddings(item).view((1, -1)) for item in (focus, context)
        )

        return F.cosine_similarity(focus_row, ctx_row)
    
# Global model instance; train_skipgram() reads this name directly instead of taking it
# as an argument.
model = SkipGram(vocab_size, embedding_size)    
# Prefix that train_skipgram prepends to the step number when dumping checkpoints.
pref = "/m1/"
    
def train_skipgram():
    """Train the global ``model`` with negative-sampling skip-gram for ``n_iterations`` steps.

    Each step draws a batch from generate_batch and, per sample, maximises
    ``log sigmoid(score)`` for every context and ``log sigmoid(-score)`` for every
    negative. Every 2000 steps the running average loss is logged and recorded; every
    10000 steps the model is dumped under ``pref + str(step)`` and test() is run.

    Returns:
        ``(model, losses)``: the trained global model and the list of recorded averages.
    """
    losses = []
    loss_fn = loss_sampled

    print(model)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)

    total_loss = .0

    for i in range(n_iterations):
        target, contexts, negative = generate_batch(batch_size, negative_size, window_size)

        model.zero_grad()

        it_losses = []

        for j in range(len(target)):
            scores = []

            in_w_var = torch.LongTensor([target[j]])

            # F.logsigmoid is the numerically stable form of log(sigmoid(x)); it replaces
            # the manual "add 1e-7 when sigmoid underflows to 0" patch.
            for ctx in contexts[j]:
                out_w_var = torch.LongTensor([ctx])
                scores.append(F.logsigmoid(model(in_w_var, out_w_var)))

            for neg in negative[j]:
                out_w_var = torch.LongTensor([neg])
                scores.append(F.logsigmoid(-model(in_w_var, out_w_var)))

            it_losses.append(loss_fn(scores))

        # Average over the whole batch. The original optimised only it_losses[0], which
        # silently ignored every other sample as soon as batch_size > 1 (for
        # batch_size == 1 the value is identical).
        loss = torch.mean(torch.stack(it_losses))

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        if (i % 2000 == 0):
            # At step 0 the "average" is just the first loss.
            if i > 0:
                total_loss /= 2000

            if (i % 10000 == 0):
                dump(model, pref + str(i))
                test(model)

            log.debug('Average loss at step %d: %.4f', i, total_loss)
            # Record the average BEFORE resetting the accumulator; the original appended
            # after the reset, so the returned history contained only zeros.
            losses.append(total_loss)
            total_loss = 0

    return model, losses

In [None]:
# train_skipgram returns the global `model` it trained together with its list of logged losses.
sg_model, sg_losses = train_skipgram()

In [None]:
# Print the nearest neighbours of the probe ids for the model trained above.
test(model)

In [None]:
lst = []
# Keep the vocabulary filter used when test_ids was first defined: this assignment rebinds
# the global that test() iterates over, and an id missing from w2i would make the later
# test(model) call fail with KeyError. Ids outside the vocabulary cannot occur in `data`,
# so the collected sessions are unaffected.
test_ids=lfilter(lambda x: x in w2i, [129440544, 28261334, 92876084, 51016572, 91933860])

# For every probe id, collect each session subscribed to it, with the probe moved to the end.
for session in data:
    for probe in test_ids:
        if probe in session[0]:
            reordered = list(session[0])
            reordered.remove(probe)
            reordered.append(probe)
            lst.append(reordered)

In [None]:
# Number of collected sessions, then the size of the first one (fails if `lst` is empty).
print(len(lst))
print(len(lst[0]))

In [None]:
# NOTE(review): import placed mid-notebook; the shuffle is in place and unseeded, so the
# resulting order differs between runs.
import random
random.shuffle(lst)

In [None]:
# for i in xrange(100):
#     print(get_info(lst[i]))
#     print()
#     print("==================================================")
#     print()

In [None]:
class SkipGram2(nn.Module):
    """Skip-gram variant that uses one shared embedding table for focus and context."""

    def __init__(self, vocab_size, embd_size):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embd_size)

    def _rows(self, focus, context):
        """Look up both items in the shared table and flatten each to a (1, -1) row."""
        return (
            self.embeddings(focus).view((1, -1)),
            self.embeddings(context).view((1, -1)),
        )

    def forward(self, focus, context):
        """Return the (1, 1) dot product of the two embeddings."""
        focus_row, ctx_row = self._rows(focus, context)

        return torch.mm(focus_row, ctx_row.t())

    def score(self, focus, context):
        """Return the cosine similarity of the two embeddings."""
        focus_row, ctx_row = self._rows(focus, context)

        return F.cosine_similarity(focus_row, ctx_row)
    
# Rebinds the global `model` that train_skipgram() reads, switching to the
# single-embedding variant; the first model stays reachable through `sg_model` until
# that name is reassigned too.
model = SkipGram2(vocab_size, embedding_size)    
# Checkpoints of this run are dumped under a different prefix than the first model's.
pref = "/m2/"

In [None]:
# Second training run; overwrites sg_model / sg_losses from the first run.
sg_model, sg_losses = train_skipgram()

In [None]:
# Print the nearest neighbours of the probe ids for the second model.
test(model)