In [None]:
# imports
import re
import time
import pickle
import logging
import gc
import os
import math
import functools
import requests
import random
import glob
import json

import pandas as pd
import numpy as np
import math as m
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn

from scipy import stats

from six.moves import xrange 
from pathlib import Path

import torch
from torch.autograd import Variable
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Notebook-wide logger: everything from DEBUG up is written through the
# stream handler configured below.
log = logging.getLogger('log')
log.setLevel(logging.DEBUG)

lhnd = logging.StreamHandler()
lhnd.setLevel(logging.DEBUG)
formatter = logging.Formatter('%(asctime)s %(levelname)s %(message)s')
lhnd.setFormatter(formatter)

# logging.getLogger returns the same logger object on every call, so
# re-running this cell used to attach one more handler each time and every
# message was then printed once per attached handler.
if not log.handlers:
    log.addHandler(lhnd)

%autonotify -a 30

In [None]:
# Pipeline selector read by later cells: 0 runs the branch that loads
# data/public_sessions_2.txt, 1 runs the branch that loads data/text8.txt.
mode = 0

In [None]:
# When True, load_or_dump recomputes its value and overwrites the pickle even
# if one already exists.
ignore_dumps = False

def lmap(f, arr):
    """Apply `f` to every element of `arr` and return the results as a list."""
    return [f(item) for item in arr]

def lfilter(f, arr):
    """Return a list of the elements of `arr` for which `f` is truthy.

    As with the builtin `filter`, passing `f=None` keeps the truthy elements.
    """
    keep = bool if f is None else f
    return [item for item in arr if keep(item)]

def foreach(it, f):
    """Call `f` on each element of `it`, in order, purely for side effects.

    The return values of `f` are discarded and the function returns None.
    """
    for element in it:
        f(element)
        
def dump(data, name):
    """Pickle `data` into the file 'data/' + `name`, overwriting it if present.

    The highest pickle protocol available to the running interpreter is used.
    """
    target = 'data/' + name

    with open(target, 'wb') as sink:
        pickle.dump(data, sink, protocol=pickle.HIGHEST_PROTOCOL)
        
def load(name):
    """Unpickle and return the object stored in the file 'data/' + `name`.

    NOTE: unpickling can execute arbitrary code, so this should only be
    pointed at files this notebook produced itself.
    """
    source = 'data/' + name

    with open(source, 'rb') as stream:
        restored = pickle.load(stream)

    return restored
    
def load_or_dump(path, func):
    """Return the value cached under 'data/' + `path`, computing it if needed.

    If the pickle exists and the module-level `ignore_dumps` flag is falsy,
    the cached object is loaded and returned. Otherwise `func()` is called,
    its result is pickled under `path` via `dump`, and that result is returned.
    """
    cached = Path('data/' + path)

    if cached.exists() and not ignore_dumps:
        return load(path)

    res = func()
    dump(res, path)

    return res


In [None]:
from time import sleep

# Read the access token from the first line of the local `auth/token` file
# (surrounding whitespace stripped); get_info sends it as `access_token`.
with open('auth/token') as f:
    token = f.readline().strip()

def get_info(ids):
    """Fetch name, screen name and member count for a batch of groups.

    Issues a single `groups.getById` request to api.vk.com for all `ids`.

    Args:
        ids: iterable of group identifiers; each is rendered with `str()`.

    Returns:
        A list of `(name, screen_name, members)` tuples, one per entry of the
        reply's 'response' list. `members` is the member count formatted with
        thousands separators, or the int -1 when the entry has no
        'members_count' key. An empty `ids` returns `[]` without any request.

    Raises:
        KeyError: when the reply has no 'response' key (the reply is printed
            first so the API error is visible).
    """
    ids = list(ids)

    if not ids:
        # The previous reduce() over an empty list raised TypeError here.
        return []

    sleep(0.2)  # fixed pause before every request (presumably rate limiting — confirm)
    mc = 'members_count'
    payload = {'v': '5.92', 'access_token': token, 'fields': mc}

    group_ids = ','.join(str(x) for x in ids)

    print(group_ids)

    payload['group_ids'] = group_ids

    r = requests.get('https://api.vk.com/method/groups.getById',
                     params=payload)

    # Decode the body once instead of re-parsing it for every access.
    reply = r.json()

    if 'response' not in reply:
        print(reply)

    return [
        (x['name'], x['screen_name'], "{:,}".format(x[mc]) if mc in x else -1)
        for x in reply['response']
    ]

def info_print(lst):
    """Print, as one list, the first field (the name) of each `get_info(lst)` entry."""
    names = [entry[0] for entry in get_info(lst)]

    print(names)

In [None]:
# Line count assumed for the input file; only used to turn the running line
# number into a percentage in the progress log below.
total = 1015925

def raw_data_filter(file):
    """Parse session lines into lists of integer events.

    Each line is split on ','. For every piece only the text after its last
    ';' is kept (the whole piece when it contains no ';') and parsed with
    `int()`; pieces that do not parse are dropped silently.

    Args:
        file: iterable of text lines (e.g. an open text file).

    Returns:
        A list with one list of ints per input line, in input order. A line
        with no parsable piece contributes an empty list.
    """
    res = list()

    for i, line in enumerate(file, start=1):
        session = list()

        for piece in line.rstrip().split(','):
            # Same value the former re.sub('.*;', '', piece) produced; the
            # second regex that computed the (unused) prefix is gone.
            raw_event = piece.rsplit(';', 1)[-1]

            try:
                session.append(int(raw_event))
            except ValueError:
                # Non-numeric piece: skip it on purpose.
                continue

        res.append(session)

        if (i % 100000 == 0):
            gc.collect()

            log.debug("%d %% of mapping is done.", i / total * 100)

    return res

In [None]:
if (mode == 0):
    def _read_raw_sessions():
        # The context manager closes the input file once parsing is done; the
        # previous inline open() left the handle open.
        with open("data/public_sessions_2.txt","r") as sessions_file:
            return raw_data_filter(sessions_file)

    # Parsed sessions are cached under data/raw by load_or_dump.
    raw_data = load_or_dump('raw', _read_raw_sessions)

    log.info("Data loaded")


In [None]:
def group_count(data):
    """Count occurrences of every item found in the first component of each entry.

    Args:
        data: iterable of indexable entries; only `entry[0]` (an iterable of
            hashable items) is read.

    Returns:
        A dict mapping each item to the number of times it occurs across all
        `entry[0]` collections.
    """
    counts = dict()

    for entry in data:
        for item in entry[0]:
            counts[item] = counts.get(item, 0) + 1

    return counts

In [None]:
def load_words_data(file):
    """Return every whitespace-separated token of `file` as one flat list.

    Args:
        file: iterable of text lines.

    Returns:
        A list of tokens in reading order.
    """
    words = []

    for line in file:
        words.extend(line.split())

    return words

In [None]:
if (mode == 1):
    words_size = 50000
    
    # Force load_or_dump to rebuild the word list instead of reusing data/raw_txt.
    ignore_dumps = True

    def _read_words():
        # The context manager closes the corpus file once it has been read;
        # the previous inline open() left the handle open.
        with open("data/text8.txt","r") as words_file:
            return load_words_data(words_file)

    data = load_or_dump('raw_txt', _read_words)
    groups = group_count([[data]])
    
    # Keep the `words_size` most frequent words; every other word is replaced
    # in place by the placeholder token '-1'.
    dictlist = list(groups.items())
    dictlist.sort(key = lambda x: x[1])
    allowed = set(lmap(lambda x: x[0], dictlist[-words_size:]))

    for i in xrange(len(data)):
        if not data[i] in allowed:
            data[i] = '-1'
            
    # Recount after the replacement so `groups` reflects the reduced vocabulary.
    groups = group_count([[data]])
    
    # Wrap the corpus as a single [items, []] session, the shape the batch
    # generators index with [0] and [1].
    data = [[data, []]]
    
    print(len(groups))
    

In [None]:
# Inclusive bounds on the number of subscribed groups a session may have.
min_session_size = 2
max_session_size = 20

def initiail_mapping(lst, min_allowed):
    """Turn raw event lists into (subscribed, unsubscribed) set pairs.

    Within a session a negative event `-g` is filed under "unsubscribed" for
    group `g`, any other event `g` under "subscribed". A group that shows up
    on both sides is removed from both and stays excluded for the rest of
    that session.

    Sessions whose subscribed set size is outside
    [min_session_size, max_session_size] are dropped.

    Args:
        lst: iterable of sessions, each an iterable of ints.
        min_allowed: accepted for signature compatibility; not used here.

    Returns:
        `(sessions, groups)`: the list of kept `(sub, unsub)` set pairs and
        the set of every group appearing in a kept pair.
    """
    sessions = []
    seen_groups = set()

    for events in lst:
        joined, left, conflicting = set(), set(), set()

        for event in events:
            is_leave = event < 0
            group = -event if is_leave else event
            # `own` is the side this event belongs to, `other` the opposite one.
            own, other = (left, joined) if is_leave else (joined, left)

            if group in other or group in conflicting:
                joined.discard(group)
                left.discard(group)
                conflicting.add(group)
            else:
                own.add(group)

        if min_session_size <= len(joined) <= max_session_size:
            seen_groups.update(joined)
            seen_groups.update(left)

            sessions.append((joined, left))

    return sessions, seen_groups
    

def set_map(lst, cnt, min_allowed):
    """Drop infrequent groups from every session and discard short sessions.

    Args:
        lst: iterable of sessions; `session[0]` holds subscribed groups and
            `session[1]` unsubscribed groups.
        cnt: dict of group -> count. Every subscribed group must be a key
            (a missing one raises KeyError); unsubscribed groups missing
            from `cnt` are treated as count -1.
        min_allowed: a group is kept only when its count is strictly greater
            than this value.

    Returns:
        `(sessions, groups)`: kept `(sub, unsub)` set pairs — those whose
        filtered subscribed set has at least `min_session_size` members —
        and the set of all groups appearing in them.
    """
    kept_sessions = []
    kept_groups = set()

    for session in lst:
        joined = {group for group in session[0] if cnt[group] > min_allowed}
        left = {group for group in session[1] if cnt.get(group, -1) > min_allowed}

        if len(joined) < min_session_size:
            continue

        kept_groups |= joined
        kept_groups |= left

        kept_sessions.append((joined, left))

    return kept_sessions, kept_groups

def drop_uncommon(raw_data, min_allowed = 10):
    """Repeatedly prune rare groups until every remaining group is common enough.

    The raw events are first converted with `initiail_mapping`. Then, while
    the rarest group (as counted by `group_count`) occurs fewer than
    `min_allowed` times, `set_map` filters the sessions again and the counts
    are recomputed; statistics are logged after each pass.

    Args:
        raw_data: iterable of raw sessions (lists of int events).
        min_allowed: count threshold that ends the pruning loop.

    Returns:
        `(data, groups)`: the surviving `(sub, unsub)` sessions and the set
        of groups they contain. Both are empty if everything was pruned.
    """
    data, groups = initiail_mapping(raw_data, min_allowed)
    cnt = group_count(data)

    # An empty `cnt` means no session survived; the previous sorted_cnt[0]
    # lookup raised IndexError in that situation.
    while cnt and min(cnt.values()) < min_allowed:
        data, groups = set_map(data, cnt, min_allowed)

        cnt = group_count(data)

        log.info("Length of data:   %d", len(data))
        # Sum the subscribed-set sizes. The earlier expression took len() of
        # each (sub, unsub) tuple, which is always 2.
        log.info("Total length:     %d",
                 sum(len(session[0]) for session in data))
        log.info("Number of groups: %d", len(groups))
        log.info("Minimum count:    %d\n", min(cnt.values()) if cnt else 0)

    return data, groups

In [None]:
if (mode == 0):
    # Allow cached pickles to be reused (the mode == 1 cell sets this flag to True).
    ignore_dumps = False
    # Sessions pruned with min_allowed=50; cached under data/final_data.
    data, groups = load_or_dump('final_data', lambda: drop_uncommon(raw_data, 50))

    # (group, count) pairs ordered from most to least frequent.
    most_common = sorted(group_count(data).items(), key=lambda x: x[1], reverse=True)

In [None]:
# Size of the vocabulary (`groups`) produced by whichever mode ran above.
print(len(groups))

In [None]:
# Dense index maps over `groups`: w2i is item -> index, i2w is index -> item.
# Both enumerate `groups` back to back without modifying it in between.
# NOTE(review): the indices follow the iteration order of `groups`; confirm
# that order matches the one in effect when any saved model was trained.
w2i = {w: i for i, w in enumerate(groups)}
i2w = {i: w for i, w in enumerate(groups)}

In [None]:
# Export the filtered sessions as text, plus the index maps as pickles.
# Everything below is skipped when data/filtered already exists.
fil_path = "data/filtered"

if not Path(fil_path).exists():
    # data/filtered: one "<session number> <index>" line per event;
    # unsubscribed groups are written with a negated index.
    # NOTE(review): the group with index 0 is written as "0" in both cases
    # (str(-0) == "0"), so its sign is lost in this file.
    with open(fil_path, "w") as out:  
        for id, session in enumerate(data):
            for sub in session[0]:
                out.write(str(id) + " " + str(w2i[sub]) + "\n")
                
            for unsub in session[1]:
                out.write(str(id) + " " + str(-w2i[unsub]) + "\n")
                
    # data/filtered_all: one line per session holding the space-separated
    # indices of its subscribed groups (each index is followed by a space).
    with open("data/filtered_all", "w") as out:  
        for id, session in enumerate(data):
            for sub in session[0]:
                out.write(str(w2i[sub]) + " ")
            
            out.write("\n")

    # Persist the maps under data/w2i and data/i2w.
    dump(w2i, "w2i")
    dump(i2w, "i2w")

In [None]:
# Show which item received index 0.
print(i2w[0])

In [None]:
# Drop the reference to the raw sessions and ask the garbage collector to
# run; later cells only use `data` and `groups`.
raw_data = None

gc.collect()

In [None]:
# Cursor shared by the batch generators: index of the current session in
# `data` and of the current event inside that session.
session_dex = 0
event_dex = 0

def generate_window_batch(batch_size, negative_size, window_size = 1):
    """Build one batch of (target, context window, negatives) index arrays.

    Walks `data` from the position stored in the module-level cursor
    (`session_dex`, `event_dex`) and advances that cursor by `batch_size`
    events, wrapping to the first session after the last one.

    Args:
        batch_size: number of target events in the batch.
        negative_size: number of negative indices per target.
        window_size: number of following events (cyclic within the session)
            used as labels for each target.

    Returns:
        `(batch, labels, negative)` int32 arrays of shapes `(batch_size,)`,
        `(batch_size, window_size)` and `(batch_size, negative_size)`, all
        holding `w2i` indices.
    """
    assert min_session_size > 1
    
    global session_dex
    global event_dex
    
    # np.ndarray allocates without initialising; every cell is meant to be
    # overwritten below.
    labels = np.ndarray(shape=(batch_size, window_size), dtype=np.int32)
    batch = np.ndarray(shape=(batch_size), dtype=np.int32)
    negative = np.ndarray(shape=(batch_size, negative_size), dtype=np.int32)
     
    current = 0  # NOTE(review): never read in this function
    session = list(data[session_dex][0])
    
    for i in range(0, batch_size):     
        batch[i] = w2i[session[event_dex]]
        
        for j in range(1, window_size + 1):
            # The window wraps around the session via the modulo.
            labels[i][j - 1] = w2i[session[(event_dex + j) % len(session)]]
            
            # If wrapping landed on the target itself, repeat the previous label.
            # NOTE(review): for j == 1 the index j - 2 is -1, i.e. the last
            # (possibly still uninitialised) column — confirm this cannot
            # happen for the sessions in `data`.
            if (labels[i][j - 1] == batch[i]):
                labels[i][j - 1] = labels[i][j - 2]
            
        neg = 0
        
        # Negatives come first from the session's unsubscribed groups ...
        for j in data[session_dex][1]:
            negative[i][neg] = w2i[j]
            neg += 1
            if (neg == negative_size):
                break
                
        # ... and the remaining slots are filled with uniformly random indices.
        rand_neg = np.random.randint(len(groups), size=negative_size - neg)
        
        for j in range(0, negative_size - neg):
            negative[i][neg + j] = rand_neg[j]
            
        event_dex += 1

        # End of session: move to the next one, wrapping at the end of `data`.
        if (event_dex == len(session)):
            event_dex = 0
            session_dex = session_dex + 1
            if (session_dex >= len(data)):
                session_dex = 0
            session = list(data[session_dex][0])        
     
    return batch, labels, negative


def generate_batch(negative_size):
    """Produce one training example from the current cursor position.

    Reads the session at the module-level `session_dex`, takes the event at
    `event_dex` as the target, and advances the cursor by one event
    (wrapping to the first session after the last one).

    Args:
        negative_size: number of negative indices to return.

    Returns:
        `(batch, labels, negative)`: the target's `w2i` index, the list of
        `w2i` indices of every subscribed group in the session (the target
        included), and an int32 array of `negative_size` indices — the
        session's unsubscribed groups first, then uniformly random indices.
    """
    assert min_session_size > 1

    global session_dex
    global event_dex

    members = list(data[session_dex][0])

    labels = [w2i[group] for group in members]
    batch = w2i[members[event_dex]]
    negative = np.ndarray(shape=(negative_size), dtype=np.int32)

    filled = 0

    for group in data[session_dex][1]:
        negative[filled] = w2i[group]
        filled += 1
        if filled == negative_size:
            break

    # Top up whatever slots are still free with random vocabulary indices.
    negative[filled:] = np.random.randint(len(groups), size=negative_size - filled)

    event_dex += 1

    if event_dex == len(members):
        event_dex = 0
        following = session_dex + 1
        session_dex = following if following < len(data) else 0

    return batch, labels, negative

In [None]:
# Manual check of generate_batch: show the current and the next session,
# draw one example and print it translated back from indices to items.
# session_dex = 0

print(data[session_dex])
print(data[session_dex + 1])

batch, labels, negative = generate_batch(10)

print(i2w[batch], '->', lmap(lambda x: i2w[x], labels), '-> (negative)', lmap(lambda x: i2w[x], negative))
    
print(negative)

In [None]:
# Probe items used by test(): the hard-coded ids that are present in the
# vocabulary.
test_ids = lfilter(lambda x: x in w2i, [129440544, 28261334, 92876084, 51016572, 91933860, 22751485])

# In mode 1 the vocabulary holds words, so a fixed list of words is used instead.
if (mode == 1):
    test_ids = ['term', 'first', 'used', 'early', 'against', 'working']

In [None]:
# Collect every session that contains one of the probe ids, with that id
# moved to the end of the list.
lst = []

for i in data:
    for j in test_ids:
        if j in i[0]:
            f = list(i[0])
            f.remove(j)
            f.append(j)
            lst.append(f)
            
random.shuffle(lst)

# Show at most 10 random sessions. Slicing copes with fewer than 10 matches,
# where the former fixed xrange(10) raised IndexError.
for sample in lst[:10]:
    sleep(1)
    print(get_info(sample))
    print()
    print("==================================================")
    print()

In [None]:
def test(model):
    """Print the ten highest-scoring vocabulary items for every probe id.

    For each entry of the module-level `test_ids`, every vocabulary index is
    scored against it with `model.score`, the ten best are looked up through
    `get_info`, and each is printed next to its info.

    Args:
        model: object exposing `score(focus, context)` on single-index
            LongTensors (e.g. a `SkipGram`).
    """
    for query in test_ids:
        # The query tensor is the same for every candidate, so build it once.
        fst = Variable(torch.LongTensor([w2i[query]]))
        scored = []

        # Scores are only ranked and printed; no autograd graph is needed.
        with torch.no_grad():
            for j in range(vocab_size):
                snd = Variable(torch.LongTensor([j]))
                scored.append([model.score(fst, snd), i2w[j]])

        scored.sort(key = lambda x: -x[0])

        res = scored[:10]
        info = get_info([entry[1] for entry in res])

        print(query)
        # zip() stops at the shorter list, so fewer than ten results no longer
        # raise IndexError, and the outer loop variable is no longer rebound
        # by an inner `for i in range(10)`.
        for neighbour, details in zip(res, info):
            print(neighbour, ' ', details)

In [None]:
# Training configuration.
learning_rate = 0.1          # step size passed to optim.SGD
vocab_size = len(groups)     # number of embedding rows

window_size = 4
embedding_size = 64          # embedding dimensionality
negative_size = 10           # negatives drawn per training example
batch_size = 1

# Name prefix for model checkpoints; dump() prepends 'data/' to it.
pref = "/etmp/"

In [None]:
# Number of training steps performed by train_skipgram.
n_iterations = 2000001

def loss_sampled(scores):
    """Return the negated sum of `scores`.

    Args:
        scores: non-empty sequence of values that support `+` with each
            other (the training loop passes log-probability tensors).

    Returns:
        The sum of all entries multiplied by a one-element tensor holding -1.
    """
    accumulated = scores[0]

    for term in scores[1:]:
        accumulated = accumulated + term

    minus_one = Variable(torch.Tensor([-1]))

    return accumulated * minus_one

class SkipGram(nn.Module):
    """Skip-gram model with separate input and output embedding tables.

    Both methods flatten each looked-up embedding to a single row, so they
    are written for one focus index and one context index per call.
    """

    def __init__(self, vocab_size, embd_size):
        """Create two `vocab_size` x `embd_size` embedding tables."""
        super().__init__()
        self.in_embeddings = nn.Embedding(vocab_size, embd_size)
        self.out_embeddings = nn.Embedding(vocab_size, embd_size)

    def forward(self, focus, context):
        """Dot product of the focus input embedding and the context output embedding.

        Returns a (1, 1) tensor.
        """
        focus_row = self.in_embeddings(focus).view(1, -1)
        context_row = self.out_embeddings(context).view(1, -1)

        return torch.mm(focus_row, context_row.t())

    def score(self, focus, context):
        """Cosine similarity between the input embeddings of `focus` and `context`.

        Unlike `forward`, both vectors come from `in_embeddings`.
        """
        focus_row = self.in_embeddings(focus).view(1, -1)
        other_row = self.in_embeddings(context).view(1, -1)

        return F.cosine_similarity(focus_row, other_row)
    
# Module-level model instance; train_skipgram optimises and returns this object.
model = SkipGram(vocab_size, embedding_size)    
    
def train_skipgram():
    """Train the module-level `model` with SGD on negative-sampling examples.

    Every step draws one example from `generate_batch`, sums
    log(sigmoid(score)) over the positive contexts and log(sigmoid(-score))
    over the negatives, negates the sum with `loss_sampled` and takes one
    optimiser step. Every 2000 steps the mean loss is logged and recorded;
    every 100000 steps the model is pickled via `dump` and inspected with
    `test`.

    Returns:
        `(model, losses)`: the trained module-level model and the list of
        logged mean losses (the first entry is the single loss of step 0).
    """
    losses = []
    loss_fn = loss_sampled

    print(model)
    optimizer = optim.SGD(model.parameters(), lr=learning_rate)
    
    total_loss = .0
    
    for i in range(n_iterations):
        target, contexts, negative = generate_batch(negative_size)
        
        model.zero_grad()

        scores = []

        in_w_var = Variable(torch.LongTensor([target]))

        for ctx in contexts:
            out_w_var = Variable(torch.LongTensor([ctx]))

            score = torch.sigmoid(model(in_w_var, out_w_var))

            # Guard against log(0) when the sigmoid underflows to exactly 0.
            if (score != 0):
                scores.append(torch.log(score))
            else:
                scores.append(torch.log(score + torch.Tensor([0.0000001])))

        for neg in negative:
            out_w_var = Variable(torch.LongTensor([neg]))

            # Negatives are scored with the sign flipped: sigmoid(-x).
            score = torch.sigmoid(model(in_w_var, out_w_var) * Variable(torch.Tensor([-1])))

            if (score != 0):
                scores.append(torch.log(score))
            else:
                scores.append(torch.log(score + torch.Tensor([0.0000001])))

        loss = loss_fn(scores)
        
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        
        if (i % 2000 == 0):
            if i > 0:
                total_loss /= 2000
                
            if (i % 100000 == 0):
                dump(model, pref + str(i))
                test(model)
            
            log.debug('Average loss at step %d: %.4f', i, total_loss)
            # Record the average BEFORE resetting the accumulator; the old
            # order appended the freshly zeroed value, so `losses` held only
            # zeros.
            losses.append(total_loss)
            total_loss = 0
      
    
    return model, losses

In [None]:
# tr_model, tr_losses = train_skipgram()

# dump(model, pref + "final")

In [None]:
# Replace the freshly initialised model with the pickle stored at data/final_model.
# NOTE(review): the commented-out training cell dumps under pref + "final",
# which is a different file name — confirm where final_model comes from.
model = load("final_model")

In [None]:
# Print the nearest neighbours of the probe ids for the loaded model.
test(model)

In [None]:
def get_all_embed(model):
    """Return the input embeddings of indices 0..len(groups)-1 as a NumPy array.

    Args:
        model: object with an `in_embeddings` embedding layer.

    Returns:
        A 2-D array whose row i is the (detached) embedding of index i.
    """
    every_index = torch.LongTensor([idx for idx in range(len(groups))])
    vectors = model.in_embeddings(Variable(every_index))

    return vectors.detach().numpy()

In [None]:
from sklearn import cluster

# Cluster all input embeddings with k-means and list the members of the
# cluster that contains group 29534144.
num_clusters = 32

all_embed = get_all_embed(model)

# NOTE(review): no random_state is passed, so cluster assignments can differ
# between runs.
km = sklearn.cluster.KMeans(n_clusters = num_clusters)

km.fit(all_embed)

# Cluster label per vocabulary index (the misspelled name is used by later cells).
predicitons = km.predict(all_embed)

target = predicitons[w2i[29534144]]

res = []

for i in range(len(groups)):
    if predicitons[i] == target:
        res.append(i2w[i])

# Only query the API when the cluster has fewer than 180 members.
if (len(res) < 180):
    print(get_info(res))

In [None]:
# Rank the whole vocabulary by cosine similarity to the centroid of the
# cluster that contains `query_group`, then show the ten closest items.
query_group = 12648877
target = predicitons[w2i[query_group]]

print (km.cluster_centers_[target])

# The centroid does not change inside the loop, so build its tensor once.
cent = Variable(torch.FloatTensor([km.cluster_centers_[target]]))

t = []

# Scores are only ranked and printed; no autograd graph is needed.
with torch.no_grad():
    for j in range(vocab_size):
        embed_ctx = model.in_embeddings(Variable(torch.LongTensor([j]))).view((1, -1))

        score = F.cosine_similarity(cent, embed_ctx)

        t.append([score, i2w[j]])

t.sort(key = lambda x: -x[0])

ids = []
res = t[:10]

for k in res:
    ids.append(k[1])

info = get_info(ids)

# The cell used to print `i`, a variable left over from an earlier cell;
# print the group that was actually queried.
print(query_group)
# zip() stops at the shorter list instead of assuming exactly ten results.
for neighbour, details in zip(res, info):
    print(neighbour, ' ', details)

In [None]:
# Import the submodule explicitly: this file only has `import sklearn` and
# `from sklearn import cluster`, so `sklearn.manifold` was available only if
# something else happened to load it first.
import sklearn.manifold

# Project the embeddings to two dimensions for plotting.
tsne = sklearn.manifold.TSNE(verbose=1)

embed_2d = tsne.fit_transform(all_embed)

In [None]:
# Reference direction used to order the clusters. Despite its name, the
# vector holds -1.0 in every component.
ones = [-1.0 for i in range(embedding_size)]

cls_index = [i for i in range(num_clusters)]

# Order cluster ids by the cosine similarity between their centroid and the
# reference vector, ascending.
cls_index.sort(key = lambda x: F.cosine_similarity(
                    torch.FloatTensor(km.cluster_centers_[x]).view((1, -1)), 
                    torch.FloatTensor(ones).view((1, -1))
              ))

fig, ax = plt.subplots(figsize = (40, 20))

cmap = plt.get_cmap("jet", num_clusters)

# Each point is coloured by the rank of its cluster in `cls_index`.
sct = ax.scatter(
    x = lmap(lambda x: x[0], embed_2d), 
    y = lmap(lambda x: x[1], embed_2d), 
    c = lmap(lambda x: cls_index.index(x), predicitons), 
    s = 70,
    cmap = cmap,
    alpha = 0.4
)

ax.set_title("Clusters are colored with gradation")

plt.show()

In [None]:
def zoom_in(x_center, y_center, limit, with_data = False):
    """Plot the 2-D embedding points inside a square window.

    Keeps the points of `embed_2d` whose x and y lie strictly within `limit`
    of (`x_center`, `y_center`) and scatter-plots them, coloured by cluster
    rank (position of the cluster in `cls_index`) renumbered to 0..k-1 for
    the k ranks present in the window.

    Args:
        x_center: x coordinate of the window centre.
        y_center: y coordinate of the window centre.
        limit: half-width of the window.
        with_data: when True, additionally print, per cluster in the window,
            its colour number followed by the names of its groups
            (fetched through `info_print`).

    Returns:
        None. Prints "No data" and returns early when the window is empty.
    """
    sc_data = zip(
        lmap(lambda x: x[0], embed_2d), 
        lmap(lambda y: y[1], embed_2d), 
        lmap(lambda c: cls_index.index(c), predicitons),
        list(range(len(embed_2d)))
    )
      
    sc_data = lfilter(lambda elem: abs(elem[0] - x_center) < limit 
                      and abs(elem[1] - y_center) < limit,
                      sc_data)

    if (len(sc_data) == 0):
        print("No data")
        
        return
    
    fig, ax = plt.subplots(figsize = (10, 5))

    classes = lmap(lambda x: x[2], sc_data)
    
    # Cluster rank -> dense colour number for the ranks present in the window.
    class_remap = {rank: pos for pos, rank in enumerate(set(classes))}
    n_classes = len(class_remap)
    
    classes = lmap(lambda x: class_remap[x], classes)
    
    cmap = plt.get_cmap("jet", n_classes)
    
    sct = ax.scatter(
        x = lmap(lambda x: x[0], sc_data), 
        y = lmap(lambda x: x[1], sc_data), 
        c = classes, 
        s = 70,
        cmap = cmap,
        alpha = 1
    )
    
    # One tick per colour number. The ticks were previously sized by
    # len(classes), which is the number of points, not of classes.
    cb = plt.colorbar(sct, spacing = "proportional", ticks = list(range(n_classes)))

    cb.set_alpha(1)
    # NOTE(review): Colorbar.draw_all appears to be deprecated/removed in
    # newer Matplotlib releases — only call it where it still exists.
    if hasattr(cb, "draw_all"):
        cb.draw_all()
    
    plt.show()
    
    if (with_data):
        for c in class_remap.keys():
            print(class_remap[c])
            
            sleep(0.5)
            
            info_print(
                lmap(lambda x: i2w[x[3]], lfilter(lambda x: x[2] == c, sc_data))
            )        

In [None]:
# Load the category dataset and keep only rows whose `id` is in the
# embedding vocabulary (the keys of w2i).
raw_df = pd.read_csv("data/categories_predict_dataset_v2.csv", index_col = False)
df = raw_df[raw_df.id.isin(w2i)]

In [None]:
# Compare the vocabulary size with the number of labelled rows that matched it.
print ("Total groups in main dataset: ", len(groups), " common groups in data sets:", len(df))

In [None]:
# Number of distinct values in the `general` and `detailed` columns.
print (len(set(df.general)))
print (len(set(df.detailed)))

In [None]:
# Select the `general` categories that have at least `min_cat_count` rows.
min_cat_count = 150

# category -> number of rows carrying it
cat_dict = dict()

cat_lst = list(df.general)

for c in cat_lst:
    if c in cat_dict:
        cat_dict[c] += 1
    else:
        cat_dict[c] = 1
        
# Categories frequent enough to be used as classification targets (read by classify).
cat_set = set()

for item in cat_dict.items():
    if (item[1] >= min_cat_count):
        cat_set.add(item[0])
        
# Show the kept categories together with their row counts.
print(lfilter(lambda x: x[0] in cat_set, cat_dict.items()))

In [None]:
from sklearn import svm
from sklearn import linear_model

from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.model_selection import train_test_split
from sklearn.metrics import *

def get_classifiers():
    """Build a fresh, unfitted set of classifiers to compare.

    Returns:
        A list, in this order: gradient boosting (100 estimators, verbose 0),
        AdaBoost (100 estimators), a linear SVC with default settings, and an
        SGD classifier (max_iter 1000, tol 0.001).
    """
    boosting = GradientBoostingClassifier(verbose = 0, n_estimators = 100)
    ada = AdaBoostClassifier(n_estimators = 100)
    linear_svc = svm.LinearSVC()
    sgd = linear_model.SGDClassifier(max_iter = 1000, tol = 0.001)

    return [boosting, ada, linear_svc, sgd]

In [None]:
def classify(all_embed, classifier):
    """Fit `classifier` to predict a group's general category from its embedding.

    Embeddings are matched to rows of the module-level `df` through `i2w`;
    only groups present in `df` whose `general` category is in `cat_set` are
    used. The examples are split with `train_test_split` (default settings),
    the classifier is fitted on the training part and scored on the rest.

    Args:
        all_embed: sequence of embedding vectors; position i belongs to `i2w[i]`.
        classifier: estimator exposing `fit` and `predict`.

    Returns:
        `[trained, f1]`: the fitted classifier and its micro-averaged F1
        score on the held-out part.
    """
    # One pass over df builds the id -> category lookup, taking the first row
    # per id as the former list(df[df.id == ...].general)[0] did. This
    # replaces a list membership test plus a full DataFrame scan for every
    # single embedding.
    general_by_id = {}

    for group_id, general in zip(df["id"], df["general"]):
        general_by_id.setdefault(group_id, general)

    cl_data = []

    for index, vector in enumerate(all_embed):
        group_id = i2w[index]

        if group_id not in general_by_id:
            continue

        general = general_by_id[group_id]

        if general in cat_set:
            cl_data.append([index, vector, general])
    
    cl_train, cl_test = train_test_split(cl_data)
    
    trained = classifier.fit(lmap(lambda x: x[1], cl_train), lmap(lambda x: x[2], cl_train))
    
    return [trained, f1_score(
        lmap(lambda x: x[2], cl_test),
        trained.predict(lmap(lambda x: x[1], cl_test)),
        average = "micro"
    )]

def classify_model(model_name, classifier):
    """Load the pickled model `model_name` and run `classify` on its input embeddings.

    Returns whatever `classify` returns: `[trained, f1]`.
    """
    loaded = load(model_name)
    embeddings = get_all_embed(loaded)

    return classify(embeddings, classifier)

def classify_all(embeds):
    """Evaluate every classifier from `get_classifiers` on `embeds`.

    Prints one F1 score per classifier, in the order `get_classifiers`
    returns them.
    """
    for candidate in get_classifiers():
        _trained, f1 = classify(embeds, candidate)

        print(f1)

In [None]:
# First JSON file matching the pattern; raises IndexError if nothing matches.
# NOTE(review): glob is called without recursive=True, so `**` matches like
# a single `*` — confirm the intended directory depth.
als_embed_file = glob.glob("scala/**/out/ALS_embeddings/*.json")[0]

In [None]:
def map_embeds(embed_dict, i2w_file):
    """Reorder externally produced embeddings into this notebook's index order.

    Every key of `embed_dict` is translated to an item through the pickled
    index -> item map `i2w_file`, and then to this notebook's index through
    the module-level `w2i`.

    Args:
        embed_dict: dict mapping an external index to its embedding vector.
        i2w_file: name (under data/) of the pickled external index -> item map.

    Returns:
        The embedding vectors as a list sorted by their `w2i` index, so that
        position i holds the vector of `i2w[i]` when every index is present.

    Raises:
        KeyError: if a key is missing from the loaded map or from `w2i`.
    """
    _i2w = load(i2w_file)

    res = [(w2i[_i2w[key]], vector) for key, vector in embed_dict.items()]

    # Sort in place. The previous bare `sorted(res, ...)` call discarded its
    # result, so the vectors came back in dict order rather than index order.
    res.sort(key = lambda x: x[0])

    return [vector for _index, vector in res]

In [None]:
# Load the ALS embeddings: the file holds one JSON object per line, each
# with an 'id' and a 'features' entry.
als_embeds = []

with open(als_embed_file) as json_file: 
    # external id -> feature vector
    raw_als_embeds = dict()
    
    embed_list = []
    
    for single in json_file.readlines():
        embed_list.append(json.loads(single))
        
    for emb in embed_list:
        raw_als_embeds[emb['id']] = emb['features']
        
    # Translate external ids to this notebook's indices via data/als_i2w.
    als_embeds = map_embeds(raw_als_embeds, "als_i2w")
    
    # Every vocabulary item must have received an embedding.
    assert len(als_embeds) == len(groups)

In [None]:
# Load the LDA embeddings: one space-separated vector of floats per line,
# keyed by the zero-based line number.
# NOTE(review): glob is called without recursive=True, so `**` matches like
# a single `*`; [0] raises IndexError if nothing matches.
lda_embed_file = glob.glob("scala/**/out/lda_embeddings")[0]

lda_embeds = []

with open(lda_embed_file) as f: 
    # line number -> vector
    raw_lda_embeds = dict()
    
    embed_list = []  # NOTE(review): not used in this cell
    
    i = 0
    
    for single in f.readlines():
        # Split on single spaces and drop the empty strings that repeated
        # spaces produce before converting to float.
        raw_lda_embeds[i] = lmap(lambda x: float(x),
            lfilter(
                lambda x: len(x), 
                re.split(" ", single.rstrip())
            )
        )
        
        i += 1
            
    # Translate line numbers to this notebook's indices via data/lda_i2w.
    lda_embeds = map_embeds(raw_lda_embeds, "lda_i2w")
    
    # Every vocabulary item must have received an embedding.
    assert len(lda_embeds) == len(groups)

In [None]:
# Category-prediction F1 scores for the skip-gram embeddings of data/final_model.
classify_all(get_all_embed(load("final_model")))

In [None]:
# Same evaluation for the ALS embeddings.
classify_all(als_embeds)

In [None]:
# Same evaluation for the LDA embeddings.
classify_all(lda_embeds)