In [7]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
# from sgt import SGT
# import pandarallel

In [11]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [12]:
path = "../data/dataset.json"

with open(path) as f:
    data = json.load(f)

df = pd.json_normalize(data)
df.head()

In [15]:
sites = df.sites

In [16]:
del data, df

In [19]:
def seq_extract(x):
    output = []
    
    for site in x:
        output.append(site['site'].replace(".", "_"))
    return " ".join(output)

In [20]:
sequences = sites.apply(seq_extract).reset_index().rename(columns = {'index':'id', 'sites':'sequence'})
sequences.head()

Unnamed: 0,id,sequence
0,0,lenta_ru lenta_ru vk_com lenta_ru wikipedia_or...
1,1,windowsupdate_com amazon_com live_com akamaied...
2,2,slack_com slack_com mail_google_com vk_com fac...
3,3,amazon_com microsoft_com gvt2_com bing_net fbc...
4,4,vk_com mail_google_com lenta_ru slack_com airb...


In [21]:
seqs = sequences.sequence.tolist()

In [22]:
tagged_data = [TaggedDocument(words=_d.lower(), tags=[str(i)]) for i, _d in enumerate(seqs)]

In [48]:
tagged_data[0]

TaggedDocument(words='lenta_ru lenta_ru vk_com lenta_ru wikipedia_org instagram_com msn_com amazon_com outlook_com verisign_com googleadservices_com akamaiedge_net stardock_com mediaarea_net', tags=['0'])

In [49]:
max_epochs = 100
vec_size = 25
alpha = 0.025

model = Doc2Vec(vector_size=vec_size,
                alpha=alpha, 
                min_alpha=0.00025,
                min_count=3,
                dm =1, 
               window=3)
  
model.build_vocab(tagged_data)

In [50]:
model.train(tagged_data, total_examples=model.corpus_count, epochs=max_epochs)

In [31]:
path = "../data/dataset.json"

with open(path) as f:
    data = json.load(f)

df = pd.json_normalize(data)
df.head()

Unnamed: 0,browser,os,locale,user_id,gender,location,sites,time,date
0,Chrome,Windows 8,de-DE,164,m,Canada/Toronto,"[{'site': 'lenta.ru', 'length': 296}, {'site':...",03:57:00,2016-08-14
1,Chrome,Windows 10,pt-PT,99,f,Netherlands/Amsterdam,"[{'site': 'windowsupdate.com', 'length': 56}, ...",13:52:00,2016-05-31
2,Chrome,Windows 10,bg-BG,28,m,Brazil/Rio de Janeiro,"[{'site': 'slack.com', 'length': 158}, {'site'...",17:40:00,2019-04-15
3,Internet Explorer,Ubuntu,en-US,163,f,New Zealand/Auckland,"[{'site': 'amazon.com', 'length': 168}, {'site...",20:23:00,2018-03-25
4,Firefox,Debian,ro-RO,177,f,New Zealand/Auckland,"[{'site': 'vk.com', 'length': 43}, {'site': 'm...",01:10:00,2018-09-08


In [51]:
embeds = [model.dv[str(i)] for i in range(len(seqs))]

In [52]:
embeds = np.array(embeds)

In [35]:
df["target"] = 1*(df.user_id == 0)

In [36]:
from sklearn.linear_model import LogisticRegression

In [37]:
embeds.shape

(160000, 50)

In [53]:
lr = LogisticRegression()

In [54]:
lr.fit(embeds, df.target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
lr.score(embeds, df.target)

0.995

In [43]:
from sklearn.model_selection import cross_val_score

In [56]:
cv = cross_val_score(lr, embeds, df.target, cv = 5)

In [57]:
cv

array([0.995, 0.995, 0.995, 0.995, 0.995])

In [60]:
preds = lr.predict_proba(embeds)[: , 1]

In [61]:
preds

array([0.00618836, 0.00340772, 0.00415972, ..., 0.00100301, 0.00131412,
       0.00179374])

In [63]:
pd.Series(preds).describe()

count    1.600000e+05
mean     5.000527e-03
std      1.029953e-02
min      2.409539e-07
25%      5.838604e-04
50%      1.643320e-03
75%      4.841554e-03
max      3.774888e-01
dtype: float64

In [46]:
cv.mean()

0.9949999999999999

In [47]:
model.build_vocab()

1