In [None]:
from idarkvec.models.word2vec import Word2Vec
from idarkvec.preprocessing import CorpusExtractor

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Trace root directory and the darknet whose sub-folder is read from it.
TRACES = 'data'
darknet = 'darknet01'
# First and last day of the analysed period, formatted as YYYYMMDD.
START = '20210501'
END = '20210531'

## Embedding generation

In [None]:
from datetime import datetime, timedelta
def get_day(i, startfrom=None):
    """Return the date lying ``i`` days after ``startfrom`` as a ``YYYYMMDD`` string.

    Parameters
    ----------
    i : int
        Offset in days; ``0`` yields ``startfrom`` itself.
    startfrom : str, optional
        Reference date in ``YYYYMMDD`` format. When omitted, the notebook-level
        ``START`` is looked up at call time, so re-running the configuration
        cell is enough for a new start date to take effect.

    Returns
    -------
    str
        The shifted date formatted as ``YYYYMMDD``.

    Raises
    ------
    ValueError
        If ``startfrom`` does not match the ``YYYYMMDD`` format.
    """
    if startfrom is None:
        # Resolve lazily: a default of ``startfrom=START`` would freeze the
        # value START had when this cell was executed.
        startfrom = START
    start = datetime.strptime(startfrom, '%Y%m%d')
    return (start + timedelta(days=i)).strftime('%Y%m%d')

In [None]:
# Number of days to process, derived from the configured START/END period
# (both bounds included) rather than a hard-coded 31.
n_days = (datetime.strptime(END, '%Y%m%d') - datetime.strptime(START, '%Y%m%d')).days + 1
if n_days < 1:
    raise ValueError(f'END ({END}) must not precede START ({START})')

# Passed to Word2Vec as `destination`; it does not change between days.
mname = f'{darknet}_{START}'

for day in range(n_days):
    print("#" * 70)
    print(f'Processing day {day+1}/{n_days}')
    current_day = get_day(day)
    # extract corpus
    corpus = CorpusExtractor(trace_path=f'{TRACES}/{darknet}', day=current_day,
                             min_freq=5).from_darknet(top_ports=2500, verbose=True)

    if day == 0:
        # First day of the period: create the model and train it from scratch.
        word2vec = Word2Vec(c=5, e=200, destination=mname)
        word2vec.train(corpus, save=False) # you can change save=True to save the model
    else:
        # Following days: update the model built so far with the new corpus.
        word2vec.update(corpus, save=False)
        # Alternatively, reload a previously saved model before updating
        # (`sname` must be set to the name of that saved model):
        # word2vec = Word2Vec(c=5, e=200, source=sname, destination=mname)
        # word2vec.update(corpus, save=True)
print("#" * 70)
print('Training completed!')

In [None]:
# Load the ground truth (acknowledged IPs); CSV column 0 is used as the index.
GT = pd.read_csv('ground_truth.csv.gz',index_col=[0])

In [None]:
# Get the embeddings from the trained model; the ground truth is passed as `labels`.
Emb = word2vec.get_embeddings(labels = GT)
# Optionally save the embeddings (uncomment to enable):
# Emb.to_csv(f'{current_day}_embedding.csv')

In [None]:
# Preview the first rows of the embeddings table.
Emb.head()

## Downstream task example -- k-NN classification validation

In [None]:
from idarkvec.downstream.knn import KnnClassifier
from sklearn.metrics import classification_report

In [None]:
# Feature matrix: the first 200 columns of the embeddings table
# (presumably one per embedding dimension, matching e=200 -- TODO confirm).
X = Emb.iloc[:, :200].values
# Label of every sample.
y = Emb['label'].values
# Samples labelled 'unknown' are left out of the validation.
y_val = y[y != 'unknown']
valid_classes = np.unique(y_val)

In [None]:
# k-NN classifier: 5 neighbours, cosine metric.
# NOTE(review): it is fitted on every sample, 'unknown' ones included --
# presumably so they can still act as neighbours; confirm.
knn = KnnClassifier(n_neighbors=5, metric='cosine')
knn.fit(X, y, scale_data=True)

In [None]:
# Leave-one-out validation: labels are predicted only for the labelled samples.
# Their row indices are passed as a column vector.
to_keep = np.flatnonzero(y != "unknown")[:, np.newaxis]
y_pred = knn.predict(to_keep, scale_data=True, loo=True)

In [None]:
# Per-class metrics, restricted to the classes present among the labelled samples.
report = classification_report(y_val, y_pred, labels=valid_classes)
print(report)