# Create Word2Vec Embeddings

In [1]:
###
# imports
###
# Besides importing, this cell has two side effects: it mounts Google Drive
# and downloads NLTK tokenizer data. Both must run before any later cell.

# for running in Google Colab
# (this import only resolves inside a Colab runtime)
from google.colab import drive
# Mounts Drive at /content/drive; the data paths used later in this notebook
# live under /content/drive/MyDrive/...
drive.mount('/content/drive')

import os
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
import nltk
# Tokenizer data needed by word_tokenize.
# NOTE(review): newer NLTK releases appear to look for the 'punkt_tab'
# resource instead of 'punkt' — confirm against the installed NLTK version.
nltk.download('punkt')

Mounted at /content/drive


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

## Helper Functions

In [None]:
def read_file_as_one_document(filename):
    """Read a text file and return its contents as a single list of tokens.

    Args:
        filename: Path of the text file to read.

    Returns:
        The list produced by ``word_tokenize`` for the whole file content;
        the entire file is treated as one document.
    """
    # Read everything first; the handle is no longer needed while tokenizing.
    with open(filename, 'r') as handle:
        contents = handle.read()
    return word_tokenize(contents)

def get_documents_from_subfolders(base_path, subfolder, num_files=2, subset = False):
    """Tokenize the ``.txt`` files found in ``base_path/subfolder``.

    Files are processed in sorted file-name order so that the returned list
    (and anything trained on or written from it) is the same on every run;
    ``os.listdir`` alone returns entries in arbitrary order.

    Args:
        base_path: Directory that contains one subfolder per family.
        subfolder: Name of the subfolder to read.
        num_files: Maximum number of ``.txt`` files to read when ``subset`` is
            true. Ignored otherwise.
        subset: If true, only the first ``num_files`` ``.txt`` files (in sorted
            order) are read; if false (default), all of them are read.

    Returns:
        A list with one token list per ``.txt`` file, as produced by
        ``read_file_as_one_document``.

    Raises:
        OSError: If the subfolder does not exist or cannot be listed.
    """
    subfolder_path = os.path.join(base_path, subfolder)
    print(f"subfolder_path: {subfolder_path}")

    # Filter before slicing so that `num_files` counts documents actually
    # read, not directory entries that would later be skipped.
    txt_files = sorted(
        name for name in os.listdir(subfolder_path) if name.endswith('.txt')
    )
    if subset:
        txt_files = txt_files[:num_files]

    return [
        read_file_as_one_document(os.path.join(subfolder_path, name))
        for name in txt_files
    ]

# document vector
def document_vector(word2vec_model, doc_tokens):
    """Embed one document as the mean of its in-vocabulary token vectors.

    Args:
        word2vec_model: Model exposing ``wv`` (with ``key_to_index`` and
            list indexing) and ``vector_size``.
        doc_tokens: Iterable of tokens making up the document.

    Returns:
        The element-wise mean of the vectors of all tokens present in the
        model vocabulary, or a zero vector of length ``vector_size`` when
        none of the tokens are known.
    """
    keyed_vectors = word2vec_model.wv
    vocabulary = keyed_vectors.key_to_index

    known_tokens = list(filter(lambda tok: tok in vocabulary, doc_tokens))
    if len(known_tokens) == 0:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(word2vec_model.vector_size)

    token_matrix = keyed_vectors[known_tokens]
    return np.mean(token_matrix, axis=0)

subfolder_path: /content/drive/MyDrive/REU/ResearchProject/v077_clean/Vobfus


## Conversion Code

In [None]:
# Train one Word2Vec model per malware family, average the token vectors of
# each document into a single document embedding, min-max scale the family's
# embeddings to [0, 1] and write them to one CSV per family.
base_path = '/content/drive/MyDrive/REU/ResearchProject/v077_clean/'
#### from maleware2
# subfolders = ['winwebsec', 'zeroaccess'] #'zbot',
#### from v077
subfolders = ['Vobfus', 'Diplugem', 'Obfuscator', 'Vundo', 'VBInject',
              'Delf', 'Beebone', 'Enterak.A','OnLineGames',
              'Startpage', 'Allaple.A', 'Injector', 'Systex.A', 'Expiro.BK',
              'FakeRean', 'Small', 'Toga!rfn', 'Lamechi.B', 'CeeInject',
              'Renos', 'Hotbar', 'DelfInject']
vector_size = 104

# The output folder is relative to the current working directory; create it
# up front so the first to_csv call does not fail on a fresh runtime.
output_dir = f"data/embeddings/top25_{vector_size}"
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): every family gets its own Word2Vec model and its own min/max,
# so embeddings of different families presumably do not share one vector
# space or scale — confirm this is intended for the downstream use.
for family in subfolders:
  # Get the documents from each subfolder
  documents = get_documents_from_subfolders(base_path, subfolder = family)

  print(f"family: {family}")
  print(f"documents: {len(documents)}")
  print("------------------------------------------------------")

  # Word2Vec cannot build a vocabulary from zero tokens; skip such families
  # instead of aborting the whole run.
  if not any(documents):
    print(f"skipping {family}: no tokens found")
    continue

  #Word2Vec model
  # NOTE: seed alone does not make training reproducible with workers > 1;
  # per the gensim docs a fully deterministic run also needs workers=1 (and a
  # fixed PYTHONHASHSEED). Left at 4 workers to keep training time unchanged.
  word2vec_model = Word2Vec(sentences=documents, vector_size=vector_size,  window=10, min_count=0, workers=4, seed=42)

  #document embedding
  doc_vectors = [document_vector(word2vec_model, doc) for doc in documents]
  doc_vectors_df = pd.DataFrame(doc_vectors)

  #normalize to [0,1] using the global min/max over the whole family matrix
  values = doc_vectors_df.to_numpy()
  min_val = values.min()
  range_vals = values.max() - min_val

  if range_vals == 0:
    # All entries are identical: dividing would fill the CSV with NaN.
    # Map the constant matrix to 0 instead.
    doc_vectors_df = doc_vectors_df - min_val
  else:
    doc_vectors_df = (doc_vectors_df - min_val) / range_vals

  #save csv
  new_filename = os.path.join(output_dir, f"{family}.csv")
  doc_vectors_df.to_csv(new_filename, header=False, index=False)


#