# Doc2Vec

In [2]:
!pip install sctokenizer
!pip install nltk
!pip install scipy==1.12
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting scipy==1.12
  Using cached scipy-1.12.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Using cached scipy-1.12.0-cp312-cp312-win_amd64.whl (45.8 MB)
Installing collected packages: scipy
Successfully installed scipy-1.12.0
Defaulting to user installation because normal site-packages is not writeable


In [3]:
from sctokenizer import JavaTokenizer, TokenType
import math
def tokenize_code(code):
    """
    Turn a piece of Java source into a flat token sequence.

    The code is run through sctokenizer's JavaTokenizer. Tokens whose type is
    TokenType.COMMENT_SYMBOL are left out. For identifiers the token *type*
    (TokenType.IDENTIFIER) is stored instead of the identifier's own text, so
    every identifier maps to the same entry; all other tokens contribute their
    literal value.

    Parameters:
    code (str): The Java code to tokenize.

    Returns:
    list: One entry per kept token, either TokenType.IDENTIFIER or the
    token's value.
    """
    return [
        tok.token_type if tok.token_type == TokenType.IDENTIFIER else tok.token_value
        for tok in JavaTokenizer().tokenize(code)
        if tok.token_type != TokenType.COMMENT_SYMBOL
    ]

In [4]:
import re

class Document:
  """
  A source file loaded from disk together with its tokenized views.

  Creating an instance reads the file, tokenizes it line by line with
  ``tokenize_code``, counts how often each token occurs, extracts the
  comments from the raw text and finally appends the instance to
  ``Document.all_documents``.
  """
  # Registry of every Document created so far, in creation order.
  # It is a class attribute, so it is shared by all instances.
  all_documents = []

  def __init__(self, doc_name: str) -> None:
    """
    Initializes a Document object with the provided document name.

    Args:
        doc_name (str): Path of the file to load (opened as UTF-8 text).
    """
    self.doc_name = doc_name
    # token (as string) -> number of occurrences in the file
    self.word_dict: dict[str, int] = {}
    # file content exactly as read
    self.raw_text = ""
    # tokenized content: tokens separated by spaces, one line per source line
    self.text = ""
    # every token of the file, in order, converted to str
    self.tokens: list[str] = []
    # text of all comments found in raw_text, separated by single spaces
    self.raw_comments: str = ""
    # whitespace-separated words of raw_comments
    self.token_comments: list[str] = []
    # all tokens joined by single spaces on one line
    self.one_line_text: str = ""

    self.__read_text()
    self.__create_word_dict()
    self.__get_comments(self.raw_text)
    self.__get_one_line_text()
    Document.all_documents.append(self)

  def __get_comments(self, code):
    """
    Extracts comments from the provided code string.

    All ``//`` comments are collected first, followed by all ``/* ... */``
    comments; the comment markers and surrounding whitespace are removed and
    empty comments are dropped. The result is stored in ``raw_comments``
    (comments joined by single spaces) and ``token_comments`` (its words).

    Args:
        code (str): The code string to extract comments from.
    """
    single_line_comment_pattern = r'//.*'
    multi_line_comment_pattern = r'/\*[\s\S]*?\*/'

    single_line_comments = re.findall(single_line_comment_pattern, code)
    multi_line_comments = re.findall(multi_line_comment_pattern, code)

    # str.lstrip takes a set of characters, so this removes every leading '/'.
    stripped_single_line_comments = [comment.lstrip('/').strip() for comment in single_line_comments]
    stripped_multi_line_comments = [re.sub(r'(^/\*|\*/$)', '', comment).strip() for comment in multi_line_comments]

    # Join with a separator between *all* comments so that the last word of
    # one comment never fuses with the first word of the next one.
    non_empty_comments = [
      comment
      for comment in stripped_single_line_comments + stripped_multi_line_comments
      if comment
    ]
    self.raw_comments = " ".join(non_empty_comments)
    # split() without an argument splits on any whitespace run, so no empty
    # tokens are produced and newlines inside block comments also separate words.
    self.token_comments = self.raw_comments.split()

  def __read_text(self) -> None:
    """
    Reads the file, filling ``raw_text``, ``text`` and ``tokens``.

    The file is tokenized one line at a time. ``raw_text`` receives the
    unmodified content, ``tokens`` the string form of every token, and
    ``text`` the tokens of each line joined by spaces with one output line
    per source line.
    """
    raw_lines = []
    text_lines = []
    # The context manager closes the file even if tokenization raises.
    with open(self.doc_name, "r", encoding="utf-8") as file:
      for line in file:
        raw_lines.append(line)
        line_tokens = [str(token) for token in tokenize_code(line)]
        self.tokens.extend(line_tokens)
        # Join the token strings themselves; joining str(list) would
        # space-separate the characters of the list's repr.
        text_lines.append(" ".join(line_tokens))

    self.raw_text = "".join(raw_lines)
    self.text = "\n".join(text_lines)

  def __create_word_dict(self):
    """
    Creates a dictionary to store the frequency of each token in the document.
    """
    for token in self.tokens:
      self.word_dict[token] = self.word_dict.get(token, 0) + 1

  def __get_one_line_text(self):
    """
    Stores all tokens joined by single spaces in ``one_line_text``.
    """
    self.one_line_text = " ".join(str(token) for token in self.tokens)

In [5]:
import os
import shutil

def generate_documents(class_name: str, directory: str = "") -> list[dict[str, float]]:
    """
    Load every file found under ``<directory>/<class_name>/<folder>/`` as a Document.

    The function walks exactly two levels: each sub-folder of
    ``<directory>/<class_name>`` and each regular file inside those
    sub-folders. A ``Document`` is created for every such file; the created
    objects are not returned here but are reachable through
    ``Document.all_documents``, where each Document registers itself.
    Folders and files are visited in sorted name order so the resulting
    document order is the same on every run.

    If ``<directory>/<class_name>`` does not exist, a message is printed and
    nothing is loaded.

    Parameters:
    class_name (str): Name of the sub-directory of ``directory`` to scan.
    directory (str): Base directory that contains ``class_name``. The default
        "" makes ``class_name`` relative to the current working directory.

    Returns:
    list: Always an empty list (kept for backward compatibility).
    """
    data = []

    class_path = os.path.join(directory, class_name)

    if not os.path.exists(class_path):
        print(f'La ruta {class_path} no existe')
        return data

    # os.listdir returns entries in arbitrary order; sorting keeps the
    # document order (and anything indexed by it) stable across runs.
    for folder_name in sorted(os.listdir(class_path)):
        folder_path = os.path.join(class_path, folder_name)
        if not os.path.isdir(folder_path):
            continue

        for file_name in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file_name)
            # Only regular files can be opened as documents; a nested
            # directory here would make Document's open() call fail.
            if os.path.isfile(file_path):
                Document(file_path)

    return data

In [6]:
dir_name = "data_set_splitted/"

# Start from an empty registry so that re-running this cell does not load
# every file a second time: each Document appends itself to
# Document.all_documents when it is created.
Document.all_documents.clear()

# Same loading order as before: for each split, non-plagiarised files first,
# then plagiarised ones.
for split in ("train", "val", "test"):
    for label in ("no_plagiado", "plagiado"):
        generate_documents(label, f"{dir_name}{split}")

# The query files live outside the split data set.
generate_documents("no_plagiado", "queries")


[]

In [8]:
# Number of documents registered so far.
print(len(Document.all_documents))

# One token list per document, in registry order.
data = [document.tokens for document in Document.all_documents]


1832


In [11]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

vector_size = 100
epochs = 100
# Fixed seed for the model's random number generator so that repeated runs
# start from the same initial state. Per the gensim documentation, a fully
# deterministic run additionally requires workers=1 and a fixed
# PYTHONHASHSEED; those are left at their defaults here to keep training fast.
seed = 42

# Each token list is tagged with its position in `data` (as a string).
tagged_data = [TaggedDocument(words=code, tags=[str(i)]) for i, code in enumerate(data)]

model = Doc2Vec(vector_size=vector_size, epochs=epochs, seed=seed)

model.build_vocab(tagged_data)

model.train(tagged_data,
  total_examples=model.corpus_count,
  epochs=model.epochs
)


In [12]:
# Persist the trained model to "doc_2_vec_model" (path relative to the working directory).
model.save("doc_2_vec_model")

In [13]:
# Infer one embedding per token list in `data`, keeping the same order.
code_vectors = [model.infer_vector(code) for code in data]

# Show the first embedding as a quick sanity check.
print(code_vectors[0])

[ 0.39622623  0.76118845  1.4314494  -0.05048709  0.17532209  0.46299627
 -0.34288982 -0.4091335   0.38640738 -0.8947137   1.0105364   0.21155538
 -0.11648066 -0.5988063   0.7520589   0.3919415  -0.8178045  -0.76191
 -0.4989501   0.3184066  -0.14023519  0.77493364  0.6081415  -0.23907092
 -1.4507564  -0.30758205  0.6136947  -0.36435512 -0.09029232 -0.45981783
  0.17073308 -0.43309942 -0.68137574  0.49596515 -0.25680435 -0.9114806
  1.6861128   0.56535053 -0.59712774  0.00327066  0.19090734 -0.5718223
  0.5532488   1.5670247   0.6426095   1.053093    0.02012687  0.6191046
 -0.11427987 -0.3921164  -0.66345453  0.65601456  0.04763686 -0.7044579
 -0.51289093  0.5714094   0.04158123  0.3122712  -0.07183295 -0.790684
 -0.2534778  -0.34256777  0.2479571  -0.78059345 -0.27723572  0.5058676
  1.2426094  -0.09071164 -0.28360447  0.7178948  -0.9028868   0.6027275
  0.61436576 -0.33075935 -1.0134513   0.4228074   0.5216753   0.22657597
 -0.14012007 -0.93241066  0.4436081   0.0636168  -0.37429228 -