# Doc2Vec

In [2]:
!pip install sctokenizer
!pip install nltk
!pip install scipy==1.12
!pip install gensim

Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Defaulting to user installation because normal site-packages is not writeable
Collecting scipy==1.12
  Using cached scipy-1.12.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
Using cached scipy-1.12.0-cp312-cp312-win_amd64.whl (45.8 MB)
Installing collected packages: scipy
Successfully installed scipy-1.12.0
Defaulting to user installation because normal site-packages is not writeable


In [3]:
from sctokenizer import JavaTokenizer, TokenType
import math
def tokenize_code(code):
    """
    Convert Java source text into a flat list of normalized tokens.

    The text is tokenized with sctokenizer's JavaTokenizer. Tokens whose type
    is COMMENT_SYMBOL are dropped, identifier tokens are replaced by their
    token type (so the concrete identifier name is not kept), and every other
    token contributes its token value.

    Parameters:
    code (str): The Java code to tokenize.

    Returns:
    list: Token values, with TokenType.IDENTIFIER standing in for identifiers.
    """
    normalized = []
    for tok in JavaTokenizer().tokenize(code):
        kind = tok.token_type
        if kind == TokenType.COMMENT_SYMBOL:
            # Comment delimiters carry no information about the code itself.
            continue
        # Identifiers collapse to their type; everything else keeps its value.
        normalized.append(kind if kind == TokenType.IDENTIFIER else tok.token_value)

    return normalized

In [4]:
import re

class Document:
  """
  Represents a source file together with its token stream and its comments.

  Creating an instance reads the file named by ``doc_name``, tokenizes it
  line by line with ``tokenize_code``, counts token frequencies, extracts the
  comments with regular expressions and finally appends the instance to the
  class-level ``all_documents`` list.

  Attributes:
      doc_name (str): Path of the file the document was read from.
      raw_text (str): The file content exactly as read.
      text (str): The tokens of each source line joined by single spaces,
          with one output line per source line.
      tokens (list[str]): Every token of the file, in order, as strings.
      word_dict (dict[str, int]): Number of occurrences of each token.
      raw_comments (str): All extracted comments joined by single spaces
          (single-line comments first, then multi-line comments).
      token_comments (list[str]): ``raw_comments`` split on whitespace.
      one_line_text (str): All tokens joined by single spaces.
  """
  # Registry shared by all instances: every constructed Document is appended
  # here, so constructing the same file twice yields two entries.
  all_documents = []

  def __init__(self, doc_name: str) -> None:
    """
    Initializes a Document object with the provided document name.

    Args:
        doc_name (str): The name of the document file.

    Raises:
        OSError: If the file cannot be opened.
    """
    self.doc_name = doc_name
    self.word_dict: dict[str, int] = {}
    self.raw_text = ""
    self.text = ""
    self.tokens: list[str] = []
    self.raw_comments: str = ""
    self.token_comments: list[str] = []
    self.one_line_text: str = ""

    # Order matters: the later steps consume what __read_text produces.
    self.__read_text()
    self.__create_word_dict()
    self.__get_comments(self.raw_text)
    self.__get_one_line_text()
    Document.all_documents.append(self)

  def __get_comments(self, code):
    """
    Extracts comments from the provided code string.

    Fills ``raw_comments`` with the comment texts separated by single spaces
    and ``token_comments`` with the whitespace-separated words of that text
    (an empty list when there are no comments).

    Args:
        code (str): The code string to extract comments from.
    """
    # NOTE: purely regex based, so a "//" inside a string literal or inside a
    # block comment is also reported as a single-line comment.
    single_line_comments = re.findall(r'//.*', code)
    multi_line_comments = re.findall(r'/\*[\s\S]*?\*/', code)

    # lstrip('/') removes every leading slash, so "/// text" becomes "text".
    comments = [comment.lstrip('/').strip() for comment in single_line_comments]
    # Drop the opening "/*" and the closing "*/" of each block comment.
    comments += [
      re.sub(r'(^/\*|\*/$)', '', comment).strip()
      for comment in multi_line_comments
    ]

    # Join with a separator between every pair of comments (block comments
    # used to be glued together, fusing their first and last words) and skip
    # comments that are empty after stripping.
    self.raw_comments = " ".join(comment for comment in comments if comment)
    # split() without arguments never produces empty-string tokens.
    self.token_comments = self.raw_comments.split()

  def __read_text(self) -> None:
    """
    Reads the text content from the document file.

    Sets ``raw_text`` to the unmodified file content, extends ``tokens`` with
    the string form of every token returned by ``tokenize_code`` for each
    line, and sets ``text`` to the space-joined tokens of each line, with
    lines separated by a newline.
    """
    raw_lines = []
    text_lines = []
    # The context manager closes the file even if tokenizing a line raises.
    with open(self.doc_name, "r", encoding="utf-8") as file:
      for line in file:
        raw_lines.append(line)
        line_tokens = [str(token) for token in tokenize_code(line)]
        self.tokens.extend(line_tokens)
        # Join the tokens themselves; joining str(list) would interleave
        # spaces between the characters of the list's repr.
        text_lines.append(" ".join(line_tokens))

    self.raw_text = "".join(raw_lines)
    self.text = "\n".join(text_lines)

  def __create_word_dict(self):
    """
    Creates a dictionary to store the frequency of each word in the document.
    """
    for token in self.tokens:
      self.word_dict[token] = self.word_dict.get(token, 0) + 1

  def __get_one_line_text(self):
    """
    Stores all tokens of the document, joined by single spaces, in
    ``one_line_text``.
    """
    self.one_line_text = " ".join(str(token) for token in self.tokens)

In [5]:
import os
import shutil

def generate_documents(class_name: str, directory: str = "") -> list[dict[str, float]]:
    """
    Create a Document for every file found under directory/class_name/<id>/.

    The class folder is expected to contain one sub-folder per id, each holding
    the files to load. Entries of the class folder that are not directories,
    and entries of an id folder that are not regular files, are skipped.
    Folders and files are visited in sorted name order so the order in which
    documents are created does not depend on the file system.

    The function works through its side effect: Document is instantiated once
    per file (in SOURCE, its constructor adds the instance to
    Document.all_documents). If the class folder does not exist a message is
    printed and nothing is loaded.

    Parameters:
    class_name (str): The name of the class folder.
    directory (str): The folder that contains the class folder.

    Returns:
    list: Always an empty list; the created documents are not returned.
    """
    data = []

    class_path = os.path.join(directory, class_name)

    if not os.path.exists(class_path):
        print(f'La ruta {class_path} no existe')
        return data

    # os.listdir returns entries in arbitrary order; sort for reproducibility.
    for folder_id in sorted(os.listdir(class_path)):
        folder_path = os.path.join(class_path, folder_id)
        if not os.path.isdir(folder_path):
            continue

        for file_name in sorted(os.listdir(folder_path)):
            file_path = os.path.join(folder_path, file_name)
            # A nested directory cannot be opened as a text file; skip it.
            if os.path.isfile(file_path):
                Document(file_path)

    return data

In [6]:
dir_name = "data_set_splitted/"

# Start from an empty registry so that re-running this cell does not append
# every file to Document.all_documents a second time.
Document.all_documents.clear()

# Load order: train, val, test (each "no_plagiado" then "plagiado"), and
# finally the query documents. Later cells index documents by this order.
for split in ("train", "val", "test"):
    for class_name in ("no_plagiado", "plagiado"):
        generate_documents(class_name, f"{dir_name}{split}")

generate_documents("no_plagiado", "queries")


[]

In [8]:
# Report how many documents have been registered so far.
print(len(Document.all_documents))

# One token list per document, in the order the documents were registered.
data = [document.tokens for document in Document.all_documents]


1832


In [14]:
from gensim.models import Doc2Vec
from gensim.models.doc2vec import TaggedDocument

# Hyperparameters handed to Doc2Vec below.
vector_size = 60
epochs = 100

# Wrap each token list in a TaggedDocument whose single tag is the token
# list's position in `data`, written as a string.
tagged_data = [
  TaggedDocument(words=tokens, tags=[str(position)])
  for position, tokens in enumerate(data)
]

model = Doc2Vec(vector_size=vector_size, epochs=epochs)

# Build the vocabulary from the tagged corpus, then train on that same corpus
# using the example count and epoch count stored on the model.
model.build_vocab(tagged_data)
model.train(tagged_data, total_examples=model.corpus_count, epochs=model.epochs)


In [15]:
# Persist the trained model under the relative file name "doc_2_vec_model".
model.save("doc_2_vec_model")

In [16]:
# Infer one vector per token list with the trained model, keeping the order
# of `data`.
code_vectors = [model.infer_vector(code) for code in data]

# Show the first vector as a quick sanity check.
print(code_vectors[0])

[-1.0704253e+00 -2.3402269e-01 -5.9948975e-01  3.9359775e-01
  4.0985331e-01 -1.3024232e-01 -1.5597981e-01 -5.2251959e-01
 -8.7454826e-01 -8.2927752e-01  2.6500143e-02 -7.7603894e-01
 -3.4269625e-01  1.6587644e+00  4.2544621e-01 -1.0857534e+00
  5.2693123e-01  2.5648770e-01  1.8171641e+00  2.5332472e-01
  8.0578291e-01  2.0665821e-01  9.5942754e-01 -1.2399166e+00
 -8.0167770e-02  9.5934473e-04 -3.6394474e-01  1.3132125e+00
  8.3329946e-01 -2.1232757e-01 -6.5241568e-03 -8.3940662e-02
 -9.5506740e-01 -4.5085961e-01 -7.8117388e-01 -7.4281543e-01
  7.0676714e-01 -5.6994754e-01 -1.6101278e-02 -1.1873368e+00
 -3.7913325e-01  4.0421683e-01  4.6186656e-02 -1.5572073e-01
 -2.6295093e-01 -1.5952112e-02  1.5770830e+00 -1.5088477e+00
 -4.7695443e-01  1.0494606e+00 -7.0793241e-01  5.8350984e-02
  3.0346436e-02 -1.6266169e-01 -1.0991377e-01 -1.2480173e+00
 -2.8784546e-01  5.5258179e-01 -1.4954592e+00 -1.5448141e-02]
