In [None]:
!pip install sctokenizer
!pip install nltk


## Tokenización de código
Se utilizó la librería sctokenizer para tokenizar los códigos de Java


In [1]:
from sctokenizer import JavaTokenizer, TokenType
import math
def tokenize_code(code):
    """
    Turn a piece of Java source into a flat token list using sctokenizer.

    Comment-symbol tokens are dropped. Identifier tokens are represented by
    their token type instead of their text, so every identifier contributes
    the same entry regardless of its name. All remaining tokens contribute
    their literal value.

    Parameters:
    code (str): The Java code to tokenize.

    Returns:
    list: Token types (for identifiers) and token values (for everything
    else), in the order produced by the tokenizer.
    """
    lexed = JavaTokenizer().tokenize(code)

    return [
        tok.token_type if tok.token_type == TokenType.IDENTIFIER else tok.token_value
        for tok in lexed
        if not (tok.token_type == TokenType.COMMENT_SYMBOL)
    ]

In [2]:
import re

class Document:
  """
  A source file loaded from disk together with the data derived from it:
  raw text, token stream, token frequencies and extracted comments.

  Every instance appends itself to the class-level ``all_documents`` list.
  """
  all_documents = []

  def __init__(self, doc_name: str) -> None:
    """
    Reads ``doc_name`` and eagerly computes every derived attribute.

    Args:
        doc_name (str): Path of the document file to load.
    """
    self.doc_name = doc_name
    self.word_dict: dict[str, int] = {}   # token -> number of occurrences
    self.raw_text = ""                    # file content exactly as read
    self.text = ""                        # tokens joined by single spaces
    self.tokens: list[str] = []           # str() of every token, in file order
    self.raw_comments: str = ""           # comment bodies joined into one string
    self.token_comments: list[str] = []   # raw_comments split on single spaces
    self.one_line_text: str = ""          # tokens joined by single spaces

    self.__read_text()
    self.__create_word_dict()
    self.__get_comments(self.raw_text)
    self.__get_one_line_text()
    Document.all_documents.append(self)

  def __get_comments(self, code):
    """
    Extracts ``//`` and ``/* ... */`` comments from ``code`` and stores them in
    ``raw_comments`` (one string) and ``token_comments`` (split on spaces).

    Both patterns are applied to the whole text independently, so a ``//``
    that appears inside a block comment or a string literal is picked up too.

    Args:
        code (str): The code string to extract comments from.
    """
    single_line_comments = [
      # lstrip takes a set of characters, so this removes every leading '/'.
      found.lstrip('/').strip()
      for found in re.findall(r'//.*', code)
    ]
    multi_line_comments = [
      re.sub(r'(^/\*|\*/$)', '', found).strip()
      for found in re.findall(r'/\*[\s\S]*?\*/', code)
    ]

    # Each single-line comment is followed by one space (existing layout).
    # Block comments are separated by one space as well; they used to be
    # concatenated directly, which fused the last word of one comment with the
    # first word of the next.
    self.raw_comments = (
      "".join(f"{comment} " for comment in single_line_comments)
      + " ".join(multi_line_comments)
    )
    self.token_comments = self.raw_comments.split(" ")

  def __read_text(self) -> None:
    """
    Reads the file line by line, filling ``raw_text``, ``tokens`` and ``text``.

    Each line is tokenized on its own with ``tokenize_code``.
    """
    raw_lines: list[str] = []
    # The context manager closes the file even if tokenizing a line raises.
    with open(self.doc_name, "r", encoding="utf-8") as file:
      for line in file:
        raw_lines.append(line)
        self.tokens.extend(str(token) for token in tokenize_code(line))

    self.raw_text = "".join(raw_lines)
    # Join the tokens themselves; joining str(list) would interleave spaces
    # between the characters of the list's repr.
    self.text = " ".join(self.tokens)

  def __create_word_dict(self):
    """
    Counts how many times each token occurs and stores it in ``word_dict``.
    """
    for token in self.tokens:
      self.word_dict[token] = self.word_dict.get(token, 0) + 1

  def __get_one_line_text(self):
    """
    Stores all tokens joined by single spaces in ``one_line_text``.
    """
    self.one_line_text = " ".join(self.tokens)

In [3]:
import math

class Matrix:
  """
  Static helpers for matrices stored as lists of row lists.
  """
  @staticmethod
  def transpose_matrix(matrix: list[list[float]]) -> list[list[float]]:
    """
    Transposes a given matrix.

    Args:
        matrix (list[list[float]]): The matrix to transpose.

    Returns:
        list[list[float]]: A new matrix whose rows are the columns of the
        input. An empty input yields an empty list.
    """
    return [list(column) for column in zip(*matrix)]

  @staticmethod
  def trace_matrix(matrix: list[list[float]]) -> float:
    """
    Sums the main diagonal ``matrix[i][i]`` for every row index ``i``.

    Args:
        matrix (list[list[float]]): The matrix to calculate the trace for.

    Returns:
        float: The sum of the diagonal elements (0 for an empty matrix).

    Raises:
        IndexError: If a row is too short to contain its diagonal element,
            e.g. when the matrix has more rows than columns.
    """
    return sum(matrix[i][i] for i in range(len(matrix)))

  @staticmethod
  def _pad_with_zeros(matrix: list[list[float]], extra: int) -> list[list[float]]:
    """Returns a copy of ``matrix`` grown by ``extra`` zero columns and ``extra`` zero rows."""
    padded_width = (len(matrix[0]) if matrix else 0) + extra
    padded = [list(row) + [0] * extra for row in matrix]
    # A fresh list per appended row, so the new rows never alias each other.
    padded.extend([0] * padded_width for _ in range(extra))
    return padded

  @staticmethod
  def equalize_matrixes(matrix_a: list[list[float]], matrix_b: list[list[float]]) -> list[list[list[float]]]:
    """
    Makes the column count of ``matrix_a`` match the row count of ``matrix_b``
    by zero-padding whichever matrix is smaller.

    The smaller matrix grows by the same number of rows and columns, so a
    square matrix stays square. The inputs are never modified: the padded
    matrix is a copy and the other one is returned as-is.

    Args:
        matrix_a (list[list[float]]): The first (left) matrix.
        matrix_b (list[list[float]]): The second (right) matrix.

    Returns:
        list[list[list[float]]]: ``[matrix_a, matrix_b]`` after equalizing.
    """
    cols_a: int = len(matrix_a[0])
    rows_b: int = len(matrix_b)

    if cols_a > rows_b:
      return [matrix_a, Matrix._pad_with_zeros(matrix_b, cols_a - rows_b)]

    return [Matrix._pad_with_zeros(matrix_a, rows_b - cols_a), matrix_b]

  @staticmethod
  def multiply_matrix(matrix_a: list[list[float]], matrix_b: list[list[float]]) -> list[list[float]]:
    """
    Multiplies two matrices.

    When the inner dimensions differ, the operands are zero-padded with
    ``equalize_matrixes`` first instead of raising, so the product may be
    larger than either input.

    Args:
        matrix_a (list[list[float]]): The first (left) matrix.
        matrix_b (list[list[float]]): The second (right) matrix.

    Returns:
        list[list[float]]: The resulting product matrix.
    """
    if len(matrix_a[0]) != len(matrix_b):
      matrix_a, matrix_b = Matrix.equalize_matrixes(matrix_a, matrix_b)

    inner: int = len(matrix_a[0])
    n_cols: int = len(matrix_b[0])

    return [
      [sum(row[k] * matrix_b[k][col] for k in range(inner)) for col in range(n_cols)]
      for row in matrix_a
    ]

  @staticmethod
  def normalize_matrix(matrix: list[list[float]]) -> float:
    """
    Calculates the Frobenius norm of a matrix.

    The norm is the square root of the sum of the squared entries, which
    equals ``sqrt(trace(M^T * M))`` without building the product matrix.

    Args:
        matrix (list[list[float]]): The matrix to calculate the norm for.

    Returns:
        float: The Frobenius norm of the matrix (0.0 for an empty matrix).
    """
    return math.sqrt(sum(value * value for row in matrix for value in row))

  @staticmethod
  def print_matrix(matrix: list[list[float]]) -> None:
    """
    Prints a matrix between two separator lines, values rounded to 4 decimals.

    Args:
        matrix (list[list[float]]): The matrix to print.
    """
    print("--------------------------------------------")
    for row in range(len(matrix)):
      for col in range(len(matrix[0])):
        print(f"{round(matrix[row][col], 4)} |", end="")
      print("\n")
    print("--------------------------------------------")

## Cadena de Markov
Representa la probabilidad de transiciones entre los tokens de un código:
- Matrix: contiene los métodos necesarios para hacer operaciones entre matrices
- Markov_Chain: contiene la matriz de transición de los tokens de un archivo de código y la información necesaria para hacer la similitud de cosenos

In [4]:
class Markov_Chain:
  """
  First-order token transition matrix built from the tokens of one file.
  """
  def __init__(self, doc_name: str, tokenize: bool = False):
    """
    Reads the file, counts token-to-token transitions and builds the matrix.

    Args:
        doc_name (str): The name of the document to build the chain from.
        tokenize (bool, optional): Kept for backward compatibility and
            currently ignored. The file is always tokenized with
            ``tokenize_code``. Passing True used to call a method that was
            never defined and failed with AttributeError. Defaults to False.
    """
    self.markov_chain: list[list[float]] = []   # row i = transition probabilities out of state i
    self.doc_name = doc_name
    self.text = ""                              # tokens joined by single spaces
    self.tokens: list[str] = []                 # tokens in file order, as returned by tokenize_code
    self.token_transitions: dict[str, dict[str, int]] = {}   # token -> {next token -> count}

    self.__read_file()
    self.__generate_token_transitions()
    self.__generate_markov_chain()

  def __read_file(self):
    """
    Reads the file line by line and tokenizes each line with ``tokenize_code``.
    """
    # The context manager closes the file even if tokenizing a line raises.
    with open(self.doc_name, "r", encoding="utf-8") as file:
      for line in file:
        self.tokens.extend(tokenize_code(line))

    # Join the tokens themselves; joining str(list) would interleave spaces
    # between the characters of the list's repr.
    self.text = " ".join(str(token) for token in self.tokens)

  def __generate_token_transitions(self):
    """
    Counts, for every token, how often each other token follows it.

    Every token gets an entry, including the last one, whose entry stays
    empty if it never appears earlier in the file.
    """
    last_index = len(self.tokens) - 1
    for i, token in enumerate(self.tokens):
      followers = self.token_transitions.setdefault(token, {})
      if i < last_index:
        next_token = self.tokens[i + 1]
        followers[next_token] = followers.get(next_token, 0) + 1

  def __generate_markov_chain(self):
    """
    Turns the transition counts into a square matrix of probabilities.

    Rows and columns follow the insertion order of ``token_transitions``.
    A token without outgoing transitions yields a row of zeros.
    """
    states = list(self.token_transitions)
    self.markov_chain = []

    for state in states:
      followers = self.token_transitions[state]
      total_transitions: int = sum(followers.values())
      self.markov_chain.append([
        followers[next_state] / total_transitions if next_state in followers else 0
        for next_state in states
      ])

# Similitudes calculadas en Compare

## Distribución de probabilidad (TF-IDF)

Fórmula de TF-IDF:
$$
\text{tf-idf}(t, d) = \underbrace{tf(t, d)}_{\text{Term Frequency}} \times \underbrace{\log_{10}\left(\dfrac{N}{df(t)}\right)}_{\text{Inverse Document Frequency}}
$$

## Similitud de Euclidean
A partir de los vectores obtenidos por TF-IDF, se calculó la distancia euclidiana para ver qué tan cerca están los dos vectores en línea recta.

La fórmula de la distancia euclidiana es:

$$d(p,q) = \sqrt{(q_1-p_1)^2 + (q_2-p_2)^2 + \cdots + (q_n-p_n)^2} = \sqrt{\sum_{i=1}^n (q_i-p_i)^2}$$

## Similitud de Manhattan
A partir de los vectores obtenidos por TF-IDF, se calculó la distancia de Manhattan para ver qué tan cerca están los vectores, simulando el recorrido por las calles de una cuadrícula.

La fórmula de la distancia de Manhattan es:

$$d(p,q) = \sum_{i=1}^n |q_i-p_i|$$

## Similitud de Jaccard

Calcula la similitud entre dos conjuntos generados a partir de los tokens de un par de códigos

$$
J(A, B) = \frac{|A \cap B|} {|A \cup B|}
$$

## Similitud de espacios y saltos de línea
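Se calcula obteniendo la diferencia absoluta de la cuenta de tabulaciones, espacios y saltos de línea entre ambos códigos. Posteriormente se suman estas diferencias, se dividen entre la suma de los máximos de cada cuenta y el resultado se resta de 1 (si ninguno de los códigos contiene estos caracteres, la similitud es 1). Esto se expresa de la siguiente manera:

$$\text{Similitud} = 1 - \frac{|\text{Tabs}_A - \text{Tabs}_B| + |\text{Spaces}_A - \text{Spaces}_B| + |\text{NewLines}_A - \text{NewLines}_B|}{\max(\text{Tabs}_A, \text{Tabs}_B) + \max(\text{Spaces}_A, \text{Spaces}_B) + \max(\text{NewLines}_A, \text{NewLines}_B)}$$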

## Similitud de llaves
La función `classify_braces` clasifica las llaves (`{` y `}`) en un código según su posición en cada línea. Devuelve una cadena de notación de llaves, donde:
- '1' significa que la llave está al principio de la línea y hay otros caracteres después
- '2' significa que la llave está al final de la línea
- '3' significa que la llave está en el medio de la línea
- '4' significa que la llave está sola en la línea

La función `lcs` calcula la longitud de la subsecuencia común más larga (LCS) entre dos secuencias utilizando programación dinámica.

La función `calculate_brace_similarity` calcula la similitud de llaves entre dos códigos. Primero, clasifica las llaves de cada código con la función `classify_braces` para obtener las notaciones de llaves. Luego, calcula la longitud de la LCS entre las dos notaciones con la función `lcs`. Finalmente, calcula la similitud de llaves con la siguiente fórmula, donde $\text{L1}$ y $\text{L2}$ son las longitudes de las dos notaciones (si alguna de ellas está vacía, la similitud es 0):

$$\text{BraceSimilarity}(\text{code1}, \text{code2}) = \frac{2 \times \text{LCS}(\text{notation1}, \text{notation2})}{\text{L1} \times \text{L2}}$$

## Similitud del estilo de código
Calcula la similitud del estilo de código como el promedio de la similitud de llaves ($bs$), la similitud de comentarios ($cs$) y la similitud de espacios y saltos de línea ($snl$):
$$
\text{code style similarity} =  {(bs + cs + snl) / 3}
$$

In [10]:
from gensim.models import Doc2Vec

class Compare:
  """
  Pairwise similarity and distance metrics between two documents.

  Covers TF-IDF vectors, cosine similarity (vectors and matrices), Euclidean
  and Manhattan distance, Jaccard, whitespace and brace style metrics, LCS,
  Levenshtein distance and Doc2Vec inference.
  """
  # word -> ([total count over both documents], {names of the documents containing it}).
  # Rebuilt from scratch by every get_tf_idf call and shared by all instances.
  word_doc_freq: dict[str, tuple[list[int], set[str]]] = {}

  def __init__(self):
    """
    Initializes a Compare object. Sets the number of documents to be compared (2).
    """
    self.__n_docs = 2

  def __get_global_dict(self, doc_1: "Document", doc_2: "Document") -> None:
    """
    Rebuilds ``Compare.word_doc_freq`` from the word counts of both documents.

    Args:
        doc_1 (Document): The first document to compare.
        doc_2 (Document): The second document to compare.
    """
    vocabulary: dict[str, tuple[list[int], set[str]]] = {}
    for doc in (doc_1, doc_2):
      for word, count in doc.word_dict.items():
        if word in vocabulary:
          vocabulary[word][0][0] += count
          vocabulary[word][1].add(doc.doc_name)
        else:
          vocabulary[word] = ([count], {doc.doc_name})

    Compare.word_doc_freq = vocabulary

  def __calc_idf(self) -> list[float]:
    """
    Calculates the IDF of every word in the shared vocabulary as
    ``ln(N / (df + 1)) + 1``, where N is the number of documents and df the
    number of documents containing the word.

    Returns:
        list[float]: IDF values in vocabulary order.
    """
    return [
      math.log((self.__n_docs) / (len(doc_names) + 1)) + 1
      for _, doc_names in Compare.word_doc_freq.values()
    ]

  def __generate_tf(self, doc: "Document") -> list[float]:
    """
    Calculates the term frequency of every vocabulary word in ``doc``.

    Args:
        doc (Document): The document to calculate TF for.

    Returns:
        list[float]: Occurrences divided by the document's total word count,
        in vocabulary order; 0 for words the document does not contain.
    """
    n_words: int = sum(doc.word_dict.values())

    return [
      doc.word_dict[word] / n_words if word in doc.word_dict else 0
      for word in Compare.word_doc_freq
    ]

  def __calc_tf_idf(self, tf: list[float], idf: list[float]) -> list[float]:
    """
    Multiplies TF and IDF element by element.

    Args:
        tf (list[float]): A list containing TF values.
        idf (list[float]): A list containing IDF values.

    Returns:
        list[float]: One TF-IDF value per IDF entry.
    """
    return [tf[i] * idf[i] for i in range(len(idf))]

  @staticmethod
  def calc_dot_product(u_vector: list[float], v_vector: list[float]) -> float:
    """
    Calculates the dot product of two vectors.

    Args:
        u_vector (list[float]): The first vector.
        v_vector (list[float]): The second vector.

    Returns:
        float: The dot product of the two vectors.

    Raises:
        ValueError: If the vectors have different lengths. ValueError is a
            subclass of Exception, so callers catching Exception still work.
    """
    if len(u_vector) != len(v_vector):
      raise ValueError("Length of vectors is not equal")

    product: float = 0
    for i in range(len(u_vector)):
      product += u_vector[i] * v_vector[i]
    return product

  @staticmethod
  def calc_magnitude(vector: list[float]) -> float:
    """
    Calculates the magnitude (Euclidean length) of a vector.

    Args:
        vector (list[float]): The vector to calculate the magnitude for.

    Returns:
        float: The magnitude of the vector.
    """
    return math.sqrt(sum(num ** 2 for num in vector))

  def get_tf_idf(self, doc_1: "Document", doc_2: "Document") -> list[list[float]]:
    """
    Generates TF-IDF word vectors for the two documents.

    As a side effect, ``Compare.word_doc_freq`` is replaced by the vocabulary
    of this pair.

    Args:
        doc_1 (Document): The first document.
        doc_2 (Document): The second document.

    Returns:
        list[list[float]]: ``[vector_1, vector_2]``, both in vocabulary order.
    """
    self.__get_global_dict(doc_1, doc_2)
    doc_1_tf: list[float] = self.__generate_tf(doc_1)
    doc_2_tf: list[float] = self.__generate_tf(doc_2)
    idf = self.__calc_idf()

    # Both vectors are built over the same shared vocabulary, so they always
    # have the same length and need no padding.
    return [self.__calc_tf_idf(doc_1_tf, idf), self.__calc_tf_idf(doc_2_tf, idf)]

  def cosine_similarity_vector(self, vector_1: list[float], vector_2: list[float]) -> float:
    """
    Calculates the cosine similarity of two vectors, rounded to 4 decimals.

    Args:
        vector_1 (list[float]): The first vector
        vector_2 (list[float]): The second vector.

    Returns:
        float: The cosine similarity (between -1.0 and 1.0), or 0.0 when
        either vector has zero magnitude.

    Raises:
        ValueError: If the vectors have different lengths.
    """
    product: float = Compare.calc_dot_product(vector_1, vector_2)
    denominator: float = Compare.calc_magnitude(vector_1) * Compare.calc_magnitude(vector_2)

    # A zero vector has no direction; report "no similarity" instead of dividing by zero.
    if denominator == 0:
      return 0.0

    return round(product / denominator, 4)

  def cosine_similarity_matrix(self, matrix_a: list[list[float]], matrix_b: list[list[float]]) -> float:
    """
    Calculates the cosine similarity between two matrices as
    ``trace(B^T * A) / (norm(A) * norm(B))``, rounded to 4 decimals.

    Args:
        matrix_a (list[list[float]]): The first matrix.
        matrix_b (list[list[float]]): The second matrix.

    Returns:
        float: The cosine similarity between the matrices, or 0.0 when either
        matrix has a zero norm.
    """
    norm_matrix_a = Matrix.normalize_matrix(matrix_a)
    norm_matrix_b = Matrix.normalize_matrix(matrix_b)
    matrix_bt = Matrix.transpose_matrix(matrix_b)
    matrix_c = Matrix.multiply_matrix(matrix_bt, matrix_a)
    trace = Matrix.trace_matrix(matrix_c)

    denominator = norm_matrix_a * norm_matrix_b
    # An all-zero matrix would otherwise cause a division by zero.
    if denominator == 0:
      return 0.0

    return round(trace / denominator, 4)

  def euclidean_similarity(self, vector_1: list[float], vector_2: list[float]) -> float:
    """
    Calculate the Euclidean distance between two vectors.

    Despite the method name, the result is a distance: 0 means identical.

    Parameters:
    vector_1 (list): The first vector.
    vector_2 (list): The second vector.

    Returns:
    float: The Euclidean distance between the two vectors.

    Raises:
    ValueError: If the vectors are not of the same length.
    """
    if len(vector_1) != len(vector_2):
      raise ValueError("Vectors should have the same length")

    sum_squares = sum((a - b) ** 2 for a, b in zip(vector_1, vector_2))

    return math.sqrt(sum_squares)

  def manhattan_similarity(self, vector_1: list[float], vector_2: list[float]):
    """
    Calculate the Manhattan distance between two vectors.

    Despite the method name, the result is a distance: 0 means identical.

    Parameters:
    vector_1 (list): The first vector.
    vector_2 (list): The second vector.

    Returns:
    float: The Manhattan distance between the two vectors.

    Raises:
    ValueError: If the vectors are not of the same length.
    """
    if len(vector_1) != len(vector_2):
      raise ValueError("Vectors should have the same length")

    return sum(abs(a - b) for a, b in zip(vector_1, vector_2))

  def jaccard_similarity(self, tokens_1: list[str], tokens_2: list[str]) -> float:
    """
    Calculates the Jaccard similarity between the sets of distinct tokens.

    Args:
        tokens_1 (list[str]): The tokens from the first document.
        tokens_2 (list[str]): The tokens from the second document.

    Returns:
        float: ``|A & B| / |A | B|`` (0.0 to 1.0). Two empty inputs are
        treated as identical and give 1.0.
    """
    tokens1 = set(tokens_1)
    tokens2 = set(tokens_2)
    union = tokens1 | tokens2

    # Both inputs empty: avoid dividing by zero.
    if not union:
      return 1.0

    return len(tokens1 & tokens2) / len(union)

  def space_new_line_similarity(self, code_1: str, code_2: str) -> float:
    """
    Calculate the similarity between two pieces of code based on spaces, tabs, and newlines.

    For each of the three characters the absolute difference of the counts is
    accumulated and divided by the accumulated maximum count; the similarity
    is one minus that ratio.

    Parameters:
    code_1 (str): The first piece of code.
    code_2 (str): The second piece of code.

    Returns:
    float: The similarity score; 1.0 when neither text contains any of the
    three characters.
    """
    distance = 0
    total = 0
    for symbol in ('\t', ' ', '\n'):
      count_1 = code_1.count(symbol)
      count_2 = code_2.count(symbol)
      distance += abs(count_1 - count_2)
      total += max(count_1, count_2)

    if total == 0:
      return 1.0
    return 1 - distance / total

  @staticmethod
  def classify_braces(code: str) -> str:
    """
    Classify braces in the given code and return a string representation.

    Each line that contains ``{`` or ``}`` contributes one digit, decided on
    the line with surrounding whitespace removed:
      '1' - starts with a brace and has more characters after it
      '4' - consists of a single brace only
      '2' - does not start with a brace but ends with one
      '3' - braces only in the middle of the line

    Parameters:
    code (str): The code to classify braces.

    Returns:
    str: The digits of all brace lines, in line order.
    """
    brace_notation = []
    for line in code.split('\n'):
      stripped = line.strip()
      if '{' not in stripped and '}' not in stripped:
        continue

      if stripped.startswith(('{', '}')):
        brace_notation.append('1' if len(stripped) > 1 else '4')
      elif stripped.endswith(('{', '}')):
        brace_notation.append('2')
      else:
        brace_notation.append('3')
    return ''.join(brace_notation)

  @staticmethod
  def lcs(iter_1: list, iter_2: list) -> int:
    """
    Calculate the length of the longest common subsequence (LCS) of two sequences.

    Uses the classic dynamic-programming recurrence but keeps only the
    previous and the current row of the table.

    Parameters:
    iter_1 (list or str): The first sequence.
    iter_2 (list or str): The second sequence.

    Returns:
    int: The length of the LCS.
    """
    previous = [0] * (len(iter_2) + 1)
    for item_1 in iter_1:
      current = [0]
      for j, item_2 in enumerate(iter_2):
        if item_1 == item_2:
          current.append(previous[j] + 1)
        else:
          current.append(max(current[j], previous[j + 1]))
      previous = current
    return previous[-1]

  def calculate_brace_similarity(self, code_1: str, code_2: str) -> float:
    """
    Calculate the similarity between two pieces of code based on their brace notation.

    The score is ``2 * LCS / (L1 * L2)`` where L1 and L2 are the lengths of
    the two brace notations produced by ``classify_braces``.

    NOTE(review): dividing by the product ``L1 * L2`` matches the formula
    written in this notebook's markdown, but it does not bound the score by 1
    (two one-brace-line files give 2.0) and it shrinks for longer identical
    notations. Confirm whether ``2 * LCS / (L1 + L2)`` was intended.

    Parameters:
    code_1 (str): The first piece of code.
    code_2 (str): The second piece of code.

    Returns:
    float: The similarity score; 0.0 when either code has no brace lines.
    """
    notation1 = Compare.classify_braces(code_1)
    notation2 = Compare.classify_braces(code_2)

    L1, L2 = len(notation1), len(notation2)
    if L1 == 0 or L2 == 0:
      return 0.0

    return 2 * Compare.lcs(notation1, notation2) / (L1 * L2)

  def code_style_similarity(self, bs: float, cs: float, snl: float) -> float:
    """
    Calculates a code style similarity score based on three metrics.

    Args:
        bs (float): The bracket similarity of the code.
        cs (float): The comment similarity of the code.
        snl (float): The space-newline similarity of the code.

    Returns:
        float: The combined code style similarity score (average of the three inputs).
    """
    return (bs + cs + snl) / 3

  def levenshtein_distance(self, iter_1: list, iter_2: list) -> int:
    """
    Calculates the Levenshtein (edit) distance between two sequences.

    Insertions, deletions and substitutions cost 1 each. Only the previous
    and the current row of the dynamic-programming table are kept.

    Args:
        iter_1 (list or str): The first sequence.
        iter_2 (list or str): The second sequence.

    Returns:
        int: The minimum number of edits turning ``iter_1`` into ``iter_2``.
    """
    n = len(iter_2)
    previous = list(range(n + 1))

    for i, item_1 in enumerate(iter_1, start=1):
      current = [i]
      for j, item_2 in enumerate(iter_2, start=1):
        if item_1 == item_2:
          current.append(previous[j - 1])
        else:
          current.append(1 + min(current[j - 1], previous[j], previous[j - 1]))
      previous = current

    return previous[n]

  def get_doc_2_vec(self, token_vector_1: list[str], token_vector_2: list[str], model_path: str = "doc_2_vec_model") -> list:
    """
    Infers one Doc2Vec vector per token list with a model loaded from disk.

    The model is loaded again on every call.

    Args:
        token_vector_1 (list[str]): Tokens of the first document.
        token_vector_2 (list[str]): Tokens of the second document.
        model_path (str, optional): Path of the saved gensim Doc2Vec model.
            Defaults to "doc_2_vec_model".

    Returns:
        list: ``[vector_1, vector_2]`` as returned by ``infer_vector``.
    """
    model = Doc2Vec.load(model_path)
    vector_1 = model.infer_vector(token_vector_1)
    vector_2 = model.infer_vector(token_vector_2)

    return [vector_1, vector_2]
     


In [28]:
import os
import shutil

def table_generator(class_name: str, directory: str = "") -> list[dict[str, float]]:
    """
    Generate a table of document comparison metrics for a given class name.

    ``directory/class_name`` is expected to contain one sub-folder per document
    pair. Inside each sub-folder the files are sorted by path and the first two
    are compared, so the result does not depend on the arbitrary order returned
    by ``os.listdir``. Sub-folders with fewer than two files are skipped with a
    message instead of failing.

    Parameters:
    class_name (str): The name of the class folder. Rows are labelled 1 when
        it is "plagiado" and 0 otherwise.
    directory (str): The folder that contains the class folder.

    Returns:
    list: A list of dictionaries containing comparison metrics for each document pair
    (empty when the class folder does not exist).
    """
    ruta_clase = os.path.join(directory, class_name)

    if not os.path.exists(ruta_clase):
        print(f'La ruta {ruta_clase} no existe')
        return []

    compare = Compare()
    veredict = 1 if class_name == "plagiado" else 0
    data = []

    for carpeta_id in sorted(os.listdir(ruta_clase)):
        ruta_carpeta = os.path.join(ruta_clase, carpeta_id)
        if not os.path.isdir(ruta_carpeta):
            continue

        # Only regular files can be opened as documents; nested folders are ignored.
        candidatos = (os.path.join(ruta_carpeta, archivo) for archivo in os.listdir(ruta_carpeta))
        archivos = sorted(ruta for ruta in candidatos if os.path.isfile(ruta))

        if len(archivos) < 2:
            print(f'Se omite {ruta_carpeta}: se necesitan al menos 2 archivos')
            continue

        doc_1 = Document(archivos[0])
        doc_2 = Document(archivos[1])
        doc_1_mkv = Markov_Chain(archivos[0])
        doc_2_mkv = Markov_Chain(archivos[1])

        vector_1, vector_2 = compare.get_doc_2_vec(doc_1.tokens, doc_2.tokens)
        cos_vector = compare.cosine_similarity_vector(vector_1, vector_2)
        cos_markov = compare.cosine_similarity_matrix(doc_1_mkv.markov_chain, doc_2_mkv.markov_chain)
        euc = compare.euclidean_similarity(vector_1, vector_2)
        mht = compare.manhattan_similarity(vector_1, vector_2)
        jac = compare.jaccard_similarity(doc_1.tokens, doc_2.tokens)
        snl = compare.space_new_line_similarity(doc_1.raw_text, doc_2.raw_text)
        bs = compare.calculate_brace_similarity(doc_1.raw_text, doc_2.raw_text)
        cs = compare.jaccard_similarity(doc_1.token_comments, doc_2.token_comments)
        style_similarity = compare.code_style_similarity(bs, cs, snl)

        # Both scores are normalised by the token count of the second document;
        # max(..., 1) avoids a division by zero when it has no tokens.
        doc_2_len = max(len(doc_2.tokens), 1)
        lcs_score = compare.lcs(doc_1.tokens, doc_2.tokens) / doc_2_len
        levenshtein_score = compare.levenshtein_distance(doc_1.tokens, doc_2.tokens) / doc_2_len

        data.append({
            "Doc2Vec": cos_vector,
            "Markov": cos_markov,
            "Euclidean" : euc,
            "Manhattan": mht,
            "Jaccard": jac,
            "Space_NewLine": snl,
            "BraceSimilarity": bs,
            "CommentSimilarity": cs,
            "CodeStyleSimilarity": style_similarity,
            "LCS_Score": lcs_score,
            "LevenshteinScore": levenshtein_score,
            "Veredict": veredict })

    return data

In [29]:
import csv
def generate_table(data: list[dict[str, float]], output_name: str) -> None:
  """
  Write the metric rows to ``<output_name>.csv`` and print a confirmation.

  The file starts with a header row and uses a fixed column order.

  Parameters:
  data (list of dict): One dictionary per row, keyed by the column names.
  output_name (str): Output path without the ``.csv`` extension.

  Returns:
  None
  """
  column_names = (
    "Doc2Vec",
    "Markov",
    "Euclidean",
    "Manhattan",
    "Jaccard",
    "Space_NewLine",
    "BraceSimilarity",
    "CommentSimilarity",
    "CodeStyleSimilarity",
    "LCS_Score",
    "LevenshteinScore",
    "Veredict",
  )
  csv_file = f'{output_name}.csv'

  with open(csv_file, mode='w', newline='') as handle:
    table = csv.DictWriter(handle, fieldnames=column_names)
    table.writeheader()
    table.writerows(data)

  print(f'Archivo {csv_file} creado exitosamente.')

In [30]:
#------------------PLAYGROUND---------------
# Scratch cell: loads one original/copy pair from the "queries" set and tries
# individual metrics on it by hand.

compare = Compare()
doc_1 = Document("queries/plagiado/2/og_2.java")
doc_2 = Document("queries/plagiado/2/cp_2.java")

# Raw edit distance between the two token streams and the token count of the
# second document; neither value is used again in this cell.
new_metric = compare.levenshtein_distance(doc_1.tokens, doc_2.tokens)
doc_2_len = len(doc_2.tokens)

# Doc2Vec vectors inferred for both documents.
v1, v2 = compare.get_doc_2_vec(doc_1.tokens, doc_2.tokens)

# Show the inferred vector of the first document.
print(v1)

[-0.23487532  1.2888407   0.9773964   2.0828087  -0.5498208  -1.1947503
  0.60770047 -0.90992045  0.05750665 -0.6877957  -0.01889475  2.1811159
  1.0289016   0.11861807 -0.49091274 -0.059469    0.3286159   0.03399688
 -0.81135345  0.62422943 -0.12692899  2.0113661   0.4618638  -0.6277114
  0.0567116   0.48162276 -0.5948959  -0.2650232  -0.43622696 -0.23889601
 -1.2884675   0.20290206  1.0441239   0.68333125  1.7129894  -0.8224682
 -1.0937785  -0.85413337  0.81623137 -0.50794476 -0.12729672  1.3159592
  0.27488938  0.23508945 -0.43936047  1.016698   -1.0261252  -0.72687405
  0.5828255  -0.33677155 -0.09925272 -0.0795472   0.65745914  0.6303941
 -0.528853    0.05797065  0.3941228   2.0790372   1.3805856   0.33475447]


In [37]:
# Choose which data split to process by indexing into `splits`.
splits = ["train", "val", "test", "queries"]
curr_split = splits[3]
# "queries" is read from ./queries; every other split from data_set_splitted/<split>.
dir_name = f"data_set_splitted/{curr_split}" if curr_split != "queries" else curr_split

# Metric rows for both classes; "plagiado" rows come first in the combined list.
data1 = table_generator("plagiado", dir_name)
data2 = table_generator("no_plagiado", dir_name)
data = data1 + data2

In [38]:
# Dump the computed rows and write them to "<curr_split>.csv".
print(data)
generate_table(data, curr_split)

[{'Doc2Vec': 0.9842, 'Markov': 1.0, 'Euclidean': 0.7779753923815512, 'Manhattan': 4.810112617909908, 'Jaccard': 0.9047619047619048, 'Space_NewLine': 0.9358974358974359, 'BraceSimilarity': 0.3333333333333333, 'CommentSimilarity': 0.21428571428571427, 'CodeStyleSimilarity': 0.4945054945054945, 'LCS_Score': 0.9838709677419355, 'LevenshteinScore': 0.016129032258064516, 'Veredict': 1}, {'Doc2Vec': 0.9861, 'Markov': 1.0, 'Euclidean': 1.1264646905563314, 'Manhattan': 6.6635792795568705, 'Jaccard': 0.9393939393939394, 'Space_NewLine': 0.9952380952380953, 'BraceSimilarity': 0.13333333333333333, 'CommentSimilarity': 0.17647058823529413, 'CodeStyleSimilarity': 0.4350140056022409, 'LCS_Score': 0.9941176470588236, 'LevenshteinScore': 0.0058823529411764705, 'Veredict': 1}, {'Doc2Vec': 0.6083, 'Markov': 0.2987, 'Euclidean': 6.8364974658692885, 'Manhattan': 40.08920247852802, 'Jaccard': 0.72, 'Space_NewLine': 0.3950762016412661, 'BraceSimilarity': 0.045, 'CommentSimilarity': 0.0, 'CodeStyleSimilarity'