In [1]:
# Install sentence-transformers; pip's stdout is redirected to a file named `log`
# so the cell output stays short.
!pip install sentence_transformers > log

In [2]:
import numpy as np
import json
import os
import time
import torch
from typing import List

import nltk
from nltk.translate.bleu_score import sentence_bleu
from sentence_transformers import SentenceTransformer, util
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances
from transformers import AutoTokenizer, AutoModelWithLMHead, SummarizationPipeline, AutoConfig, T5Tokenizer, T5ForConditionalGeneration

In [3]:
# Fetch the NLTK resources used by this notebook at runtime.
# NOTE(review): 'punkt' presumably backs nltk.word_tokenize and 'perluniprops' is
# presumably what NISTTokenizer (used in the BLEU evaluation cells) needs — confirm.
nltk.download('punkt')
nltk.download('perluniprops')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package perluniprops to /root/nltk_data...
[nltk_data]   Unzipping misc/perluniprops.zip.


True

In [3]:
# Colab only: mount Google Drive so the dataset files read later from
# /content/drive/MyDrive are accessible.
from google.colab import drive
drive.mount('/content/drive/')

In [4]:
# Pretrained CodeTrans T5 checkpoint for commit-message generation; the
# notebook only uses its encoder (see make_embedding) to embed diffs.
MODEL_NAME = "SEBIS/code_trans_t5_base_commit_generation"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# AutoModelWithLMHead is deprecated in transformers. For a T5 checkpoint it
# resolves to T5ForConditionalGeneration, so load that class directly.
model = T5ForConditionalGeneration.from_pretrained(MODEL_NAME)

device = "cuda:0" if torch.cuda.is_available() else "cpu"
model.to(device)
model.eval()  # inference only: make sure dropout stays disabled

# Inputs longer than this many tokens are truncated by the tokenizer.
max_source_length = 512

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/43.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/617 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/797k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

In [7]:
def make_embedding(input_sequences):
  """
  Embed git diffs with the T5 encoder.

  Each sequence is tokenized (truncated to ``max_source_length`` tokens and
  padded to the longest sequence of the batch), passed through
  ``model.encoder`` and mean-pooled over its real (non-padding) tokens.

  Args:
    input_sequences: a string or a list of strings to embed.

  Returns:
    A ``torch.Tensor`` on ``device`` with one row per input sequence; the row
    width is the encoder's hidden size. The tensor carries no autograd graph.
  """

  encoding = tokenizer(
      input_sequences,
      padding="longest",
      max_length=max_source_length,
      truncation=True,
      return_tensors="pt",
  ).to(device)

  # Inference only: skip building the autograd graph to save memory.
  with torch.no_grad():
    output = model.encoder(
        input_ids=encoding.input_ids,
        attention_mask=encoding.attention_mask,
    )

  token_states = output.last_hidden_state

  # Average over real tokens only. A plain mean over dim=1 also averages the
  # padding positions, which makes an embedding depend on how long the other
  # sequences in the same batch happen to be.
  mask = encoding.attention_mask.unsqueeze(-1).to(token_states.dtype)
  summed = (token_states * mask).sum(dim=1)
  token_counts = mask.sum(dim=1).clamp(min=1)  # avoid 0/0 for an all-padding row

  return summed / token_counts

In [None]:
def batch(iterable, n=1):
    """Lazily yield consecutive slices of ``iterable`` holding at most ``n`` items.

    The last slice is shorter when ``len(iterable)`` is not a multiple of ``n``.
    """
    total = len(iterable)
    yield from (
        iterable[start:min(start + n, total)]
        for start in range(0, total, n)
    )

def load_data(path: str) -> List[str]:
    """Read a UTF-8 text file and return its lines, stripped of surrounding whitespace.

    The content is split on newline characters only. Blank lines inside the
    file are preserved (as empty strings) so that parallel files stay
    index-aligned.

    Args:
        path: path of the file to read.

    Returns:
        One stripped string per line of the file; ``[]`` for an empty file.
    """
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    # A file ending with a newline yields one blank trailing element; drop only
    # that. Slicing off the last element unconditionally would lose the final
    # record of a file that has no trailing newline.
    if not lines[-1].strip():
        lines.pop()
    return [line.strip() for line in lines]


def find_mixed_nn(simi, diffs, test_diff, bleu_thre: int = 5) -> tuple:
    """Pick a training neighbour by similarity, then re-rank the top ones by BLEU.

    The ``bleu_thre`` highest-scoring positions of ``simi`` are taken as
    candidates (most similar first). Each candidate diff is compared with
    ``test_diff`` using ``sentence_bleu`` on whitespace-split tokens, and the
    candidate with the highest BLEU wins; ties go to the more similar one.

    Args:
        simi: similarities between the test diff and every training diff; must
            support ``argsort`` (expected to be a 1-D array).
        diffs: training diffs, indexable by the positions of ``simi``.
        test_diff: the diff to find a neighbour for.
        bleu_thre: number of most-similar candidates to re-rank.

    Returns:
        ``(max_idx, msg)``: the index of the selected training diff, and
        ``msg``, which is always ``None`` (kept so the return shape is unchanged).
    """
    candidates = simi.argsort()[-bleu_thre:][::-1]
    # Fall back to the most similar candidate. BLEU can be exactly 0 for every
    # candidate, and defaulting to index 0 would then return an unrelated
    # training example instead of the nearest neighbour.
    max_idx = candidates[0] if len(candidates) > 0 else 0
    max_score = 0

    for j in candidates:
        score = sentence_bleu([diffs[j].split()], test_diff.split())
        if score > max_score:
            max_score = score
            max_idx = j
    msg = None
    return max_idx, msg


def find_nn(simi) -> int:
    """Return the position of the largest similarity, i.e. the last entry of ``simi.argsort()``."""
    return simi.argsort()[-1]

def _embed_in_batches(texts: List[str], batch_size: int) -> np.ndarray:
    """Embed every text with make_embedding and return one row per text."""
    chunks = [
        make_embedding(chunk).cpu().detach().numpy()
        for chunk in batch(texts, batch_size)
    ]
    # Concatenate along the sample axis so a shorter final batch is kept;
    # stacking equal-sized batches would force the trailing samples to be dropped.
    return np.concatenate(chunks, axis=0)


def nngen(train_diffs: List[str], train_msgs: List[str], test_diffs: List[str], bleu_thre=5, batch_size: int = 8) -> List[str]:
    """Predict a commit message for each test diff by nearest-neighbour retrieval.

    Every diff is embedded with ``make_embedding``; for each test diff the
    training diff with the highest cosine similarity is found with ``find_nn``
    and its message is returned as the prediction.

    Args:
        train_diffs: training diffs.
        train_msgs: commit messages, index-aligned with ``train_diffs``.
        test_diffs: diffs to predict messages for.
        bleu_thre: currently unused (retrieval uses ``find_nn`` only); kept so
            existing callers keep working.
        batch_size: number of diffs embedded per forward pass.

    Returns:
        One predicted message per entry of ``test_diffs``, in the same order.

    Raises:
        ValueError: if ``train_diffs`` is empty while ``test_diffs`` is not.
    """
    if len(test_diffs) == 0:
        return []
    if len(train_diffs) == 0:
        raise ValueError("train_diffs must not be empty")

    train_matrix = _embed_in_batches(train_diffs, batch_size)
    test_matrix = _embed_in_batches(test_diffs, batch_size)

    # NOTE(review): presumably a shortcut to reuse embeddings saved on Drive — confirm.
    # train_matrix = np.load('/content/drive/MyDrive/train_mx_22112.npy')
    # test_matrix = np.load('/content/drive/MyDrive/test_mx_2520.npy')

    # Row i holds the similarity of test diff i to every training diff.
    similarities = cosine_similarity(test_matrix, train_matrix)

    return [train_msgs[find_nn(test_simi)] for test_simi in similarities]


# Upper bounds on how many lines are kept from each split.
train_limit, test_limit = 22112, 2520

# Training split: line i of the .diff file is assumed to pair with line i of the .msg file.
diffs_train = load_data("/content/drive/MyDrive/cleaned.train.diff")[:train_limit]
target_train = load_data("/content/drive/MyDrive/cleaned.train.msg")[:train_limit]

# Test split, loaded the same way.
diffs_test = load_data("/content/drive/MyDrive/cleaned.test.diff")[:test_limit]
target_test = load_data("/content/drive/MyDrive/cleaned.test.msg")[:test_limit]

# For every test diff, retrieve the message of its nearest training diff.
pred_test = nngen(diffs_train, target_train, diffs_test)

In [None]:
import string
from nltk import word_tokenize
from nltk.tokenize.nist import NISTTokenizer

nist = NISTTokenizer()

# Score the retrieved messages (pred_test) against the references with
# sentence-level BLEU computed on lower-cased NIST tokens.
weights = (0.25,0.25,0.25,0.25,)  # default; recomputed for every pair below
arr = []  # per-pair BLEU scores
a = []    # (prediction, reference) pairs
c = 0     # not used in this cell; kept in case other cells read it
if len(target_test) != len(pred_test):
  # zip() stops at the shorter list, so surplus items would be skipped silently.
  print("warning: %d targets vs %d predictions; surplus items are ignored"
        % (len(target_test), len(pred_test)))
for target, pred in zip(target_test, pred_test):
  # Tokenize each side exactly once.
  reference = [t.lower() for t in nist.tokenize(target)]
  hypothesis = [t.lower() for t in nist.tokenize(pred)]
  # Uniform weights over n-gram orders 1..k, where k is the reference length
  # capped to the range [1, 4], so short references are not scored with
  # n-gram orders they cannot contain.
  order = min(max(len(reference), 1), 4)
  weights = (1.0 / order,) * order
  score = sentence_bleu([reference], hypothesis, weights=weights)
  a.append((pred, target))
  arr.append(score)
# An empty evaluation set would otherwise raise ZeroDivisionError.
print("mean:" , sum(arr) / len(arr) if arr else float("nan"))

In [None]:
# Second set of predictions to evaluate, read from a file in the current
# working directory (relative path) and capped at test_limit lines.
# NOTE(review): presumably the output of the original NNGen baseline — confirm.
nn_pred = load_data("nngen.cleaned.test.msg")[:test_limit]

In [None]:
# NNGen

import string
from nltk import word_tokenize
from nltk.tokenize.nist import NISTTokenizer

nist = NISTTokenizer()


# Score the predictions in nn_pred against the references with sentence-level
# BLEU computed on lower-cased NIST tokens.
weights = (0.25,0.25,0.25,0.25,)  # default; recomputed for every pair below
arr = []  # per-pair BLEU scores
# NOTE(review): `b` is never populated in this cell, unlike `a` in the cell that
# evaluates pred_test — confirm whether (pred, target) pairs should be recorded.
b = []
c = 0     # not used in this cell; kept in case other cells read it
if len(target_test) != len(nn_pred):
  # zip() stops at the shorter list, so surplus items would be skipped silently.
  print("warning: %d targets vs %d predictions; surplus items are ignored"
        % (len(target_test), len(nn_pred)))
for target, pred in zip(target_test, nn_pred):
  # Tokenize each side exactly once.
  reference = [t.lower() for t in nist.tokenize(target)]
  hypothesis = [t.lower() for t in nist.tokenize(pred)]
  # Uniform weights over n-gram orders 1..k, where k is the reference length
  # capped to the range [1, 4], so short references are not scored with
  # n-gram orders they cannot contain.
  order = min(max(len(reference), 1), 4)
  weights = (1.0 / order,) * order
  score = sentence_bleu([reference], hypothesis, weights=weights)
  arr.append(score)
# An empty evaluation set would otherwise raise ZeroDivisionError.
print("mean:" , sum(arr) / len(arr) if arr else float("nan"))