In [None]:
# Mount Google Drive at /content/drive (a forced remount is requested).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
import sys

address = 'CDG-for-jupyter-notebooks/LLMSamples'
sys.path.append(f'content/drive/MyDrive/{address}')

%cd /content/drive/MyDrive/$address
%pwd

/content/drive/MyDrive/CDG-for-jupyter-notebooks/LLMSamples


'/content/drive/MyDrive/CDG-for-jupyter-notebooks/LLMSamples'

In [None]:
!pip install transformers
!pip install accelerate
!pip install git+https://github.com/ISE-Code-Documentation-Generators/utility.git
!pip install git+https://github.com/ISE-Code-Documentation-Generators/data.git

Collecting accelerate
  Downloading accelerate-0.28.0-py3-none-any.whl (290 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m47.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m57.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.1.105 (from torch>=1.10.0->accelerate)
  Downloading nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Download the large English spaCy model. As the installer itself reports, the
# runtime may need a restart before the freshly installed package can be loaded.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
from typing import Dict
import os
import json

import torch, random, pandas as pd
from transformers import pipeline
from ise_cdg_utility.metrics import NLPMetricInterface, CodeMetric, get_metrics
from ise_cdg_data.tokenize import get_source_and_markdown_tokenizers

In [None]:
# Seed Python's `random` module so the random sampling of questions and
# templates below is repeatable from a fresh kernel.
random.seed(4444)

In [None]:
# Load the samples table. The code below reads its 'project_ID', 'markdown' and
# 'source' columns. The path is relative to the current working directory.
dataset_path = "samples_dataset.csv"
dataset = pd.read_csv(dataset_path)

def dataset_get_markdown(index: int, project_id=None):
    """Return the 'markdown' cell of one dataset row as a string.

    Args:
        index: Positional row number (``iloc``) within the rows considered.
        project_id: When not None, only rows whose 'project_ID' equals this
            value are considered and ``index`` counts within that subset.

    Returns:
        ``str()`` of the selected cell.
    """
    rows = dataset if project_id is None else dataset[dataset['project_ID'] == project_id]
    selected_row = rows.iloc[index]
    return str(selected_row['markdown'])

def dataset_get_source(index: int, project_id=None):
    """Return the 'source' cell of one dataset row as a string.

    Args:
        index: Positional row number (``iloc``) within the rows considered.
        project_id: When not None, only rows whose 'project_ID' equals this
            value are considered and ``index`` counts within that subset.

    Returns:
        ``str()`` of the selected cell.
    """
    rows = dataset if project_id is None else dataset[dataset['project_ID'] == project_id]
    selected_row = rows.iloc[index]
    return str(selected_row['source'])

In [None]:
class Pipeline:
    """Small fluent wrapper around a sequence offering map/reduce helpers."""

    def __init__(self, l):
        # The sequence is stored as given; no copy is made.
        self.l = l

    def to_map(self, f):
        """Apply ``f`` to every element and wrap the resulting list in a new Pipeline."""
        return Pipeline([f(element) for element in self.l])

    def to_reduce(self, f, initial=None):
        """Left-fold the elements with ``f``.

        ``initial`` is always the starting accumulator, including the default
        None, so ``f`` is first called as ``f(initial, first_element)``.
        """
        accumulator = initial
        for element in self.l:
            accumulator = f(accumulator, element)
        return accumulator

    def to_list(self):
        """Return the wrapped sequence itself."""
        return self.l

class Template:
    """One worked few-shot example (code followed by its summary) inside a prompt.

    Args:
        index: Zero-based position of this example within its prompt.
        row_index: Positional dataset row the example is read from.
        project_id: Optional project filter forwarded to the dataset lookups;
            when given, ``row_index`` counts within that project's rows.
    """

    def __init__(self, index: int, row_index: int, project_id=None):
        self.index = index
        self.row_index = row_index
        self.project_id = project_id

    def representative_index(self):
        """Return the one-based position of the example as a string."""
        return str(self.index + 1)

    def generate_prompt(self):
        """Render the example as ``#Code`` / ``#Summary`` text.

        The source code goes under ``#Code`` and the markdown under
        ``#Summary``, matching the layout of the question that Sample appends
        after the examples.
        """
        code = dataset_get_source(self.row_index, project_id=self.project_id)
        summary = dataset_get_markdown(self.row_index, project_id=self.project_id)
        return f"#Code\n{code}\n#Summary: {summary}\n"

class Sample:
    """A few-shot prompt: several worked examples followed by one open question.

    Args:
        template_indices: Dataset row positions used as worked examples, in
            the order they should appear in the prompt.
        question_index: Dataset row position of the snippet to be summarised.
        project_id: Optional project filter. It is passed to every Template
            and to the dataset lookups, so when it is given all indices count
            within that project's rows; when None they count within the whole
            dataset.
    """

    def __init__(self, template_indices, question_index, project_id=None):
        self.templates = [
            Template(position, row_index, project_id=project_id)
            for position, row_index in enumerate(template_indices)
        ]
        self.question_index = question_index
        self.project_id = project_id

    def generate_prompt(self):
        """Return the instruction, the worked examples and the open question as one string."""
        examples = "".join(template.generate_prompt() for template in self.templates)
        question_code = dataset_get_source(self.question_index, project_id=self.project_id)
        return ("You are an expert Python programmer, please describe the functionality of the method:\n"
                + examples
                + f"\n#Code\n{question_code}\n#Summary:"
               )

    def get_ground_truth(self):
        """Return the dataset markdown of the question row (the reference summary)."""
        return dataset_get_markdown(self.question_index, project_id=self.project_id)

def generate_prompt_data(samples):
    """Build the prompts and the matching ground-truth texts for ``samples``.

    Returns:
        A ``(prompts, ground_truths)`` pair of lists, both in the order of
        ``samples``. All prompts are generated before any ground truth is read.
    """
    prompts = [sample.generate_prompt() for sample in samples]
    ground_truths = [sample.get_ground_truth() for sample in samples]
    return prompts, ground_truths

In [None]:
# Experiment sizes. `shot_size` is read by get_samples, where it caps how many
# few-shot template rows are drawn per project. `sample_size` is not referenced
# anywhere else in the visible notebook.
sample_size = 10
shot_size = 5

def get_samples():
    """Build one few-shot Sample per project in the dataset.

    For every distinct 'project_ID' one row of that project is picked at
    random as the question. Template rows are drawn at random from the whole
    dataset, at most ``min(shot_size, rows in the project)`` of them, and the
    question row is removed from that draw if it happens to be in it.

    All indices handed to Sample are positions in the whole dataset, because
    Sample is created without a project_id and therefore resolves them
    against the unfiltered table.

    Returns:
        A list of Sample objects, one per project that has at least one row.
    """
    df = dataset
    project_ids = df['project_ID']
    samples = []
    for pid in project_ids.unique():
        # Positions, within the whole dataset, of the rows belonging to this project.
        project_rows = [pos for pos, is_member in enumerate(project_ids == pid) if is_member]
        if not project_rows:
            # e.g. a missing project id, which never compares equal to itself.
            continue
        # Draw a position inside the project, then translate it to a dataset-wide
        # position so it addresses the intended row and is comparable with the
        # dataset-wide template positions below.
        qid = project_rows[random.sample(range(0, len(project_rows)), 1)[0]]
        templates = [
            ind
            for ind in random.sample(range(0, df.shape[0]), min(shot_size, len(project_rows)))
            if ind != qid
        ]
        samples.append(Sample(template_indices=templates, question_index=qid))
    return samples

# Build the prompts and their reference summaries once; the tokenization and
# generation cells below iterate over these two lists.
prompt_list, ground_truth = generate_prompt_data(get_samples())

In [None]:
# Load the databricks/dolly-v2-3b pipeline with bfloat16 weights.
# NOTE: trust_remote_code=True allows code shipped in the model repository to run
# locally (the download log shows instruct_pipeline.py being fetched); no revision
# is pinned, so that code can change between runs.
dolly = pipeline(model="databricks/dolly-v2-3b",
                            torch_dtype=torch.bfloat16,
                            trust_remote_code=True,
                            device_map="auto")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

instruct_pipeline.py:   0%|          | 0.00/9.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/databricks/dolly-v2-3b:
- instruct_pipeline.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
def prompt_model(input):
  """Run one prompt through the `dolly` pipeline and return its first generated text.

  Generation is limited to 100 new tokens.
  """
  generations = dolly(input, max_new_tokens=100)
  first_generation = generations[0]
  return first_generation["generated_text"]

In [None]:
# Metric objects keyed by CodeMetric, plus the markdown tokenizer. The first value
# returned by get_source_and_markdown_tokenizers is discarded because only markdown
# text is tokenized in this notebook.
metrics: Dict[CodeMetric, NLPMetricInterface] = get_metrics()
_, md_tokenizer = get_source_and_markdown_tokenizers(cleanse_markdown=False)

In [None]:
# Tokenize every ground-truth summary. Each one is wrapped in a single-element
# list, i.e. exactly one reference per sample.
references = [[md_tokenizer(gt)] for gt in ground_truth]

print(references)

[[['left', 'the', 'kids', 'downstairs', '.', 'parents', 'that', 'have', 'much', 'better', 'room', 'than', 'their', 'children', '.']], [['party', 'all', 'night', 'third', 'class', 'and', 'alone', 'and', 'spent', 'less', 'than', '5', 'dollars']], [['duffing', 'map']], [['what', 'a', 'coincidence', 'travelling', 'with', 'a', 'woman', 'and', 'she', "'s", 'not', 'his', 'wife', '.']], [['nothing', 'but', 'the', 'best', 'a', 'person', 'that', 'paid', 'an', 'unusually', 'high', 'price', '(', 'z', '-', 'score', ')', 'for', 'his', 'ticket', '.']], [['nanny', 'in', 'every', 'corner', 'travelling', 'first', 'class', 'and', 'have', 'at', 'least', 'one', 'nanny']], [['stole', 'the', 'ticket', 'spent', 'less', 'than', '5', 'dollars']], [['is', 'rich', 'travelling', 'first', 'class', 'and', 'alone', 'or', 'with', 'a', 'nanny', 'and', 'spent', 'more', 'than', '40', 'dollars']], [['party', 'all', 'night', 'third', 'class', 'and', 'alone', 'and', 'spent', 'less', 'than', '5', 'dollars']], [['who', 'the',

In [None]:
# Hand the same tokenized references to every metric before scoring.
for scorer in metrics.values():
    scorer.set_references(references)

In [None]:
# Generate a summary for each prompt and tokenize it with the same tokenizer
# that produced the references.
candidates = [md_tokenizer(prompt_model(prompt)) for prompt in prompt_list]

print(candidates)

[['the', 'area', 'of', 'a', 'circle', 'is', 'equal', 'to', 'pi*(radius)^2', '.', 'the', 'radius', 'is', 'the', 'value', 'of', 'the', 'diameter', 'divided', 'by', '2', '.', 'round', 'the', 'value', 'of', 'the', 'area', 'to', 'two', 'decimal', 'places', '.', 'the', 'area', 'of', 'a', 'circle', 'is', 'equal', 'to', 'pi', '*', '(', 'd/2)^2', '.'], ['please', 'note', 'that', 'this', 'example', 'uses', 'numpy', ',', 'the', 'python', 'library', 'for', 'numerical', 'computation', ',', 'as', 'well', 'as', 'pyplot', '.', 'also', ',', 'note', 'that', 'both', 'a', 'and', 'b', 'are', 'lists', ',', 'and', 'the', 'indexing', 'of', 'each', 'element', 'can', 'be', 'performed', 'with', 'the', 'enumerate', '(', ')', 'function', '.'], ['this', 'just', 'takes', 'the', 'mean', 'and', 'standard', 'deviation', 'of', 'a', 'dataset', 'and', 'scales', 'the', 'data', 'to', 'fall', 'in', 'that', 'range'], ['windmi', 'attractor', '\\begin{alignedat}{0', '}', '\\frac{dx}{dt', '}', '=', 'y\\', '\\', '\\frac{dy}{dt', 

In [None]:
# Score the candidates with every metric. Each reported value is converted to a
# plain float; the resulting dict is written out with json.dump further down.
results = {
    metric_name.value: {key: float(value) for key, value in metric(candidates).items()}
    for metric_name, metric in metrics.items()
}



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



In [None]:
# Display the collected metric scores.
results

{'bleu': {'bleu_1': 0.06927710771560669,
  'bleu_2': 0.020743494853377342,
  'bleu_3': 0.0,
  'bleu_4': 0.0},
 'rouge': {'rouge1_fmeasure': 0.0963231697678566,
  'rouge1_precision': 0.09426577389240265,
  'rouge1_recall': 0.14226017892360687,
  'rouge2_fmeasure': 0.022457709535956383,
  'rouge2_precision': 0.023563219234347343,
  'rouge2_recall': 0.03511904925107956,
  'rougeL_fmeasure': 0.0963231697678566,
  'rougeL_precision': 0.09426577389240265,
  'rougeL_recall': 0.14226017892360687,
  'rougeLsum_fmeasure': 0.0963231697678566,
  'rougeLsum_precision': 0.09426577389240265,
  'rougeLsum_recall': 0.14226017892360687},
 'bert': {'bert_score': 0.5274179577827454}}

In [None]:
# Save the metric scores as JSON in the current working directory.
with open('Dolly-result.json', 'w') as out_file:
    out_file.write(json.dumps(results))