In [1]:
!pip -q install git+https://github.com/huggingface/transformers # need to install from github
!pip -q install accelerate>=0.12.0
!pip install datasets

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m48.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
Collecting datasets
  Downloading datasets-2.14.4-py3-none-any.whl (519 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m519.3/519.3 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.8,>=0.3.0 (from datasets)
  Downloading dill-0.3.7-py3-none-any.whl (115 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m8.0 MB/s[0

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
%cd drive/MyDrive/en2sparql

Mounted at /content/drive
/content/drive/MyDrive/en2sparql


In [3]:
import torch
import json
from transformers import pipeline, AutoTokenizer
from datasets import load_dataset

# Tokenizer for the Dolly checkpoint, configured to pad on the left.
# NOTE(review): it is not passed to the pipeline below; in this cell it is only
# used to count prompt tokens before generation.
tokenizer = AutoTokenizer.from_pretrained("databricks/dolly-v2-3b", padding_side="left")

# Pipeline for the same checkpoint, loaded in bfloat16 with automatic device
# placement. trust_remote_code=True lets the model repository's own pipeline
# code run locally.
dolly = pipeline(model="databricks/dolly-v2-3b", torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
# SciQA dataset; the "train" and "test" splits are read by the helpers below.
raw_datasets = load_dataset("orkg/SciQA")
print(raw_datasets)


def divide_chunks(l_, n_):
    """Lazily split ``l_`` into consecutive slices of at most ``n_`` items.

    The last slice is shorter when ``len(l_)`` is not a multiple of ``n_``.
    """
    # Slice starts are 0, n_, 2 * n_, ... up to (but excluding) len(l_).
    yield from (l_[start:start + n_] for start in range(0, len(l_), n_))


def clean(st):
    """Flatten a query string to one line with spaced-out punctuation.

    Newlines become spaces, every ``?`` gets a space in front of it, braces
    are surrounded by spaces, backslash-escaped apostrophes are unescaped, and
    runs of spaces are collapsed to a single space. Leading and trailing
    spaces are not stripped.
    """
    # Applied in this order; each pair is (text to find, replacement).
    substitutions = (
        ("\n", " "),
        ("?", " ?"),
        ("{", " { "),
        ("}", " } "),
        ("\\'", "'"),
    )
    for old, new in substitutions:
        st = st.replace(old, new)

    # Repeat until no double space is left anywhere in the string.
    while st.find("  ") != -1:
        st = st.replace("  ", " ")
    return st


def get_key(q):
    """Return the grouping key ``"<number_of_patterns>-<template_id>"`` for ``q``.

    A missing or ``None`` ``template_id`` is rendered as the text ``"None"``.
    """
    template = q.get('template_id')
    template = "None" if template is None else template
    pattern_count = str(q.get("number_of_patterns"))
    return "-".join((pattern_count, template))


def get_keys(n_):
    """Collect up to ``n_`` example entries per key from the training split.

    Reads the module-level ``raw_datasets``. Every training row is grouped
    under ``get_key(row)``; an entry is ``[cleaned query, question, length of
    the cleaned query]``. Each group is ordered by ascending query length and
    only its first ``n_`` entries are kept.

    Returns a dict mapping key -> list of entries.
    """
    grouped = {}
    for row in raw_datasets.get("train"):
        key = get_key(row)
        sparql_text = clean(row["query"]["sparql"])
        entry = [sparql_text, row["question"]["string"], len(sparql_text)]
        grouped.setdefault(key, []).append(entry)

    # sorted() is stable, so equally long queries keep their dataset order.
    return {
        key: sorted(entries, key=lambda entry: entry[2])[:n_]
        for key, entries in grouped.items()
    }


def prepare_queries(n_):
    """Build one prompt string per row of the test split.

    For each test row, the examples returned by ``get_keys(n_)`` for the row's
    key are written as alternating "input (English text)" / "output (Sparql
    query)" lines, followed by the row's own question and an empty output
    line. When no examples exist for the key, a message is printed and a
    plain translation instruction is used instead.

    Returns the list of prompts, in test-split order.
    """
    examples_by_key = get_keys(n_)
    prompts = []
    for row in raw_datasets.get("test"):
        key = get_key(row)
        question = row["question"]["string"]
        examples = examples_by_key.get(key)
        if examples is None:
            print("Error with key", key)
            prompts.append("translate the following English text '" + question + "' to a sparql query")
            continue

        parts = []
        for example in examples:
            # example is [query, question, query length]
            parts.append("\n input (English text): " + example[1])
            parts.append("\n output (Sparql query): " + example[0])
        # Alternative ending, noted by the author as working better with gpt:
        #   "\n with this example what is the sparql query for:  " + question
        # The ending below is the one noted as working better with dolly.
        parts.append("\n input (English text): " + question)
        parts.append("\n output (Sparql query): ")
        prompts.append("".join(parts))
    return prompts


def save_json(filename,data):
    """Write ``data`` to ``filename`` as one line of JSON followed by a newline.

    The file is opened in write mode as UTF-8, so existing content is replaced.
    """
    with open(filename, "w", encoding="utf-8") as json_file:
        serialised = json.dumps(data)
        json_file.write(serialised + "\n")


# Number of example pairs placed in front of each test question.
shots = 4
# One prompt per row of the test split.
query_list = prepare_queries(shots)

print(len(query_list))

# Number of prompts sent to the pipeline per call.
n = 50
q_list = list(divide_chunks(query_list, n))
# Cleaned reference queries of the test split, stored next to the generations.
sparql = [clean(x["query"]["sparql"]) for x in raw_datasets.get("test")]

# Accumulators filled by the generation loop: generated texts and prompt token counts.
gs = []
lens =[]

# Progress counter, in percent of batches processed.
i = 0

# A prompt with more tokens than this aborts the run before generation.
MAX_PROMPT_TOKENS = 2048
# How many times a completion without "SELECT" is regenerated before being kept as-is.
MAX_RETRIES = 5

for group in q_list:
    # Progress through the batches, as a percentage.
    print(str(i) + "%", end="  ")
    i += 1/len(q_list)*100

    # Token count of every prompt in this batch; stored in the results file.
    res_ = [tokenizer.encode(question) for question in group]
    len_ = [len(x) for x in res_]
    warning = [x for x in len_ if x > MAX_PROMPT_TOKENS]
    if warning:
        # Raise instead of calling quit(): in a notebook kernel quit() gives no
        # explanation and is not a dependable way to stop the cell right here.
        raise ValueError(
            "prompt(s) exceed " + str(MAX_PROMPT_TOKENS) + " tokens: " + str(warning)
        )
    lens += len_

    res = dolly(group)
    print(res)
    gst = [x[0]["generated_text"] for x in res]

    # Regenerate, one prompt at a time, any completion that lacks "SELECT".
    # After MAX_RETRIES attempts the last completion is kept whatever it is.
    for ii in range(len(gst)):
        for iii in range(MAX_RETRIES):
            if "SELECT" in gst[ii]:
                break
            print(iii,ii)
            res = dolly(group[ii])
            gst[ii] = res[0]["generated_text"]
    gs += gst

    # Checkpoint after every batch: "questions" and "sparql" are complete,
    # while "generated_sparql" and "prompt_len" cover the batches done so far.
    result = {"questions": query_list, "sparql": sparql, "generated_sparql": gs, "prompt_len": lens}
    save_json("dolly_"+str(shots)+"_shot_results_tok.json", result)



Downloading (…)okenizer_config.json:   0%|          | 0.00/450 [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

Downloading (…)instruct_pipeline.py:   0%|          | 0.00/9.16k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/databricks/dolly-v2-3b:
- instruct_pipeline.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Downloading pytorch_model.bin:   0%|          | 0.00/5.68G [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.54k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/6.08k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/553k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'query_type', 'question', 'paraphrased_question', 'query', 'template_id', 'query_shape', 'query_class', 'auto_generated', 'number_of_patterns'],
        num_rows: 1795
    })
    validation: Dataset({
        features: ['id', 'query_type', 'question', 'paraphrased_question', 'query', 'template_id', 'query_shape', 'query_class', 'auto_generated', 'number_of_patterns'],
        num_rows: 257
    })
    test: Dataset({
        features: ['id', 'query_type', 'question', 'paraphrased_question', 'query', 'template_id', 'query_shape', 'query_class', 'auto_generated', 'number_of_patterns'],
        num_rows: 513
    })
})
Error with key 10-None
513
0%  [[{'generated_text': 'SELECT DISTINCT?model?model_lbl WHERE {?metric a orkgc:Metric; rdfs:label?metric_lbl. FILTER (str(?metric_lbl) = "Accuracy") { SELECT?model?model_lbl WHERE {?dataset a orkgc:Dataset; rdfs:label?dataset_lbl. FILTER (str(?dataset_lbl) = "Story Cloze Test")?benchmark 



0 15
1 15
2 15
3 15
0 19
0 22
1 22
0 26
0 29
0 30
1 30
0 32
0 34
1 34
2 34
3 34
4 34
0 35
1 35
2 35
3 35
0 36
1 36
0 37
0 41
0 44
9.090909090909092%  [[{'generated_text': 'The highest score for the metric "accessa-hits" achieved on the WSC dataset is 1000.00.\nThe highest score for the metric "accessa-page-views" achieved on the WSC dataset is 3000.00.'}], [{'generated_text': 'SELECT DISTINCT?paper?paper_lbl WHERE { \n   ?dataset a orkgc:Dataset; \n    rdfs:label?dataset_lbl. \n    FILTER (str(?dataset_lbl) = "Oxford-IIIT Pets") \n   ?benchmark orkgp:HAS_DATASET?dataset. \n   ?cont orkgp:HAS_BENCHMARK?benchmark. \n   ?paper orkgp:P31?cont; \n    rdfs:label?paper_lbl. \n}'}], [{'generated_text': 'SELECT DISTINCT?metric?metric_lbl (MAX(?value) AS?score) WHERE { { SELECT?metric?metric_lbl?value WHERE {?dataset a orkgc:Dataset; rdfs:label?dataset_lbl. FILTER (str(?dataset_lbl) = "WOS-46985")?benchmark orkgp:HAS_DATASET?dataset; orkgp:HAS_EVALUATION?eval.?eval orkgp:HAS_VALUE?value. OPTIONA