In [4]:
!pip install -q bitsandbytes datasets accelerate loralib
!pip install -q git+https://github.com/huggingface/transformers.git@main git+https://github.com/huggingface/peft.git

[0m

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# One Hub checkpoint id shared by the model and its tokenizer.
checkpoint = "EleutherAI/gpt-j-6b"

# Load the causal LM with load_in_8bit and an automatic device_map.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    device_map="auto",
    load_in_8bit=True,
    return_dict=True,
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json:   0%|          | 0.00/930 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/24.2G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/619 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.37M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/4.04k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/357 [00:00<?, ?B/s]

In [2]:
# Pad on the left side of each sequence.
# NOTE(review): presumably so that batched generation continues directly from
# the last real prompt token — confirm this is the intent.
tokenizer.padding_side="left"

In [3]:
from transformers import GenerationConfig, pipeline, set_seed

# Generation below samples (do_sample=True), so fix the RNG seed up front;
# without it a Restart & Run All produces different text every time.
SEED = 42
set_seed(SEED)

# Sampling settings used by the text-generation pipeline:
# up to 150 new tokens per prompt, top_p=1, and EOS reused as the pad token id.
config = GenerationConfig(
    do_sample=True,
    max_new_tokens=150,
    top_p=1,
    pad_token_id=tokenizer.eos_token_id,
)

In [4]:
# Options for the text-generation pipeline: the already-loaded model and
# tokenizer, batches of 8 prompts, and the sampling config defined earlier.
pipeline_options = dict(
    model=model,
    tokenizer=tokenizer,
    batch_size=8,
    generation_config=config,
)
pipe = pipeline("text-generation", **pipeline_options)

# Reuse the model's EOS token id as the pipeline tokenizer's padding id.
eos_id = model.config.eos_token_id
pipe.tokenizer.pad_token_id = eos_id

In [5]:
# One hand-written example prompt: an instruction line followed by Context,
# Question and an empty Answer section for the model to complete.
# NOTE(review): the question contains "crimle" (probably "crime"); it is left
# untouched because the string is model input — confirm before changing it.
inp_text ="""The following is task of context based long-form question answering. It has Context, Question and Answer. Answer is in long format.
Context:
Stat. § 14-32.1(a), does not make the definition an essential element of the crime pursuant to N.C. Gen. Stat. § 14-32.1(e). Therefore, we reject Defendant’s argument that it is not sufficient for the indictment to “merely state that the victim was ‘handicapped.’ ” Furthermore, the indictment provided Defendant with enough information to prepare a defense for the offense of felony assault on a handicapped person. See Leonard, _ N.C. App. at _, 711 S.E.2d at 873 (rejecting the defendant’s argument that the indictment was not sufficient because the indictment tracked the relevant language of the statute, listed “the essential elements of the offense[,]” and provided the defendant “with enough information to prepare a defense”); State v. Crisp, 126 N.C. App. 30, 36, 483 S.E.2d 462, 466 (<HOLDING>), appeal dismissed and disc. review denied, 346
Question:
Is it necessary for the definition of the crimle to be stated in the indictment according to N.C. Gen. Stat. § 14-32.1(a)?
Answer:
"""

# Run the pipeline on the single example; the cell output is a list holding
# one dict whose 'generated_text' begins with the prompt itself.
pipe(inp_text)

[{'generated_text': 'The following is task of context based long-form question answering. It has Context, Question and Answer. Answer is in long format.\nContext:\nStat. § 14-32.1(a), does not make the definition an essential element of the crime pursuant to N.C. Gen. Stat. § 14-32.1(e). Therefore, we reject Defendant’s argument that it is not sufficient for the indictment to “merely state that the victim was ‘handicapped.’ ” Furthermore, the indictment provided Defendant with enough information to prepare a defense for the offense of felony assault on a handicapped person. See Leonard, _ N.C. App. at _, 711 S.E.2d at 873 (rejecting the defendant’s argument that the indictment was not sufficient because the indictment tracked the relevant language of the statute, listed “the essential elements of the offense[,]” and provided the defendant “with enough information to prepare a defense”); State v. Crisp, 126 N.C. App. 30, 36, 483 S.E.2d 462, 466 (<HOLDING>), appeal dismissed and disc. re

In [7]:
import pandas as pd

# Read the test CSV into a DataFrame; the displayed frame has the columns
# Context, Question, Legal Reasoning and Answer.
test_csv_path = 'test_extracted.csv'
df = pd.read_csv(test_csv_path)

# Bare last expression so the notebook renders the frame.
df

Unnamed: 0,Context,Question,Legal Reasoning,Answer
0,and Statistical Manual of Mental Disorders 446...,Can the defendants force Dr. Nadel to provide ...,The issue at hand pertains to the compulsion o...,"No, according to the Graham v. Gielchinsky cas..."
1,"one of his trial counsel as a witness, so the ...",Did the Appellee's trial counsel have knowledg...,The question at hand pertains to the knowledge...,It is uncertain whether the Appellee's trial c...
2,Some of Cruikshank’s objections on appeal appe...,Can Cruikshank's objections on the trial court...,The issue at hand pertains to the admissibilit...,"No, Cruikshank's objections on the trial court..."
3,noting that the additional materials provided ...,Can the motion to strike filed by PFM be consi...,The question pertains to the categorization of...,"Yes, the motion to strike filed by PFM can be ..."
4,The State is within the powers reserved to it ...,Can the State refuse to enter into agreements ...,The question revolves around the State's power...,"Yes, the State has the power to refuse to ente..."
...,...,...,...,...
1492,The new biological opinion included an inciden...,Is the Consent Decree considered an injunction...,The question revolves around whether the Conse...,The classification of the Consent Decree as an...
1493,(a) Appealable Orders. (1) The following order...,"Can a party appeal a disposition, review, no r...",The question pertains to the appealability of ...,"Yes, a party can appeal a disposition, review,..."
1494,"criminal laws.” Spaziano v. Florida, 468 U.S. ...",Can the sentence imposed on Williams be consid...,The question pertains to the constitutionality...,"No, the sentence imposed on Williams is not un..."
1495,"F.Supp.3d 700, 703, No. 3:14CV200 (JMM), 2014 ...",Can an entity be held liable for unsolicited c...,The question pertains to the liability of an e...,"Yes, an entity can be held liable for unsolici..."


In [8]:
# Zero-shot prompt layout. The instruction line announces Context, Question and
# Answer sections, so every section gets its label — including "Context:",
# which the hand-written example prompt earlier in this notebook also uses.
PROMPT_TEMPLATE = """The following is task of context based long-form question answering. It has Context, Question and Answer. Answer is in long format.
Context:
{context}
Question:
{question}
Answer:
"""


def build_prompt(row):
    """Render the zero-shot prompt for one row of the test frame.

    Parameters
    ----------
    row : mapping with 'Context' and 'Question' entries (e.g. a DataFrame row).

    Returns
    -------
    str
        The instruction line, the labelled context and question, and a
        trailing empty "Answer:" section for the model to complete.
    """
    return PROMPT_TEMPLATE.format(context=row['Context'], question=row['Question'])


# One prompt per row, stored next to the source columns.
df['Prompt'] = df.apply(build_prompt, axis=1)

# Plain-list view of the same prompts.
prompts = df['Prompt'].to_list()

In [9]:
from datasets import Dataset

# Wrap just the Prompt column in a Dataset; the other columns are not passed on.
prompt_only = df[['Prompt']]
test_dataset = Dataset.from_pandas(prompt_only)

In [10]:
# Display the dataset summary (its features and row count).
test_dataset

Dataset({
    features: ['Prompt'],
    num_rows: 1497
})

In [11]:
from transformers.pipelines.pt_utils import KeyDataset

# Where the generations are written, and how often partial results are saved.
OUTPUT_CSV = 'gptj-zeroshot_test_generated.csv'
CHECKPOINT_EVERY = 50  # rows between intermediate saves

# Stream the prompts through the pipeline and store each generation on the
# matching row of `df`. The pipeline yields results in dataset order, so the
# i-th result belongs to the i-th row.
i = 0  # number of prompts processed so far (stays 0 if the dataset is empty)
try:
    for i, out in enumerate(pipe(KeyDataset(test_dataset, "Prompt")), start=1):
        # Address the row by position so this also works when `df` does not
        # carry the default 0..n-1 index.
        df.loc[df.index[i - 1], 'Generated Text'] = out[0]['generated_text']

        # Periodic checkpoint instead of rewriting the whole CSV on every row.
        if i % CHECKPOINT_EVERY == 0:
            df.to_csv(OUTPUT_CSV, index=False)

        # Lightweight in-place progress counter.
        print(i, end='\r')
finally:
    # Always persist what was generated, including after an interruption or
    # an error part-way through the run.
    df.to_csv(OUTPUT_CSV, index=False)

1497