In [2]:
import json
from datasets import Dataset, load_from_disk

from tqdm import tqdm
import pickle as pkl
import torch
from torch.optim import AdamW
from torch.utils.data import DataLoader
from transformers import (
    AutoModelForSequenceClassification,
    get_linear_schedule_with_warmup,
    BitsAndBytesConfig,
    LlamaForCausalLM,
    LlamaTokenizer,
    AutoModelForCausalLM,
    AutoTokenizer
)
import re
import argparse

  from .autonotebook import tqdm as notebook_tqdm


In [109]:
# Load the training records from JSON. The encoding is passed explicitly so the
# read does not depend on the platform's locale default (JSON text is UTF-8).
with open('qa_text_scienceq_train_all.json', 'r', encoding='utf-8') as f:
    train = json.load(f)

In [110]:
# Show the first training record to check which fields are present.
train[0]

{'image': None,
 'question': 'Which tense does the sentence use?\nMona will print her name with care.',
 'choices': ['present tense', 'future tense', 'past tense'],
 'answer': 'B',
 'hint': '',
 'task': 'closed choice',
 'grade': 'grade2',
 'subject': 'language science',
 'topic': 'verbs',
 'category': 'Verb tense',
 'skill': 'Is the sentence in the past, present, or future tense?',
 'lecture': 'Present tense verbs tell you about something that is happening now.\nMost present-tense verbs are regular. They have no ending, or they end in -s or -es.\nTwo verbs are irregular in the present tense, to be and to have. You must remember their forms.\nPast tense verbs tell you about something that has already happened.\nMost past-tense verbs are regular. They end in -ed.\nSome verbs are irregular in the past tense. You must remember their past-tense forms.\nFuture tense verbs tell you about something that is going to happen.\nAll future-tense verbs use the word will.\nPresent | Past | Future\nw

In [111]:
# Load the validation records from JSON. The encoding is passed explicitly so
# the read does not depend on the platform's locale default (JSON text is UTF-8).
with open('qa_text_scienceq_validation_all.json', 'r', encoding='utf-8') as f:
    val = json.load(f)

In [112]:
# Show the first validation record to check which fields are present.
val[0]

{'image': None,
 'question': "What does the verbal irony in this text suggest?\nAccording to Mr. Herrera's kids, his snoring is as quiet as a jackhammer.",
 'choices': ['The snoring is loud.', 'The snoring occurs in bursts.'],
 'answer': 'A',
 'hint': '',
 'task': 'closed choice',
 'grade': 'grade8',
 'subject': 'language science',
 'topic': 'figurative-language',
 'category': 'Literary devices',
 'skill': 'Interpret figures of speech',
 'lecture': 'Figures of speech are words or phrases that use language in a nonliteral or unusual way. They can make writing more expressive.\nVerbal irony involves saying one thing but implying something very different. People often use verbal irony when they are being sarcastic.\nOlivia seems thrilled that her car keeps breaking down.\nEach breakdown is as enjoyable as a punch to the face.',
 'solution': "The text uses verbal irony, which involves saying one thing but implying something very different.\nAs quiet as a jackhammer suggests that the snorin

In [113]:
def combine_keys(json_data, key1, key2, new_key, sep=" "):
    """Add a field to each record that joins two of its existing fields.

    For every item that contains both ``key1`` and ``key2``, stores
    ``f"{item[key1]}{sep}{item[key2]}"`` under ``new_key``. Items that lack
    either key are left unchanged (no ``new_key`` is added to them).

    The records are modified in place and the same ``json_data`` object is
    returned, so the caller's original list also sees the new field.

    Args:
        json_data: Iterable of dict-like records.
        key1: Key whose value forms the first part of the combined string.
        key2: Key whose value forms the second part of the combined string.
        new_key: Key under which the combined string is stored.
        sep: String placed between the two values. Defaults to a single
            space, which matches the previous hard-coded behaviour.

    Returns:
        The ``json_data`` object that was passed in.
    """
    for item in json_data:
        if key1 in item and key2 in item:
            item[new_key] = f"{item[key1]}{sep}{item[key2]}"
    return json_data

In [114]:
# Build the 'text' field from 'instruction' + 'output' for each training record.
# combine_keys edits the records in place and returns the same list, so
# new_train and train refer to the same object after this cell.
new_train = combine_keys(
    json_data=train,
    key1='instruction',
    key2='output',
    new_key='text',
)

In [115]:
# Build the 'text' field from 'instruction' + 'output' for each validation record.
# combine_keys edits the records in place and returns the same list, so
# new_val and val refer to the same object after this cell.
new_val = combine_keys(
    json_data=val,
    key1='instruction',
    key2='output',
    new_key='text',
)

In [116]:
# val was modified in place by combine_keys above, so this record already
# carries the combined 'text' field when both source keys are present.
val[1]


{'image': None,
 'question': 'Is this a sentence fragment?\nDuring the construction of Mount Rushmore, approximately eight hundred million pounds of rock from the mountain to create the monument.',
 'choices': ['no', 'yes'],
 'answer': 'B',
 'hint': '',
 'task': 'yes or no',
 'grade': 'grade12',
 'subject': 'language science',
 'topic': 'writing-strategies',
 'category': 'Sentences, fragments, and run-ons',
 'skill': 'Identify sentence fragments',
 'lecture': "A sentence is a group of words that expresses a complete thought.\nThe band I'm in has been rehearsing daily because we have a concert in two weeks.\nA sentence fragment is a group of words that does not express a complete thought.\nRehearsing daily because we have a concert in two weeks.\nThis fragment is missing a subject. It doesn't tell who is rehearsing.\nThe band I'm in.\nThis fragment is missing a verb. It doesn't tell what the band I'm in is doing.\nBecause we have a concert in two weeks.\nThis fragment is missing an inde

In [117]:
# new_val is the list returned by combine_keys(val, ...), which returns its
# input, so this is the same record object as val[1].
new_val[1]

{'image': None,
 'question': 'Is this a sentence fragment?\nDuring the construction of Mount Rushmore, approximately eight hundred million pounds of rock from the mountain to create the monument.',
 'choices': ['no', 'yes'],
 'answer': 'B',
 'hint': '',
 'task': 'yes or no',
 'grade': 'grade12',
 'subject': 'language science',
 'topic': 'writing-strategies',
 'category': 'Sentences, fragments, and run-ons',
 'skill': 'Identify sentence fragments',
 'lecture': "A sentence is a group of words that expresses a complete thought.\nThe band I'm in has been rehearsing daily because we have a concert in two weeks.\nA sentence fragment is a group of words that does not express a complete thought.\nRehearsing daily because we have a concert in two weeks.\nThis fragment is missing a subject. It doesn't tell who is rehearsing.\nThe band I'm in.\nThis fragment is missing a verb. It doesn't tell what the band I'm in is doing.\nBecause we have a concert in two weeks.\nThis fragment is missing an inde

In [118]:
# List the fields of the first training record after combining; 'text' should
# now appear alongside 'instruction' and 'output'.
new_train[0].keys()

dict_keys(['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution', 'input', 'instruction', 'output', 'text'])

In [119]:
# Convert the list of training record dicts into a `datasets.Dataset`.
# NOTE(review): per the datasets docs, column names are taken from the first
# record — confirm every record has the same keys (combine_keys skips records
# that lack 'instruction' or 'output').
train_dataset= Dataset.from_list(new_train)

In [120]:
# Display the training Dataset summary (column names and row count).
train_dataset

Dataset({
    features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution', 'input', 'instruction', 'output', 'text'],
    num_rows: 6508
})

In [121]:
# Convert the list of validation record dicts into a `datasets.Dataset`.
# NOTE(review): per the datasets docs, column names are taken from the first
# record — confirm every record has the same keys (combine_keys skips records
# that lack 'instruction' or 'output').
val_dataset = Dataset.from_list(new_val)

In [122]:
# Display the validation Dataset summary (column names and row count).
val_dataset

Dataset({
    features: ['image', 'question', 'choices', 'answer', 'hint', 'task', 'grade', 'subject', 'topic', 'category', 'skill', 'lecture', 'solution', 'input', 'instruction', 'output', 'text'],
    num_rows: 2144
})

In [123]:
# Write the training Dataset to a directory relative to the working directory;
# later cells reload it from this path with load_from_disk.
train_dataset.save_to_disk("qa_text_scienceq_train_IF.hf")

Saving the dataset (1/1 shards): 100%|██████████| 6508/6508 [00:00<00:00, 51566.23 examples/s]


In [124]:
# Write the validation Dataset to a directory relative to the working
# directory; a later cell reloads it from this path with load_from_disk.
val_dataset.save_to_disk("qa_text_scienceq_validation_IF.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2144/2144 [00:00<00:00, 41752.39 examples/s]


In [125]:
# test_dataset = Dataset.from_list(new_test)

In [126]:
# test_dataset = Dataset.from_list(new_test)

In [127]:
# test_dataset.save_to_disk("qa_mrpc_test_IF.hf")

In [128]:
# new_test=combine_keys(test,'instruction', 'output', 'text')

In [129]:
# with open('qa_mrpc_train_IF.json', 'w') as f:
#     json.dump(new_train, f)

In [130]:
# with open('qa_mrpc_test_IF.json', 'w') as f:
#     json.dump(new_test, f)

In [163]:
# Reload the saved training dataset from disk into `dataset`.
dataset = load_from_disk("qa_text_scienceq_train_IF.hf")

In [186]:
# NOTE(review): this overwrites `dataset` with data from an absolute,
# machine-specific path. The "-short" dataset in this notebook is written to a
# relative path by a later cell, so this may be loading a different (possibly
# stale) copy — confirm the intended source before relying on it.
dataset=load_from_disk('/nas02/Hadi/Model-Selection-IF/alphalora/datasets/qa_text_scienceq_train_IF-short.hf')

In [187]:
# Show the first record of the currently loaded dataset.
# NOTE(review): the captured output has 'question_concept' and 'answerKey'
# fields, which are not among the features of the train Dataset built earlier
# in this notebook — verify that the loaded dataset is the intended one.
dataset[0]

{'question': 'The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?',
 'question_concept': 'punishing',
 'choices': {'label': ['A', 'B', 'C', 'D', 'E'],
  'text': ['ignore', 'enforce', 'authoritarian', 'yell at', 'avoid']},
 'answerKey': 'A',
 'answer': 'A',
 'input': '',
 'instruction': 'Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?\nOptions: (A) ignore (B) enforce (C) authoritarian (D) yell at (E) avoid\n',
 'output': 'Answer: The answer is A.',
 'text': 'Question: The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?\nOptions: (A) ignore (B) enforce (C) authoritarian (D) yell at (E) avoid\n Answer: The answer is A.'}

In [54]:
# Reload the full training dataset into `dataset` (same call as the earlier
# In [163] cell), replacing whatever was loaded in between.
# NOTE(review): this cell's execution count (54) is lower than the cells above
# it (up to 187), so the notebook was not executed top to bottom — re-check
# with Restart Kernel -> Run All.
dataset = load_from_disk("qa_text_scienceq_train_IF.hf")

In [55]:
# Save the first 300 rows of the loaded dataset as a smaller "-short" copy.
# min() caps the range at the dataset length, so a dataset with fewer than
# 300 rows is saved whole instead of raising IndexError in select().
short_dataset = dataset.select(range(min(300, len(dataset))))
short_dataset.save_to_disk("qa_text_scienceq_train_IF-short.hf")

Saving the dataset (1/1 shards): 100%|██████████| 300/300 [00:00<00:00, 15816.42 examples/s]


In [56]:
# Load the saved validation dataset. This rebinds `dataset`, the same name that
# held the training data in the cells above.
dataset = load_from_disk("qa_text_scienceq_validation_IF.hf")

In [57]:
# Save the first 50 rows of the loaded dataset as a smaller "-short" copy.
# min() caps the range at the dataset length, so a dataset with fewer than
# 50 rows is saved whole instead of raising IndexError in select().
short_dataset = dataset.select(range(min(50, len(dataset))))
short_dataset.save_to_disk("qa_text_scienceq_validation_IF-short.hf")

Saving the dataset (1/1 shards): 100%|██████████| 50/50 [00:00<00:00, 4753.29 examples/s]
