In [5]:
from datasets import load_dataset

# Annotated comments file; every record is loaded into a single "train" split.
DATA_FILE = "../../data/daccord_yifei_v2_leo_supp_anti_oth.json"
dataset = load_dataset("json", data_files=DATA_FILE, split="train")

Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1266.40it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 214.26it/s]
Generating train split: 113 examples [00:00, 5788.99 examples/s]


In [6]:
# Number of examples loaded from the JSON file.
len(dataset)

113

In [7]:
# Inspect the first record to see which fields each example carries.
dataset[0]

{'id': 'f4acjs9',
 'body_cleaned': 'as a federal leo , the very idea of confiscating guns is laughable . i swore an oath to the constitution , not to beto or any other politician .',
 'User label': 'support gun',
 'author': 'MDeXY',
 'subreddit': 'progun',
 'predicted_community': 0,
 'score': 454,
 'created_utc': 1571492410}

In [8]:
from collections import Counter

# Class balance: how many examples carry each "User label" value.
label_counts = Counter(dataset["User label"])
label_counts

Counter({'other': 62, 'support gun': 27, 'anti gun': 24})

In [9]:
def convert_into_prompt_template(system_prompt, user_message, train=True, label_text=""):
    """Render one example as a single `[INST]` / `<<SYS>>` formatted prompt string.

    Args:
        system_prompt: Instruction text placed between `<<SYS>>` and `<</SYS>>`.
        user_message: The sentence to classify; inserted after "Sentence: ".
        train: When truthy, the label and a closing `</s>` are appended after
            `[/INST]`. When falsy, the string stops right after `[/INST] `
            so the answer slot is left open.
        label_text: Gold label to append; only used when `train` is truthy.

    Returns:
        The fully rendered prompt as a `str`.

    NOTE(review): the `<s>` / `</s>` markers are written as literal text. If the
    tokenizer used downstream also inserts BOS/EOS tokens they may end up
    duplicated — confirm against the training script.
    """
    # Both variants share everything up to and including the "[/INST] " marker.
    prompt = (
        f"<s>[INST] <<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
        f"Sentence: {user_message}\nUser label: [/INST] "
    )
    completion = f"{label_text} </s>" if train else ""
    return prompt + completion

In [10]:
# Seven-way stance + polarization prompt with a definition for every label.
# The trailing backslashes join the lines, so the prompt is one line of text.
# "Support Gun" means in favor of gun ownership and "Anti Gun" means opposed to
# it, matching the wording of sp_pro_anti_oth and the labels in the data
# (the two definitions were previously attached to the wrong labels).
system_prompt_with_definition = """You are a sentence sentiment polarity classification assistant about gun control. And here are definitions of labels: \
Support Gun: Explicitly in favor of individual gun ownership, or against gun bans and gun confiscation. \
Anti Gun: Explicitly opposes gun ownership or is in favor of legal policies such as banning guns and confiscating personal guns. \
Neutral: The statement is centered around the debate on gun control, but there is no clear opinion expressed. \
Not Relevant: Don't have any obvious relationship to guns. \
Not Sure: The sentence statements are describing gun support for owning / banning guns, but due to a lack of relevant context, or some other reason, we can sense the emotional inclination, but not the specific opinion or polarized aspect. \
And the sentences are considered as polarized if they are or about antagonizing statements / hostility / belittling / animosity: 'us vs them',  inter-group antagonism, radicalization, conflictive confrontation, and so on. \
the sentences are considered as non-polarized if they are or about constructive civic conversation, bring together to a common ground, peaceful dialogue, and so on. \
Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure'."""

# Label-only variant of the seven-way prompt: lists the categories without
# defining them. Adjacent literals are concatenated into one line of text.
system_prompt = (
    "You are a sentence sentiment polarity classification assistant about gun control. "
    "Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: "
    "'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure'."
)

# Three-way polarization prompt ('polarized' / 'unpolarized' / 'other').
# Each trailing backslash joins the next line directly onto this one, so every
# continued line must end with a space; otherwise sentences fuse together
# (e.g. "labels:Polarized", "so on.Unpolarized").
sp_pol_unpol_oth = """You are a sentence sentiment polarity classification assistant about gun control. And here are definitions of labels: \
Polarized: The sentences are considered as 'polarized' if they are or about antagonizing statements / hostility / belittling / animosity: 'us vs them', inter-group antagonism, radicalization, conflictive confrontation, \
closed to contradiction, trolling, affective, ideological, bad faith, with cognitive bias, with social or demographic bias, irony, sarcasm, hate speech, offensive, toxic, fake news, dismiss language, stereotypes and so on. \
Unpolarized: The sentences are considered as 'unpolarized' if they are or about constructive civic conversation, bring together to a common ground, peaceful dialogue, open-minded demeanour focused on learning and forming opinions, \
genuinely open to contradiction, open exchange of ideas, educational, humble, good faith and so on. \
Other: The sentences are considered as 'other' if they not polarized or unpolarized or hard to tell. \
Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'polarized', 'unpolarized' or 'other'."""

# Three-way stance prompt ('support gun' / 'anti gun' / 'other'). This is the
# prompt passed to map_dataset when building the splits that are pushed to the Hub.
# The trailing backslashes join the lines, so the prompt is one line of text.
# NOTE(review): the wording contains typos ("they are explicitly opposes",
# "relevent"). It is left untouched because the printed examples and the
# uploaded dataset were generated from this exact text — fix deliberately and
# regenerate the dataset if the wording should change.
sp_pro_anti_oth = """You are a sentence sentiment political tendency classification assistant about gun control. And here are definitions of labels: \
Support Gun: The sentences are considered as 'support gun' if they are explicitly in favor of individual gun ownership, or against gun bans and gun confiscation. \
Anti Gun: The sentences are considered as 'anti gun' if they are explicitly opposes gun ownership or are in favor of legal policies such as banning guns and confiscating personal guns. \
Other: The sentences are considered as 'other' if they are not 'support gun' or 'anti gun' or hard to tell or not relevent to gun control. \
Please classify the sentiment political tendency of the following sentence about gun support into one of the following categories: \
'support gun', 'anti gun' or 'other'."""


# Render the first record as a full training example (prompt + gold label)
# to sanity-check the template.
first_example = dataset[0]
complete_sentence = convert_into_prompt_template(
    sp_pro_anti_oth,
    first_example['body_cleaned'],
    train=True,
    label_text=first_example['User label'],
)

In [11]:
# Rough length check of the rendered example: counts space-separated chunks,
# which is not the same as the number of tokenizer tokens.
len(complete_sentence.split(" "))

162

In [12]:
def map_dataset(system_prompt, dataset, train=True):
    """Turn every record of `dataset` into a rendered prompt plus its label.

    Args:
        system_prompt: Instruction text forwarded to convert_into_prompt_template.
        dataset: Object exposing `.map(...)`; each record must provide the
            'body_cleaned' and 'User label' columns.
        train: Forwarded to convert_into_prompt_template to choose between the
            labelled (training) and open-ended (inference) prompt layout.

    Returns:
        Whatever `dataset.map` returns; each mapped record has the keys
        'text' (rendered prompt) and 'label_text' (original label).
    """
    # NOTE(review): the columns to drop are hard-coded to this file's schema;
    # a dataset with different columns would need this list updated.
    source_columns = ['id', 'body_cleaned', 'User label', 'author', 'subreddit', 'predicted_community', 'score', 'created_utc']

    def to_prompt_record(sentence, label_text):
        # Invoked once per example with the two input columns as arguments.
        return {
            'text': convert_into_prompt_template(system_prompt, sentence, train, label_text),
            'label_text': label_text,
        }

    return dataset.map(
        to_prompt_record,
        input_columns=['body_cleaned', 'User label'],
        batched=False,
        remove_columns=source_columns,
    )

In [13]:
# Build both prompt variants from the same rows: with the gold label appended
# (train=True) and with the answer slot left open (train=False).
# NOTE(review): both results come from the identical examples, so the split
# later named 'validation' is not held-out data — confirm this is intended.
new_dataset = map_dataset(system_prompt=sp_pro_anti_oth, dataset=dataset, train=True)
new_validation_dataset = map_dataset(system_prompt=sp_pro_anti_oth, dataset=dataset, train=False)

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

Map: 100%|██████████| 113/113 [00:00<00:00, 2027.69 examples/s]
Map: 100%|██████████| 113/113 [00:00<00:00, 2389.90 examples/s]


In [14]:
# Show a rendered training example (prompt followed by the gold label).
print(new_dataset[0]['text'])

<s>[INST] <<SYS>>
You are a sentence sentiment political tendency classification assistant about gun control. And here are definitions of labels: Support Gun: The sentences are considered as 'support gun' if they are explicitly in favor of individual gun ownership, or against gun bans and gun confiscation. Anti Gun: The sentences are considered as 'anti gun' if they are explicitly opposes gun ownership or are in favor of legal policies such as banning guns and confiscating personal guns. Other: The sentences are considered as 'other' if they are not 'support gun' or 'anti gun' or hard to tell or not relevent to gun control. Please classify the sentiment political tendency of the following sentence about gun support into one of the following categories: 'support gun', 'anti gun' or 'other'.
<</SYS>>

Sentence: as a federal leo , the very idea of confiscating guns is laughable . i swore an oath to the constitution , not to beto or any other politician .
User label: [/INST] support gun </

In [15]:
# Show the matching inference-style example (prompt only, no label appended).
print(new_validation_dataset[0]['text'])

<s>[INST] <<SYS>>
You are a sentence sentiment political tendency classification assistant about gun control. And here are definitions of labels: Support Gun: The sentences are considered as 'support gun' if they are explicitly in favor of individual gun ownership, or against gun bans and gun confiscation. Anti Gun: The sentences are considered as 'anti gun' if they are explicitly opposes gun ownership or are in favor of legal policies such as banning guns and confiscating personal guns. Other: The sentences are considered as 'other' if they are not 'support gun' or 'anti gun' or hard to tell or not relevent to gun control. Please classify the sentiment political tendency of the following sentence about gun support into one of the following categories: 'support gun', 'anti gun' or 'other'.
<</SYS>>

Sentence: as a federal leo , the very idea of confiscating guns is laughable . i swore an oath to the constitution , not to beto or any other politician .
User label: [/INST] 


In [16]:
from datasets import DatasetDict

# Bundle the labelled and open-ended prompt sets under named splits.
splits = {
    'train': new_dataset,
    'validation': new_validation_dataset,
}
dataset_to_upload = DatasetDict(splits)

In [17]:
# Display the split names, columns, and row counts before uploading.
dataset_to_upload

DatasetDict({
    train: Dataset({
        features: ['text', 'label_text'],
        num_rows: 113
    })
    validation: Dataset({
        features: ['text', 'label_text'],
        num_rows: 113
    })
})

In [18]:
# Destination dataset repository on the Hugging Face Hub.
HUB_REPO_ID = "OneFly7/llama2-politosphere-fine-tuning-supp-anti-oth"
dataset_to_upload.push_to_hub(HUB_REPO_ID)

Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 94.87ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.21s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  4.60it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 331.09ba/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:01<00:00,  1.14s/it]
Deleting unused files from dataset repository: 100%|██████████| 1/1 [00:00<00:00,  4.87it/s]
Downloading metadata: 100%|██████████| 622/622 [00:00<00:00, 845kB/s]
