In [1]:
print("Success!")

Success!


# Load Dataset and Model

In [2]:
from datasets import load_dataset

file_path = "../data/to_annotate_150_cleaned.json"
dataset = load_dataset("json", data_files=file_path, split="train")

Found cached dataset json (/home/9130/.cache/huggingface/datasets/json/default-c0867ada11e1561a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Version 2-7b
# base_model_name = "meta-llama/Llama-2-7b-hf"
# Version 2-13b-chat
base_model_name = "meta-llama/Llama-2-13b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

device_map = {"": 0}

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=True
)

base_model.config.use_cache = False

# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1 



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
tokenizer

LlamaTokenizerFast(name_or_path='meta-llama/Llama-2-13b-chat-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False), 'pad_token': '</s>'}, clean_up_tokenization_spaces=False)

In [5]:
dataset[0]

{'id': 'f4acjs9',
 'body_cleaned': 'as a federal leo , the very idea of confiscating guns is laughable . i swore an oath to the constitution , not to beto or any other politician .',
 'User label': '',
 'author': 'MDeXY',
 'subreddit': 'progun',
 'predicted_community': 0,
 'score': 454,
 'created_utc': 1571492410}

# Define Custom Dataset and Prompt Template

In [46]:
from langchain.prompts.few_shot import FewShotPromptTemplate
from langchain.prompts.prompt import PromptTemplate

# Define examples
examples = [
    {
        "instruction": "<<SYS>>\nYou are a sentence sentiment polarity classification assistant about gun control. And here are definitions of labels: \
Support Gun: Explicitly opposes gun ownership or is in favor of legal policies such as banning guns and confiscating personal guns. \
Anti Gun: Explicitly in favor of individual gun ownership, or against gun bans and gun confiscation. \
Neutral: The statement is centered around the debate on gun control, but there is no clear opinion expressed. \
Not Relevant: Don't have any obvious relationship to guns. \
Not Sure: The sentence statements are describing gun support for owning / banning guns, but due to a lack of relevant context, or some other reason, we can sense the emotional inclination, but not the specific opinion or polarized aspect. \
And the sentences are considered as polarized if they are or about antagonizing statements / hostility / belittling / animosity: 'us vs them',  inter-group antagonism, radicalization, conflictive confrontation, and so on. \
the sentences are considered as non-polarized if they are or aboutc onstructive civic conversation, bring together to a common ground, peaceful dialogue, and so on. \
\n<<SYS>>\n\nPlease classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':",
        "sentence": "as a federal leo , the very idea of confiscating guns is laughable . i swore an oath to the constitution , not to beto or any other politician .",
        "label": "Support Gun Polarized",
    },
    {
        "instruction": "Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':",
        "sentence": "we must ban this horrible weapon of war",
        "label": "Anti Gun Polarized",
    },
    {
        "instruction": "Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':",
        "sentence": "this is also why i love hickok45 . he 's just an all-around good guy . one of the best representatives out there for the pro-2a community .",
        "label": "Support Gun Non-Polarized",
    },
    {
        "instruction": "Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':",
        "sentence": "repeal the 2nd amendment make real national laws , strictly enforced .",
        "label": "Anti Gun Non-Polarized",
    },
    {
        "instruction": "Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':",
        "sentence": "when seconds matter , the police are only minutes away ...",
        "label": "Not Sure",
    },
    {
        "instruction": "Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':",
        "sentence": "i 'm convinced those two issues inspire most of the 5 % of the population that votes libertarian .",
        "label": "Neutral",
    },
    {
        "instruction": "Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':",
        "sentence": "for good reason . god she sucked so much",
        "label": "Not Relevant",
    },
]

In [47]:
# Define example prompt and few-shot prompt
example_prompt = PromptTemplate(
    input_variables=["instruction", "sentence", "label"], 
    template="<s><INST> {instruction}\nSentence: {sentence}\nSentiment Polarity: </INST> {label} </s>",
)

prompt = FewShotPromptTemplate(
    examples=examples, 
    example_prompt=example_prompt, 
    suffix="<s><INST> Please classify the sentiment polarity of the following sentence about gun support into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':\nSentence: {sentence}\nSentiment Polarity: </INST>",
    # suffix="<s><INST> Classify the sentiment of the following text only into these two categories :'positive' or 'negative':\nSentence: {sentence}\nSentiment: </INST>", 
    input_variables=["sentence"]
)

In [48]:
sent2test = "I am so happy!"
prompt2test = prompt.format(sentence=sent2test)
print(prompt2test)

<s><INST> <<SYS>>
You are a sentence sentiment polarity classification assistant about gun control. And here are definitions of labels: Support Gun: Explicitly opposes gun ownership or is in favor of legal policies such as banning guns and confiscating personal guns. Anti Gun: Explicitly in favor of individual gun ownership, or against gun bans and gun confiscation. Neutral: The statement is centered around the debate on gun control, but there is no clear opinion expressed. Not Relevant: Don't have any obvious relationship to guns. Not Sure: The sentence statements are describing gun support for owning / banning guns, but due to a lack of relevant context, or some other reason, we can sense the emotional inclination, but not the specific opinion or polarized aspect. And the sentences are considered as polarized if they are or about antagonizing statements / hostility / belittling / animosity: 'us vs them',  inter-group antagonism, radicalization, conflictive confrontation, and so on. t

In [49]:
import torch
from torch.utils.data import Dataset, DataLoader

class ChatDataset(Dataset):
    def __init__(self, dataset, tokenizer, prompt):
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.prompt = prompt

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        val = self.dataset[idx]
        sentence = val['body_cleaned']
        text = self.prompt.format(sentence=sentence)
        inputs = self.tokenizer(text, return_tensors="pt").to("cuda")
        
        return {
            'input_ids': inputs['input_ids'].squeeze(),
            'attention_mask': inputs['attention_mask'].squeeze(),
        }


In [50]:
def collate_fn(batch):
    input_ids = [item['input_ids'].tolist() for item in batch]
    attention_mask = [item['attention_mask'].tolist() for item in batch]

    # Left Padding
    max_length = max([len(item) for item in input_ids])
    input_ids = [[0]*(max_length - len(item)) + item for item in input_ids]
    attention_mask = [[0]*(max_length - len(item)) + item for item in attention_mask]

    # Convert lists to tensors
    input_ids = torch.tensor(input_ids)
    attention_mask = torch.tensor(attention_mask)

    return {
        'input_ids': input_ids,
        'attention_mask': attention_mask,
    }

# Few-shot Inference

In [51]:
import re
import copy
from tqdm import tqdm


def inference(dataset, prompt, batch_size=8):
    to_annotate_dataset = copy.deepcopy(dataset)
    to_annotate_dataset = to_annotate_dataset.remove_columns(["User label"])

    chatDataset = ChatDataset(dataset, tokenizer, prompt)
    data_loader = DataLoader(chatDataset, batch_size=batch_size, shuffle=False, collate_fn=collate_fn)

    invalid_label = []
    all_labels = []

    for i, batch in enumerate(tqdm(data_loader)):
        # Move batch to GPU
        input_ids = batch["input_ids"].to("cuda")
        attention_mask = batch["attention_mask"].to("cuda")

        # Generate for the entire batch
        outputs = base_model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=512,
            pad_token_id=tokenizer.eos_token_id
        )

        # Decode the generated text and labels
        outputs_text = tokenizer.batch_decode(outputs, skip_special_tokens=True)
        
        # Evaluate the generated text
        for idx in range(len(outputs_text)):
            # Extract the last sentence
            selected_sentiment = outputs_text[idx].split("\n")[-1].lower()
            # Remove the prompt
            selected_sentiment = selected_sentiment.split("</inst> ")[-1]

            if selected_sentiment not in ['support gun polarized', 'support gun non-polarized', 'neutral', 'anti gun non-polarized', 'anti gun polarized', 'not sure', 'not relevant']:
                invalid_label.append(selected_sentiment)
            
            all_labels.append(selected_sentiment)
            # to_annotate_dataset[i * batch_size + idx]["User label"] = selected_sentiment
    
    # Add labels to the dataset
    to_annotate_dataset = to_annotate_dataset.add_column("User label", all_labels)

    return to_annotate_dataset, invalid_label


In [52]:
annotated_dataset, invalid_label = inference(dataset, prompt, 8)

100%|██████████| 19/19 [01:51<00:00,  5.89s/it]


In [53]:
annotated_dataset

Dataset({
    features: ['id', 'body_cleaned', 'author', 'subreddit', 'predicted_community', 'score', 'created_utc', 'User label'],
    num_rows: 150
})

In [58]:
annotated_dataset[3]

{'id': 'dwx41ns',
 'body_cleaned': 'meanwhile in england , an old man is being charged with murder after he killed one of the men breaking into his home',
 'author': 'qhsBh',
 'subreddit': 'progun',
 'predicted_community': 0,
 'score': 339,
 'created_utc': 1523030793,
 'User label': 'support gun polarized'}

## Save annotated dataset

In [59]:
annotated_dataset.to_json('../data/annotated_data_llama2_v2_context.json', orient='records', lines=True)

Creating json from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

44907