In [1]:
print("Success!")

Success!


# Load Dataset and Model

In [29]:
from datasets import load_dataset

file_path = "../data/to_annotate_150_cleaned.json"
dataset = load_dataset("json", data_files=file_path, split="train")

Downloading and preparing dataset json/default to /home/9130/.cache/huggingface/datasets/json/default-c0867ada11e1561a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset json downloaded and prepared to /home/9130/.cache/huggingface/datasets/json/default-c0867ada11e1561a/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


In [4]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline

# Version 2-7b
# base_model_name = "meta-llama/Llama-2-7b-hf"
# Version 2-13b-chat
base_model_name = "meta-llama/Llama-2-13b-chat-hf"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

device_map = {"": 0}

base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=bnb_config,
    device_map=device_map,
    trust_remote_code=True,
    use_auth_token=True
)

base_model.config.use_cache = False

# More info: https://github.com/huggingface/transformers/pull/24906
base_model.config.pretraining_tp = 1 



Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

# Few-shot Inference

In [14]:
# System Prompt
system_prompt = "You are a helpful, respectful and honest assistant in sentence polarization analysis. \
And you are supposed to classify the sentiment polarity of the user's message about gun supoort into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure'."
# User Prompt
user_prompt = "Please classify the sentiment polarity of the following sentence about gun supoort into one of the following categories: \
'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':"
# Examples
examples = """
Here is an inference example:
Sentence: we must ban this horrible weapon of war
Sentiment Polarity: Anti Gun Polarized

Here is another inference example:
Sentence: an armed society is a polite society , keep it up friends.
Sentiment Polarity: Support Gun non-Polarized
"""

In [18]:
sentence = "not every act of violence needs to be about gun law , i see two idiots here ."

In [20]:
text = user_prompt + "\n" + examples + "\n" + "Sentence: " + sentence + "\n" + "Sentiment Polarity:"

inputs = tokenizer(text, return_tensors="pt").to("cuda")
outputs = base_model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"], max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)

print(tokenizer.decode(outputs[0], skip_special_tokens=True))

Please classify the sentiment polarity of the following sentence about gun supoort into one of the following categories: 'Support Gun Polarized', 'Support Gun non-Polarized', 'Neutral', 'Anti Gun Polarized', 'Anti Gun non-Polarized', 'Not relevant' or 'Not Sure':

Here is an inference example:
Sentence: we must ban this horrible weapon of war
Sentiment Polarity: Anti Gun Polarized

Here is another inference example:
Sentence: an armed society is a polite society , keep it up friends.
Sentiment Polarity: Support Gun non-Polarized

Sentence: not every act of violence needs to be about gun law , i see two idiots here .
Sentiment Polarity: Not relevant

Sentence: the second amendment is not absolute , it is subject to reasonable regulation .
Sentiment Polarity: Support Gun non-Polarized

Sentence: the right to bear arms is a fundamental human right , it is not subject to regulation .
Sentiment Polarized: Anti Gun Polarized

Sentence: the government should not be allowed to take away our ri

In [48]:
import re
from tqdm import tqdm

def annotate(dataset, user_prompt, examples):
    df_data = dataset.to_pandas()
    for i, val in enumerate(tqdm(dataset)):
        sentence = val['body_cleaned']

        # Make input
        text = user_prompt + "\n" + examples + "\n" + "Sentence: " + sentence + "\n" + "Sentiment Polarity:"
        inputs = tokenizer(text, return_tensors="pt").to("cuda")

        # Generate
        outputs = base_model.generate(input_ids=inputs["input_ids"].to("cuda"), attention_mask=inputs["attention_mask"], max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)
        # print(tokenizer.decode(outputs[0], skip_special_tokens=True))
        outputs_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

        matches = re.findall(r"Sentiment Polarity: (.+)", outputs_text)
        selected_sentiment = matches[2].strip() if len(matches) >= 3 else None
        print(selected_sentiment)

        # Update dataset
        df_data.loc[i, 'User label'] = selected_sentiment
        
    return df_data


In [49]:
new_dataset = annotate(dataset, user_prompt, examples)

  1%|          | 1/150 [00:12<30:37, 12.33s/it]

Support Gun Polarized


  1%|▏         | 2/150 [00:18<20:57,  8.49s/it]

Anti Gun Polarized


  2%|▏         | 3/150 [00:31<25:58, 10.60s/it]

Anti Gun Polarized


  3%|▎         | 4/150 [00:43<27:17, 11.22s/it]

Anti Gun non-Polarized


  3%|▎         | 5/150 [00:54<26:42, 11.05s/it]

Neutral


  4%|▍         | 6/150 [01:06<27:28, 11.45s/it]

Support Gun non-Polarized


  5%|▍         | 7/150 [01:19<28:18, 11.87s/it]

Anti Gun Polarized


  5%|▌         | 8/150 [01:32<29:03, 12.28s/it]

Support Gun Polarized


  6%|▌         | 9/150 [01:43<27:51, 11.86s/it]

Anti Gun Polarized


  7%|▋         | 10/150 [01:55<27:39, 11.85s/it]

Anti Gun non-Polarized


  7%|▋         | 11/150 [02:06<27:26, 11.85s/it]

Support Gun Polarized


  8%|▊         | 12/150 [02:19<27:43, 12.06s/it]

Anti Gun Polarized


  9%|▊         | 13/150 [02:33<28:56, 12.68s/it]

Support Gun Polarized


  9%|▉         | 14/150 [02:46<29:00, 12.80s/it]

Support Gun non-Polarized


 10%|█         | 15/150 [02:58<28:11, 12.53s/it]

Support Gun Polarized


 11%|█         | 16/150 [03:10<27:28, 12.30s/it]

Not relevant


 11%|█▏        | 17/150 [03:22<27:23, 12.36s/it]

Anti Gun Polarized


 12%|█▏        | 18/150 [03:34<26:56, 12.25s/it]

Not relevant


 13%|█▎        | 19/150 [03:47<26:58, 12.36s/it]

Support Gun non-Polarized


 13%|█▎        | 20/150 [03:59<26:51, 12.39s/it]

Support Gun Polarized


 14%|█▍        | 21/150 [04:05<22:09, 10.31s/it]

Support Gun Polarized


 15%|█▍        | 22/150 [04:17<23:05, 10.83s/it]

Anti Gun Polarized


 15%|█▌        | 23/150 [04:31<24:57, 11.79s/it]

Support Gun Polarized


 16%|█▌        | 24/150 [04:43<24:56, 11.88s/it]

Support Gun non-Polarized


 17%|█▋        | 25/150 [04:55<24:50, 11.93s/it]

Support Gun Polarized


 17%|█▋        | 26/150 [05:00<20:12,  9.78s/it]

Anti Gun Polarized


 18%|█▊        | 27/150 [05:12<21:45, 10.62s/it]

Not relevant


 19%|█▊        | 28/150 [05:25<23:02, 11.33s/it]

Anti Gun non-Polarized


 19%|█▉        | 29/150 [05:38<23:31, 11.66s/it]

Not relevant


 20%|██        | 30/150 [05:50<23:54, 11.96s/it]

Anti Gun non-Polarized


 21%|██        | 31/150 [06:03<23:52, 12.04s/it]

Not relevant


 21%|██▏       | 32/150 [06:14<23:33, 11.98s/it]

Anti Gun Polarized


 22%|██▏       | 33/150 [06:27<23:48, 12.21s/it]

Anti Gun Polarized


 23%|██▎       | 34/150 [06:40<23:50, 12.33s/it]

Not relevant


 23%|██▎       | 35/150 [06:54<24:39, 12.87s/it]

Not relevant


 24%|██▍       | 36/150 [07:07<24:31, 12.91s/it]

Not relevant


 25%|██▍       | 37/150 [07:19<23:38, 12.55s/it]

Anti Gun Polarized


 25%|██▌       | 38/150 [07:32<23:51, 12.78s/it]

Not relevant


 26%|██▌       | 39/150 [07:45<23:34, 12.74s/it]

Not relevant


 27%|██▋       | 40/150 [07:57<23:04, 12.58s/it]

Support Gun non-Polarized


 27%|██▋       | 41/150 [08:10<23:08, 12.73s/it]

Support Gun non-Polarized


 28%|██▊       | 42/150 [08:22<22:31, 12.52s/it]

Anti Gun Polarized


 29%|██▊       | 43/150 [08:35<22:42, 12.73s/it]

Not relevant


 29%|██▉       | 44/150 [08:48<22:21, 12.66s/it]

Support Gun Polarized


 30%|███       | 45/150 [09:00<22:10, 12.67s/it]

Anti Gun Polarized


 31%|███       | 46/150 [09:13<21:59, 12.69s/it]

Not relevant


 31%|███▏      | 47/150 [09:26<21:41, 12.64s/it]

Anti Gun Polarized


 32%|███▏      | 48/150 [09:38<21:11, 12.47s/it]

Not relevant


 33%|███▎      | 49/150 [09:50<20:46, 12.34s/it]

Anti Gun Polarized


 33%|███▎      | 50/150 [10:03<21:10, 12.70s/it]

Anti Gun Polarized


 34%|███▍      | 51/150 [10:15<20:28, 12.41s/it]

Support Gun Polarized


 35%|███▍      | 52/150 [10:27<20:08, 12.34s/it]

Anti Gun Polarized


 35%|███▌      | 53/150 [10:40<20:08, 12.46s/it]

Anti Gun Polarized


 36%|███▌      | 54/150 [10:52<19:47, 12.37s/it]

Anti Gun Polarized


 37%|███▋      | 55/150 [11:05<19:35, 12.38s/it]

Not relevant


 37%|███▋      | 56/150 [11:18<19:55, 12.72s/it]

Anti Gun non-Polarized


 38%|███▊      | 57/150 [11:31<19:43, 12.73s/it]

Not relevant


 39%|███▊      | 58/150 [11:43<19:15, 12.56s/it]

Support Gun Polarized


 39%|███▉      | 59/150 [11:55<18:54, 12.47s/it]

Anti Gun Polarized


 40%|████      | 60/150 [12:07<18:31, 12.35s/it]

Anti Gun non-Polarized


 41%|████      | 61/150 [12:17<17:08, 11.55s/it]

Support Gun Polarized


 41%|████▏     | 62/150 [12:29<17:07, 11.67s/it]

Support Gun Polarized


 42%|████▏     | 63/150 [12:41<17:09, 11.83s/it]

Support Gun Polarized


 43%|████▎     | 64/150 [12:46<13:52,  9.69s/it]

Not relevant


 43%|████▎     | 65/150 [12:58<14:54, 10.52s/it]

Support Gun non-Polarized


 44%|████▍     | 66/150 [13:11<15:31, 11.09s/it]

Not relevant


 45%|████▍     | 67/150 [13:23<15:53, 11.49s/it]

Support Gun Polarized


 45%|████▌     | 68/150 [13:32<14:40, 10.74s/it]

Not relevant


 46%|████▌     | 69/150 [13:44<15:09, 11.23s/it]

Support Gun Polarized


 47%|████▋     | 70/150 [13:57<15:24, 11.56s/it]

Anti Gun Polarized


 47%|████▋     | 71/150 [14:10<15:49, 12.02s/it]

Anti Gun Polarized


 48%|████▊     | 72/150 [14:22<15:37, 12.03s/it]

Support Gun non-Polarized


 49%|████▊     | 73/150 [14:34<15:35, 12.15s/it]

Anti Gun Polarized


 49%|████▉     | 74/150 [14:47<15:28, 12.22s/it]

Support Gun Polarized


 50%|█████     | 75/150 [15:00<15:30, 12.41s/it]

Support Gun non-Polarized


 51%|█████     | 76/150 [15:05<12:44, 10.33s/it]

Anti Gun Polarized


 51%|█████▏    | 77/150 [15:09<10:18,  8.48s/it]

Not Sure


 52%|█████▏    | 78/150 [15:22<11:44,  9.79s/it]

Support Gun Polarized


 53%|█████▎    | 79/150 [15:27<09:50,  8.31s/it]

Support Gun Polarized


 53%|█████▎    | 80/150 [15:39<11:01,  9.44s/it]

Anti Gun Polarized


 54%|█████▍    | 81/150 [15:50<11:29,  9.99s/it]

Not relevant


 55%|█████▍    | 82/150 [16:03<12:14, 10.79s/it]

Not relevant


 55%|█████▌    | 83/150 [16:16<12:52, 11.53s/it]

Support Gun Polarized


 56%|█████▌    | 84/150 [16:28<12:50, 11.67s/it]

Not relevant


 57%|█████▋    | 85/150 [16:41<13:02, 12.03s/it]

Anti Gun Polarized


 57%|█████▋    | 86/150 [16:55<13:34, 12.73s/it]

Anti Gun Polarized


 58%|█████▊    | 87/150 [17:08<13:10, 12.55s/it]

Not relevant


 59%|█████▊    | 88/150 [17:20<12:59, 12.57s/it]

Anti Gun Polarized


 59%|█████▉    | 89/150 [17:33<12:47, 12.59s/it]

Anti Gun non-Polarized


 60%|██████    | 90/150 [17:46<12:39, 12.65s/it]

Anti Gun Polarized


 61%|██████    | 91/150 [17:58<12:26, 12.65s/it]

Anti Gun Polarized


 61%|██████▏   | 92/150 [18:11<12:06, 12.53s/it]

Anti Gun non-Polarized


 62%|██████▏   | 93/150 [18:24<12:01, 12.66s/it]

Support Gun Polarized


 63%|██████▎   | 94/150 [18:36<11:48, 12.66s/it]

Support Gun Polarized


 63%|██████▎   | 95/150 [18:42<09:45, 10.64s/it]

Support Gun non-Polarized


 64%|██████▍   | 96/150 [18:55<10:06, 11.23s/it]

Support Gun non-Polarized


 65%|██████▍   | 97/150 [19:08<10:33, 11.96s/it]

Support Gun Polarized


 65%|██████▌   | 98/150 [19:20<10:24, 12.01s/it]

Not relevant


 66%|██████▌   | 99/150 [19:32<10:11, 11.99s/it]

Anti Gun Polarized


 67%|██████▋   | 100/150 [19:45<10:05, 12.12s/it]

Support Gun Polarized


 67%|██████▋   | 101/150 [19:58<10:08, 12.42s/it]

Anti Gun Polarized


 68%|██████▊   | 102/150 [20:13<10:27, 13.07s/it]

Anti Gun Polarized


 69%|██████▊   | 103/150 [20:25<10:09, 12.97s/it]

Support Gun Polarized


 69%|██████▉   | 104/150 [20:39<10:01, 13.08s/it]

Anti Gun Polarized


 70%|███████   | 105/150 [20:50<09:27, 12.61s/it]

Anti Gun Polarized


 71%|███████   | 106/150 [21:03<09:14, 12.60s/it]

Anti Gun Polarized


 71%|███████▏  | 107/150 [21:15<09:03, 12.65s/it]

Support Gun Polarized


 72%|███████▏  | 108/150 [21:27<08:39, 12.36s/it]

Support Gun Polarized


 73%|███████▎  | 109/150 [21:40<08:26, 12.35s/it]

Anti Gun Polarized


 73%|███████▎  | 110/150 [21:52<08:20, 12.52s/it]

Anti Gun Polarized


 74%|███████▍  | 111/150 [22:07<08:31, 13.10s/it]

Anti Gun Polarized


 75%|███████▍  | 112/150 [22:19<08:09, 12.87s/it]

Anti Gun Polarized


 75%|███████▌  | 113/150 [22:33<08:01, 13.02s/it]

Anti Gun Polarized


 76%|███████▌  | 114/150 [22:45<07:42, 12.84s/it]

Anti Gun Polarized


 77%|███████▋  | 115/150 [22:57<07:20, 12.59s/it]

Not relevant


 77%|███████▋  | 116/150 [23:10<07:14, 12.78s/it]

Support Gun Polarized


 78%|███████▊  | 117/150 [23:23<07:00, 12.73s/it]

Anti Gun Polarized


 79%|███████▊  | 118/150 [23:36<06:56, 13.00s/it]

Support Gun non-Polarized


 79%|███████▉  | 119/150 [23:50<06:45, 13.09s/it]

Anti Gun Polarized


 80%|████████  | 120/150 [24:02<06:25, 12.84s/it]

Support Gun Polarized


 81%|████████  | 121/150 [24:15<06:13, 12.87s/it]

Support Gun non-Polarized


 81%|████████▏ | 122/150 [24:29<06:06, 13.08s/it]

Anti Gun Polarized


 82%|████████▏ | 123/150 [24:41<05:50, 12.98s/it]

Anti Gun Polarized


 83%|████████▎ | 124/150 [24:54<05:34, 12.87s/it]

Support Gun Polarized


 83%|████████▎ | 125/150 [25:08<05:32, 13.29s/it]

Anti Gun non-Polarized


 84%|████████▍ | 126/150 [25:20<05:06, 12.77s/it]

Anti Gun Polarized


 85%|████████▍ | 127/150 [25:32<04:53, 12.76s/it]

Anti Gun Polarized


 85%|████████▌ | 128/150 [25:45<04:39, 12.73s/it]

Support Gun Polarized


 86%|████████▌ | 129/150 [25:57<04:24, 12.62s/it]

Anti Gun Polarized


 87%|████████▋ | 130/150 [26:10<04:11, 12.58s/it]

Anti Gun Polarized


 87%|████████▋ | 131/150 [26:22<03:55, 12.37s/it]

Anti Gun Polarized


 88%|████████▊ | 132/150 [26:27<03:06, 10.36s/it]

Support Gun non-Polarized


 89%|████████▊ | 133/150 [26:39<03:02, 10.71s/it]

Anti Gun Polarized


 89%|████████▉ | 134/150 [26:51<02:58, 11.15s/it]

Anti Gun Polarized


 90%|█████████ | 135/150 [27:02<02:47, 11.17s/it]

Anti Gun Polarized


 91%|█████████ | 136/150 [27:13<02:35, 11.13s/it]

Anti Gun Polarized


 91%|█████████▏| 137/150 [27:14<01:43,  7.97s/it]

Anti Gun Polarized


 92%|█████████▏| 138/150 [27:20<01:28,  7.36s/it]

Not relevant


 93%|█████████▎| 139/150 [27:25<01:14,  6.80s/it]

Anti Gun Polarized


 93%|█████████▎| 140/150 [27:36<01:19,  7.98s/it]

Anti Gun Polarized


 94%|█████████▍| 141/150 [27:41<01:04,  7.15s/it]

Not relevant


 95%|█████████▍| 142/150 [27:52<01:05,  8.23s/it]

Support Gun non-Polarized


 95%|█████████▌| 143/150 [28:03<01:03,  9.02s/it]

Anti Gun Polarized


 96%|█████████▌| 144/150 [28:15<00:59,  9.85s/it]

Not Sure


 97%|█████████▋| 145/150 [28:24<00:48,  9.71s/it]

Anti Gun Polarized


 97%|█████████▋| 146/150 [28:31<00:35,  8.80s/it]

Support Gun Polarized


 98%|█████████▊| 147/150 [28:41<00:27,  9.24s/it]

Not relevant


 99%|█████████▊| 148/150 [28:51<00:19,  9.54s/it]

Anti Gun Polarized


 99%|█████████▉| 149/150 [29:02<00:09,  9.85s/it]

Anti Gun Polarized


100%|██████████| 150/150 [29:12<00:00, 11.69s/it]

Anti Gun Polarized





In [50]:
new_dataset

Unnamed: 0,id,body_cleaned,User label,author,subreddit,predicted_community,score,created_utc
0,f4acjs9,"as a federal leo , the very idea of confiscati...",Support Gun Polarized,MDeXY,progun,0,454,1571492410
1,f56wn75,we must ban this horrible weapon of war,Anti Gun Polarized,MAp6s,progun,0,437,1572028587
2,f9nx9zd,`` you 're pretty stupid to not just have arme...,Anti Gun Polarized,a9GR9,progun,0,343,1575492221
3,dwx41ns,"meanwhile in england , an old man is being cha...",Anti Gun non-Polarized,qhsBh,progun,0,339,1523030793
4,ekz86fj,thanks but no thanks .,Neutral,L7DdD,progun,0,311,1555369507
...,...,...,...,...,...,...,...,...
145,ew7u70o,`` i respect the flag . that 's why i always f...,Support Gun Polarized,I35V7,GunsAreCool,3,48,1565197239
146,cxku7v9,mental health guys we need to do something abo...,Not relevant,n5s7P,GunsAreCool,3,47,1449090536
147,dv8iym6,what ? a nazi waving a gun around ? prepostero...,Anti Gun Polarized,mYqmA,GunsAreCool,3,47,1520289868
148,cyptaqf,i like how they keep switching from mass murde...,Anti Gun Polarized,K5aJM,GunsAreCool,3,46,1452202132


In [51]:
new_dataset.to_json('../data/polito-llama2-13b-chat-few-inference.json', orient='records', lines=True)