In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/gemma-2/transformers/gemma-2-2b/1/model.safetensors.index.json
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/model-00003-of-00003.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/config.json
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/README.md
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/tokenizer.json
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/model-00001-of-00003.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/tokenizer_config.json
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/gitattributes
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/model-00002-of-00003.safetensors
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/special_tokens_map.json
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/tokenizer.model
/kaggle/input/gemma-2/transformers/gemma-2-2b/1/generation_config.json
/kaggle/input/pewdata/4.json
/kaggle/input/pewdata/18.json
/kaggle/input/pewdata/2.json
/kaggle/input/pewdata/19.json
/kaggle/input/pewdata/3.json
/kaggle/input

In [3]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# The weights and the tokenizer are both read from the same local Kaggle
# directory, so the path is spelled out once.
GEMMA_DIR = '/kaggle/input/gemma-2/transformers/gemma-2-2b/1/'

model = AutoModelForCausalLM.from_pretrained(GEMMA_DIR)
tokenizer = AutoTokenizer.from_pretrained(GEMMA_DIR)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
def generate_prompts(q_dict, context=None, context_type=None):
    """Build the multiple-choice prompt(s) for one survey question.

    Args:
        q_dict: Question description. Reads ``q_dict['main_question']``,
            ``q_dict['sub_questions']`` (items with ``'id'`` and ``'text'``;
            may be empty/falsy) and ``q_dict['answers']`` (items with ``'id'``
            and ``'text'``).
        context: Optional political affiliation (e.g. "Democrat",
            "Republican", "Independent"). Ignored when ``None``.
        context_type: How the affiliation is injected ahead of the question:
            ``'QA'`` (a previously answered question), ``'BIO'`` (a
            self-description) or ``'PORTRAY'`` (an instruction). Any other
            value leaves the prompt without a preamble.

    Returns:
        ``(prompts, expected_answers)``: one prompt per sub-question (or a
        single prompt when there are no sub-questions), each ending in
        ``"Answer: "``, and the answer ids converted to strings.
    """
    preamble = ''
    if context is not None:
        if context_type == 'QA':
            # Unknown affiliations fall back to option 4, "Something else".
            party_codes = {'republican': '1', 'democrat': '2', 'independent': '3'}
            chosen = party_codes.get(context.strip().lower(), '4')
            preamble = (
                "\n"
                "Question: In politics today, do you consider yourself a\n"
                "1. Republican\n"
                "2. Democrat\n"
                "3. Independent\n"
                "4. Something else\n"
                "5. Refused\n"
                f"Answer: {chosen}\n"
            )
        elif context_type == 'BIO':
            preamble = (
                "\n"
                "Below you will be asked to provide a short description of your political affiliation and then answer some questions.\n"
                f"Description: In politics today, I consider myself a {context.strip().title()}.\n"
            )
        elif context_type == 'PORTRAY':
            preamble = (
                "\n"
                f"Answer the following question as if in politics today, you considered yourself a {context.strip().title()}.\n"
            )

    question_text = q_dict['main_question']
    follow_ups = q_dict['sub_questions']
    answer_options = q_dict['answers']
    expected_answers = [str(option["id"]) for option in answer_options]

    def render(follow_up=None):
        # Preamble and question first, then the optional sub-question line,
        # then one "id: text" line per answer, then the answer cue.
        pieces = [f"\n{preamble}\n\nQuestion: {question_text}\n"]
        if follow_up is not None:
            pieces.append(f"{follow_up['id']}: {follow_up['text']}\n")
        pieces.extend(f"{option['id']}: {option['text']}\n" for option in answer_options)
        pieces.append("Answer: ")
        return ''.join(pieces)

    if not follow_ups:
        return [render()], expected_answers

    return [render(follow_up) for follow_up in follow_ups], expected_answers

In [5]:
def get_distribution(prompt, expected_answers, top_k=100) -> dict:
    """Return next-token log-probabilities for the expected answer ids.

    The prompt is encoded with the notebook-level ``tokenizer`` and run
    through the notebook-level ``model``; the logits at the last position are
    turned into log-probabilities. Only the ``top_k`` most likely tokens are
    inspected, and an expected answer is reported only when it is exactly
    equal to one of those token strings.

    Args:
        prompt: Prompt text whose next token is scored.
        expected_answers: Iterable of answer-id strings to look up.
        top_k: Number of most likely tokens to inspect (default 100, the
            value previously hard-coded). Clamped to the vocabulary size so
            a small vocabulary does not make ``torch.topk`` fail.

    Returns:
        Dict mapping answer id -> log-probability (a float <= 0). Answers
        that do not appear among the inspected tokens are omitted, so the
        result may have fewer keys than ``expected_answers`` or be empty.
        NOTE(review): multi-character ids such as '98' can only match if the
        tokenizer has them as a single token -- confirm against the vocab.
    """
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    if torch.cuda.is_available():
        # Kept per call, as in the original cell, so the function still works
        # when the model was loaded on the CPU.
        model.to('cuda')
        input_ids = input_ids.to('cuda')

    with torch.no_grad():
        # Logits for the token that would follow the prompt.
        logits = model(input_ids).logits[:, -1, :]

    # log_softmax avoids the underflow of log(softmax(x)), which yields -inf
    # for tokens whose probability rounds to zero.
    log_probs = torch.log_softmax(logits, dim=-1)

    k = min(top_k, log_probs.shape[-1])
    top_log_probs, top_indices = torch.topk(log_probs, k)

    # Map each candidate token string to its log-probability (batch item 0).
    top_tokens = tokenizer.convert_ids_to_tokens(top_indices[0].tolist())
    candidates = dict(zip(top_tokens, top_log_probs[0].tolist()))

    return {ans: candidates[ans] for ans in expected_answers if ans in candidates}
    

# Q1

In [6]:
import json

# Load question 1 (JSON) from the pewdata input directory; q1 is passed to generate_prompts below.
with open('/kaggle/input/pewdata/1.json') as f:
    q1 = json.load(f)

In [7]:
# Build the prompt(s) for Q1 with no political-affiliation context.
prompts, expected_answers = generate_prompts(q1)

In [8]:
# Score every Q1 prompt and print the log-probabilities of the expected answer ids.
for prompt in prompts:
    dist = get_distribution(prompt, expected_answers)
    print(dist)

{'1': -0.7716595530509949, '2': -2.4069342613220215, '3': -2.9473226070404053, '4': -3.073322057723999, '5': -3.4372403621673584, '6': -3.3486154079437256, '7': -3.4659056663513184}


In [9]:
# Display the raw prompt list built for Q1.
prompts

['\n\n\nQuestion: How often, if at all, do you follow the news?\n1: Every day\n2: More than once a week\n3: Once a week\n4: Once or twice a month\n5: A few times a year\n6: Seldom\n7: Never\n98: Don’t know\n99: Refused\nAnswer: ']

# Q2

In [10]:
import json

# Load question 2 (JSON) from the pewdata input directory; q2 is passed to generate_prompts below.
with open('/kaggle/input/pewdata/2.json') as f:
    q2 = json.load(f)

In [11]:
# Build the prompt(s) for Q2 with no political-affiliation context (overwrites the Q1 values).
prompts, expected_answers = generate_prompts(q2)

In [12]:
# Print each Q2 prompt followed by the log-probabilities of its expected answer ids.
for prompt in prompts:
    print(prompt)
    dist = get_distribution(prompt, expected_answers)
    print(dist)
    print('======')




Question: How have you been getting the news?
1: Television
2: Newspapers or magazines
3: Radio
4: Internet
5: Social media (Facebook, Twitter, WhatsApp)
6: Word of mouth, family, friends
7: Other
98: Don’t know
99: Refused
Answer: 
{'1': -0.9498236179351807, '2': -2.2938311100006104, '3': -2.7825376987457275, '4': -2.802652597427368, '5': -3.1623213291168213, '6': -3.0022356510162354, '7': -3.2199079990386963}


# Q3

In [13]:
import json

# Load question 3 (JSON) from the pewdata input directory; q3 is passed to generate_prompts below.
with open('/kaggle/input/pewdata/3.json') as f:
    q3 = json.load(f)

In [14]:
# Build the prompts for Q3 (one per sub-question) with no political-affiliation context.
prompts, expected_answers = generate_prompts(q3)

In [15]:
# Print each Q3 sub-question prompt followed by the log-probabilities of its expected answer ids.
for prompt in prompts:
    print(prompt)
    dist = get_distribution(prompt, expected_answers)
    print(dist)
    print('======')




Question: Now I am going to read you a list of things that may be problems in India. As I read each one, please tell me if you think it is a ...
a: Unemployment
1: Very big problem
2: Moderately big problem
3: Small problem
4: Not a problem at all
5: Don’t know
98: Don’t know
99: Refused
Answer: 
{'1': -1.1406806707382202, '2': -2.5130715370178223, '3': -3.0320188999176025, '4': -3.201448678970337, '5': -3.168264627456665}



Question: Now I am going to read you a list of things that may be problems in India. As I read each one, please tell me if you think it is a ...
b: Crime
1: Very big problem
2: Moderately big problem
3: Small problem
4: Not a problem at all
5: Don’t know
98: Don’t know
99: Refused
Answer: 
{'1': -1.00242280960083, '2': -2.923882007598877, '3': -3.547506809234619, '4': -3.728100299835205, '5': -3.7165932655334473}



Question: Now I am going to read you a list of things that may be problems in India. As I read each one, please tell me if you think it is a ...
c:

In [16]:
# Write one report per survey question (files 1..19): for every prompt, the
# prompt text, the answer log-probabilities and a '===' separator line.
for i in range(1, 20):
    # Read the question before creating the report so that a missing or
    # malformed input file does not leave an empty output file behind.
    with open(f'/kaggle/input/pewdata/{i}.json', encoding='utf-8') as f:
        q = json.load(f)
    prompts, expected_answers = generate_prompts(q)

    # The context manager closes (and flushes) the report even if inference
    # raises part-way through; explicit UTF-8 because the prompts contain
    # non-ASCII characters such as "’".
    with open(f'/kaggle/working/output_{i}.txt', 'w', encoding='utf-8') as foo:
        for prompt in prompts:
            print(prompt, file=foo)
            dist = get_distribution(prompt, expected_answers)
            print(dist, file=foo)
            print('===', file=foo)