# PREREQUISITES:

In [3]:
import os
import pandas as pd
from transformers import AutoModelForCausalLM
from sklearn.model_selection import train_test_split
import torch
from sklearn.model_selection import train_test_split
from transformers import GemmaTokenizerFast
from transformers import pipeline
from sklearn.metrics import accuracy_score
import torch
from huggingface_hub import login
import json
import numpy as np

In [None]:
login()

In [4]:
current_directory = os.getcwd() # get current directory
file_path = os.path.join(current_directory, '..', '..', 'data', 'processed', 'clean_data.csv') # navigate to folder with preprocessed data
data = pd.read_csv(file_path)

# Filter out data points with label "Center" 

In [5]:
filtered_data = data[data['bias_rating'] != 1]
filtered_data = filtered_data.reset_index(drop=True)
print(filtered_data.iloc[0])
print(data.head(5))

heading_text    chicago gun violence spikes and increasingly f...
bias_rating                                                     0
Name: 0, dtype: object
                                        heading_text  bias_rating
0  chicago gun violence spikes and increasingly f...            0
1  'bullets just came from nowhere': fourth of ju...            1
2  dozens of shootings across us mark bloody july...            2
3  federal government will run out of cash on oct...            2
4  yellen tells congress that u.s. will run out o...            0


# Prompt

In [49]:
prompt_1 = """This is a political bias classifier for news headlines.
Please provide the response in the following JSON format:
{
    "Clues": "<List the CLUES here (i.e., keywords, tones, references, contextual information)>",
    "Reasoning": "<Provide the diagnostic REASONING process here in 130 words based on the clues and input>",
    "Political Bias": "<Determine the overall POLITICAL BIAS as Left or Right. You MUST write only one word in this section either right or left.>"
}
INPUT: """

In [50]:
# For Prompt where No center included
df_prompt_1 = filtered_data.copy()
df_prompt_1['heading_text'] = df_prompt_1['heading_text'].apply(lambda x: prompt_1 + x)

print(df_prompt_1.iloc[0]['heading_text'])


This is a political bias classifier for news headlines.
Please provide the response in the following JSON format:
{
    "Clues": "<List the CLUES here (i.e., keywords, tones, references, contextual information)>",
    "Reasoning": "<Provide the diagnostic REASONING process here in 130 words based on the clues and input>",
    "Political Bias": "<Determine the overall POLITICAL BIAS as Left or Right. You MUST write only one word in this section either right or left.>"
}
INPUT: chicago gun violence spikes and increasingly finds the youngest victims as yasmin miller drove home from a laundromat in chicago's englewood neighborhood last weekend, a gunman in another car peppered her red hyundai sedan with bullets, grazing her head and striking her son, sincere gaston, in the chest. sincere died in his car seat. he was 20 months old.
on june 20, a man fired gunshots through the back of a dark blue suv, wounding the 27-year-old man driving and hitting his stepson, mekhi james, in the back, kil

# Data splitting

In [51]:
X = df_prompt_1['heading_text'].values
y = df_prompt_1['bias_rating'].values

In [52]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(y_test)
print(X_test)

[2 2 0 ... 0 0 0]
['This is a political bias classifier for news headlines.\nPlease provide the response in the following JSON format:\n{\n    "Clues": "<List the CLUES here (i.e., keywords, tones, references, contextual information)>",\n    "Reasoning": "<Provide the diagnostic REASONING process here in 130 words based on the clues and input>",\n    "Political Bias": "<Determine the overall POLITICAL BIAS as Left or Right. You MUST write only one word in this section either right or left.>"\n}\nINPUT: japan pm vows to save isis hostages threatened with beheading in new video japan\'s prime minister vowed tuesday to save the lives of two japanese hostages, one a freelance journalist and the other a soldier for hire, threatened with beheading in an online video purportedly released by the islamic state terror group.'
 'This is a political bias classifier for news headlines.\nPlease provide the response in the following JSON format:\n{\n    "Clues": "<List the CLUES here (i.e., keywords,

# Load Model and Tokenizer

In [None]:
model = AutoModelForCausalLM.from_pretrained("google/gemma-2b-it")
tokenizer = GemmaTokenizerFast.from_pretrained('google/gemma-2b-it')

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model.to(device)

# Evaluate model

In [None]:
pipe = pipeline(
    "text-generation",
    tokenizer=tokenizer,
    model=model,
    device="cuda",
)
results = []
counter = 0

for text in X_test:
    counter += 1
    print(f"{counter}/{len(X_test)}")
    outputs = pipe(text, max_new_tokens=256)
    response = outputs[0]["generated_text"]
    
    if "**Clues:**" in response and "**Reasoning:**" in response and "**Political Bias:**" in response:
        clues = response.split("**Clues:**")[1].split("**Reasoning:**")[0].strip()
        reasoning = response.split("**Reasoning:**")[1].split("**Political Bias:**")[0].strip()
        bias = response.split("**Political Bias:**")[1].strip()
    else:
        clues = "NaN"
        reasoning = "NaN"
        bias = "NaN"

    result = {
        "Clues": clues,
        "Reasoning": reasoning,
        "Political Bias": bias
    }
    results.append(result)

with open('prompt_no_center.json', 'w') as f:
    json.dump(results, f, indent=4)

# Evaluate accuracy

In [None]:
prompt_file = 'prompt_no_center.json'

file_path = os.path.join(current_directory, '..', '..', 'results', prompt_file)

with open(file_path, 'r') as f:
    data = json.load(f)
    
bias_mapping = {'Left': 0, 'Right': 2}

In [None]:
# In this section we do data prepocessing of all models outputs
# Sometimes model did not give results for query, thus it should be marked as NaN
json_labels = []
for item in data:
    bias = item.get("Political Bias")
    
    if bias in bias_mapping:
        json_labels.append(bias_mapping[bias])
    elif isinstance(bias, str):
        bias_lower = bias.lower()
        if "left" in bias_lower:
            json_labels.append(bias_mapping['Left'])
        elif "right" in bias_lower:
            json_labels.append(bias_mapping['Right'])
        else:
            json_labels.append(-1)
    else:
        json_labels.append(-1)

json_array = np.array(json_labels)

In [42]:
mask = json_array != -1
json_array_filtered = json_array[mask] # remove NaN (-1) values from the dataset
y_test_filtered = y_test[mask] # remove values corresponding to NaN (-1) values from the set produced by model

print(len(json_array_filtered))
print(len(y_test_filtered))

accuracy = accuracy_score(y_test_filtered, json_array_filtered)
print(f'Accuracy: {accuracy * 100:.2f}%')

3089
3089
Accuracy: 45.19%
