In [1]:
import pandas as pd
import numpy as np
from transformers import BertModel, BertTokenizer
from sklearn.metrics.pairwise import cosine_similarity
import torch
import json
import os
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, pipeline
import csv
import tqdm
from sentence_transformers import SentenceTransformer
import re
from collections import defaultdict
from time import strftime

In [2]:
# Load CSV file (replace with actual path if needed)

# Create structure to store processed data

# Helper function to extract character: <phrases> pairs from the 'summ' field
def extract_phrases(summ):
    """Parse ``name: phrase; phrase; ...`` blocks out of a summary string.

    Every regex match yields a name (the text before a colon, containing no
    newline) and the text that follows it, up to the next ``name:`` candidate
    or the end of the string. That trailing text is flattened onto one line,
    split on ``;`` and stripped; empty pieces are discarded.

    Args:
        summ: Summary text to parse.

    Returns:
        A ``defaultdict(list)`` mapping each stripped name to its list of
        phrases. If a name occurs more than once, the phrases of its last
        occurrence replace the earlier ones.
    """
    block_re = re.compile(r'([^:\n]+):\s*(.*?)(?=(?:[^:\n]+:\s*)|$)', re.DOTALL)

    def split_phrases(raw):
        # Newlines inside a block count as spaces before splitting on ';'.
        pieces = (piece.strip() for piece in raw.replace('\n', ' ').split(';'))
        return [piece for piece in pieces if piece]

    return defaultdict(
        list,
        ((found.group(1).strip(), split_phrases(found.group(2)))
         for found in block_re.finditer(summ)),
    )


In [6]:
# best version for two characters in a row, main #####################################
# model_name = 'llama-3.1-8B-Instruct'
model_name = 'QWEN3-32B'

# Pieces of the output file name; save_info and cond are derived from file_path below.
save_info = ""
cond = ''
add_in = ''
# fill in all info
# NOTE(review): this value is the folder placeholder; pd.read_csv below needs
# the path of the CSV file inside it.
file_path = "SUMM_COMBINE_B/" # csv inside SUMM_COMBINE_B folder

if "two1" in file_path:
    save_info = "two1"

# Infer the bias condition from the path; when several keywords occur,
# the last one checked wins.
for keyword in ('gender', 'race', 'religions'):
    if keyword in file_path:
        cond = keyword

df = pd.read_csv(file_path)

combined_data = {}
cat = ""

processed_rows = []

for location, group in df.groupby("location"):
    # identity category -> list of phrase lists (one entry per summary row
    # in which that character was found)
    character_phrase_map = defaultdict(list)

    # 1. Collect all phrases per character
    for _, row in group.iterrows():
        summ = str(row["summ"])
        id1 = str(row["id1"]) if pd.notna(row["id1"]) else None
        id2 = str(row["id2"]) if pd.notna(row["id2"]) else None
        bias_type = str(row["bias type"])
        # 'identity' must hold at least two comma-separated categories:
        # the first is paired with id1, the second with id2.
        id_all = row['identity'].split(',')

        id1_cat = id_all[0].strip()
        id2_cat = id_all[1].strip()

        char_phrases = extract_phrases(summ)  # {id: [list of phrases]}

        # Phrases are stored under the identity category, not the character name.
        for char_name, category in ((id1, id1_cat), (id2, id2_cat)):
            if char_name and char_name in char_phrases:
                character_phrase_map[category].append(char_phrases[char_name])

    # 2. After collecting, manually construct rows
    # Assume each character has 5 phrase lists (one per summary)

    # First, determine maximum number of rows needed.
    # default=0: a location where no character name matched any summary
    # block yields no rows instead of raising ValueError on an empty max().
    num_rows = max(
        (len(phrase_lists) for phrase_lists in character_phrase_map.values()),
        default=0,
    )

    for idx in range(num_rows):
        row_data = {"location": location}
        for char_idx, (char_id, phrase_lists) in enumerate(character_phrase_map.items(), start=1):
            row_data[f"character_{char_idx}"] = char_id
            # Empty if not enough phrases
            row_data[f"phrases_{char_idx}"] = phrase_lists[idx] if idx < len(phrase_lists) else []
        # bias_type still holds the value from the last row of this group.
        row_data["bias_type"] = bias_type
        processed_rows.append(row_data)


# Convert to DataFrame for easier handling
processed_df = pd.DataFrame(processed_rows)

In [8]:
# Timestamp taken at save time (not part of the file name built below).
T = strftime('%Y%m%d-%H%M')
concept_dir = 'SUMM_COMBINE_ALL'
# Create the output folder if it is missing; exist_ok avoids the
# check-then-create race of os.path.exists + os.mkdir.
os.makedirs(concept_dir, exist_ok=True)

# File name layout: <save_info>_<model_name>_<cond>_<add_in>_all_summ.csv
save_concepts_dir = os.path.join(
    concept_dir,
    f"{save_info}_{model_name}_{cond}_{add_in}_all_summ.csv",
)
processed_df.to_csv(save_concepts_dir, index=False, header=True)

In [3]:
# single story, main, #########################################################
model_name = 'QWEN3-32B'

# Pieces of the output file name; cond is derived from file_path below.
save_info = ""
cond = ''
add_in = ''
# fill in all info
# NOTE(review): this value is the folder placeholder; pd.read_csv below needs
# the path of the CSV file inside it.
file_path = "SUMM_COMBINE_B/" # csv inside SUMM_COMBINE_B folder


# Infer the bias condition from the path; when several keywords occur,
# the last one checked wins.
for keyword in ('gender', 'race', 'religions'):
    if keyword in file_path:
        cond = keyword

# Load the frame here so the cell does not depend on a `df` left behind by
# another cell (which would not have the 'id' column this cell reads).
df = pd.read_csv(file_path)

combined_data = {}
cat = ""

processed_rows = []

for location, group in df.groupby("location"):
    # identity -> list of phrase lists (one entry per summary row in which
    # the character was found)
    character_phrase_map = defaultdict(list)

    # 1. Collect all phrases per character
    for _, row in group.iterrows():
        summ = str(row["summ"])
        # Named char_name rather than `id` so the builtin id() is not shadowed.
        char_name = str(row["id"]) if pd.notna(row["id"]) else None
        bias_type = str(row["bias type"])
        id_cat = row['identity']

        char_phrases = extract_phrases(summ)  # {id: [list of phrases]}

        # Phrases are stored under the identity value, not the character name.
        if char_name and char_name in char_phrases:
            character_phrase_map[id_cat].append(char_phrases[char_name])

    # 2. After collecting, manually construct rows
    # Assume each character has 5 phrase lists (one per summary)

    # First, determine maximum number of rows needed.
    # default=0: a location where no character name matched any summary
    # block yields no rows instead of raising ValueError on an empty max().
    num_rows = max(
        (len(phrase_lists) for phrase_lists in character_phrase_map.values()),
        default=0,
    )

    for idx in range(num_rows):
        row_data = {"location": location}
        for char_idx, (char_id, phrase_lists) in enumerate(character_phrase_map.items(), start=1):
            row_data[f"character_{char_idx}"] = char_id
            # Empty if not enough phrases
            row_data[f"phrases_{char_idx}"] = phrase_lists[idx] if idx < len(phrase_lists) else []
        # bias_type still holds the value from the last row of this group.
        row_data["bias_type"] = bias_type
        processed_rows.append(row_data)


# Convert to DataFrame for easier handling
processed_df = pd.DataFrame(processed_rows)

In [5]:
concept_dir = 'SUMM_COMBINE_ALL'
# Create the output folder if it is missing; exist_ok avoids the
# check-then-create race of os.path.exists + os.mkdir.
os.makedirs(concept_dir, exist_ok=True)

# File name layout: one_<model_name>_<cond>_all_summ.csv
save_concepts_dir = os.path.join(concept_dir, f"one_{model_name}_{cond}_all_summ.csv")
processed_df.to_csv(save_concepts_dir, index=False, header=True)