In [1]:
import pandas as pd
import numpy as np
import json
import ast
import os
import re
from sklearn.metrics import (
    accuracy_score, precision_score,
    recall_score, f1_score, confusion_matrix,
    jaccard_score,
    precision_score,
    hamming_loss
)
from sklearn.preprocessing import MultiLabelBinarizer

from tqdm import tqdm
tqdm.pandas()

from utils import (
    get_tweets_dataset, 
    extract_responses,
    extract_JSON_labels_and_explanations,
    extract_RAG_drug_labels,
    extract_drug_labels,
    extract_T_F_labels, 
    match_terms,
    get_confusion_matrix_and_metrics,
)
from llm_manager import (
    run_prompt_on_llm,
### paid #################
    # get_claude_response, 
    # get_openai_response, 
    # get_perplexity_response,
    get_openai_gpt4omini_response,
    get_openai_o4mini_response,
### local #################
    get_llama_response, 
    get_qwen_4b_response,                        
    #get_deepseek_qwen_response,
    # get_qwen_response,
    # get_mistral_response,
    cleanup_llama,
    cleanup_qwen,
    cleanup_qwen_4b,
    cleanup_deepseek_qwen,
    cleanup_mistral,
)

In [2]:
tweets = get_tweets_dataset()
SEED = 777
tweets = ( tweets.sample(n=1_000, random_state=SEED, replace=False)
                 .sort_index()
                 .reset_index(drop=True)
)
tweets.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   text                       1000 non-null   object
 1   label                      1000 non-null   object
 2   tweet_num                  1000 non-null   int64 
 3   found_terms                1000 non-null   object
 4   found_index_terms          1000 non-null   object
 5   GPT_found_terms            1000 non-null   object
 6   GPT_found_index_terms      1000 non-null   object
 7   pubchem_found_terms        1000 non-null   object
 8   pubchem_found_index_terms  1000 non-null   object
 9   redmed_found_terms         1000 non-null   object
 10  redmed_found_index_terms   1000 non-null   object
 11  DEA_found_terms            1000 non-null   object
 12  DEA_found_index_terms      1000 non-null   object
dtypes: int64(1), object(12)
memory usage: 101.7+ KB


# Long prompt

In [3]:
# Collect responses from saved files, get labels and explanations
responses = extract_responses(tweets, "gpt4omini")
tweets["4o_mini_response"] = responses
labels, explanations = extract_JSON_labels_and_explanations(tweets, "gpt4omini")
tweets["4o_mini_label"] = labels
tweets["4o_mini_explanation"] = explanations

responses = extract_responses(tweets, "o4mini")
tweets["o4mini_response"] = responses   
labels, explanations = extract_JSON_labels_and_explanations(tweets, "o4mini")
tweets["o4mini_label"] = labels
tweets["o4mini_explanation"] = explanations

responses = extract_responses(tweets, "qwen_4b")
tweets["qwen_4b_response"] = responses
labels, explanations = extract_JSON_labels_and_explanations(tweets, "qwen_4b")
tweets["qwen_4b_label"] = labels
tweets["qwen_4b_explanation"] = explanations

responses = extract_responses(tweets, "llama")
tweets["llama_response"] = responses
labels, explanations = extract_JSON_labels_and_explanations(tweets, "llama")
tweets["llama_label"] = labels
tweets["llama_explanation"] = explanations

tweets.info(verbose=True)


100%|██████████| 1000/1000 [00:00<00:00, 16075.43it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15629.28it/s]
100%|██████████| 1000/1000 [00:00<00:00, 17007.50it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15169.33it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15361.11it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14511.41it/s]
100%|██████████| 1000/1000 [00:00<00:00, 16340.40it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15063.10it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   text                       1000 non-null   object
 1   label                      1000 non-null   object
 2   tweet_num                  1000 non-null   int64 
 3   found_terms                1000 non-null   object
 4   found_index_terms          1000 non-null   object
 5   GPT_found_terms            1000 non-null   object
 6   GPT_found_index_terms      1000 non-null   object
 7   pubchem_found_terms        1000 non-null   object
 8   pubchem_found_index_terms  1000 non-null   object
 9   redmed_found_terms         1000 non-null   object
 10  redmed_found_index_terms   1000 non-null   object
 11  DEA_found_terms            1000 non-null   object
 12  DEA_found_index_terms      1000 non-null   object
 13  4o_mini_response           1000 non-null   object
 14  4o_mini_l




# Short prompt

In [4]:
# Collect responses from saved files, get labels
responses = extract_responses(tweets, "gpt4omini_short")
tweets["4o_mini_response_short"] = responses
labels = extract_T_F_labels(tweets, "gpt4omini_short")
tweets["4o_mini_label_short"] = labels

responses = extract_responses(tweets, "o4mini_short")
tweets["o4mini_response_short"] = responses   
labels = extract_T_F_labels(tweets, "o4mini_short")
tweets["o4mini_label_short"] = labels

responses = extract_responses(tweets, "qwen_4b_short")
tweets["qwen_4b_response_short"] = responses
labels = extract_T_F_labels(tweets, "qwen_4b_short")
tweets["qwen_4b_label_short"] = labels

responses = extract_responses(tweets, "llama_short")
tweets["llama_response_short"] = responses
labels = extract_T_F_labels(tweets, "llama_short")
tweets["llama_label_short"] = labels

tweets.info(verbose=True)

100%|██████████| 1000/1000 [00:00<00:00, 16482.77it/s]
100%|██████████| 1000/1000 [00:00<00:00, 16947.85it/s]
100%|██████████| 1000/1000 [00:00<00:00, 16553.22it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15955.81it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15498.64it/s]
100%|██████████| 1000/1000 [00:00<00:00, 12433.50it/s]
100%|██████████| 1000/1000 [00:00<00:00, 16565.38it/s]
100%|██████████| 1000/1000 [00:00<00:00, 16043.94it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   text                       1000 non-null   object
 1   label                      1000 non-null   object
 2   tweet_num                  1000 non-null   int64 
 3   found_terms                1000 non-null   object
 4   found_index_terms          1000 non-null   object
 5   GPT_found_terms            1000 non-null   object
 6   GPT_found_index_terms      1000 non-null   object
 7   pubchem_found_terms        1000 non-null   object
 8   pubchem_found_index_terms  1000 non-null   object
 9   redmed_found_terms         1000 non-null   object
 10  redmed_found_index_terms   1000 non-null   object
 11  DEA_found_terms            1000 non-null   object
 12  DEA_found_index_terms      1000 non-null   object
 13  4o_mini_response           1000 non-null   object
 14  4o_mini_l




# Entity linking

In [5]:
# Collect responses from saved files, get labels and explanations
responses = extract_responses(tweets, "gpt4omini_entity_linking")
tweets["4o_mini_response_entity_linking"] = responses
drug_labels = extract_drug_labels(tweets, "gpt4omini_entity_linking")
tweets["4o_mini_drug_labels"] = drug_labels

responses = extract_responses(tweets, "o4mini_entity_linking")
tweets["o4mini_response_entity_linking"] = responses   
drug_labels = extract_drug_labels(tweets, "o4mini_entity_linking")
tweets["o4mini_drug_labels"] = drug_labels

responses = extract_responses(tweets, "qwen_4b_entity_linking")
tweets["qwen_4b_response_entity_linking"] = responses
drug_labels = extract_drug_labels(tweets, "qwen_4b_entity_linking")
tweets["qwen_4b_drug_labels"] = drug_labels

responses = extract_responses(tweets, "llama_entity_linking")
tweets["llama_response_entity_linking"] = responses
drug_labels = extract_drug_labels(tweets, "llama_entity_linking")
tweets["llama_drug_labels"] = drug_labels

tweets.info(verbose=True)

100%|██████████| 1000/1000 [00:00<00:00, 17214.96it/s]
100%|██████████| 1000/1000 [00:00<00:00, 11536.06it/s]
100%|██████████| 1000/1000 [00:00<00:00, 16575.26it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15162.14it/s]
100%|██████████| 1000/1000 [00:00<00:00, 15370.23it/s]
100%|██████████| 1000/1000 [00:00<00:00, 11131.65it/s]


Error: No JSON found in 'qwen_4b_entity_linking/44   <think>
Okay, let's tackle this query. The user wants me to moderate social media content for drug-related references. The task is to detect any words or phrases that refer to illicit drugs, link them to the correct index term, and return a JSON array.

First, I need to look at the post: "steal the night K WORD the lights feel it under your skin time is right uhhh Cuz its pulling you in pump it up you cant stop cuz it feels like an overdose FEELS LIKE AN OVERDOOOSE Bah BAh evacuate the dance floor bah bah im infected by the sound". 

I need to scan each word or phrase for possible drug references. Let's break it down. The phrase "K WORD" might be a slang term. Looking at the index terms, "K" is a common slang for cocaine, but cocaine isn't listed here. Wait, the index terms include "Codeine" with synonyms like "Lean", but "K" might refer to "Ketamine" (Special K). Let me check the index terms. Yes, "Ketamine" is listed with synonyms 

100%|██████████| 1000/1000 [00:00<00:00, 14657.51it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14270.03it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 41 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   text                             1000 non-null   object
 1   label                            1000 non-null   object
 2   tweet_num                        1000 non-null   int64 
 3   found_terms                      1000 non-null   object
 4   found_index_terms                1000 non-null   object
 5   GPT_found_terms                  1000 non-null   object
 6   GPT_found_index_terms            1000 non-null   object
 7   pubchem_found_terms              1000 non-null   object
 8   pubchem_found_index_terms        1000 non-null   object
 9   redmed_found_terms               1000 non-null   object
 10  redmed_found_index_terms         1000 non-null   object
 11  DEA_found_terms                  1000 non-null   object
 12  DEA_found_index_terms            10




In [None]:

# Collect responses from saved files, get labels and explanations
# responses = extract_responses(tweets, "gpt4omini_rag")
# tweets["4o_mini_response_rag"] = responses
# drug_labels = extract_RAG_drug_labels(tweets, "gpt4omini_rag")
# tweets["4o_mini_drug_labels"] = drug_labels

# # responses = extract_responses(tweets, "o4mini_rag")
# # tweets["o4mini_response_rag"] = responses   
# drug_labels = extract_RAG_drug_labels(tweets, "o4mini_rag")
# tweets["o4mini_drug_labels"] = drug_labels

# # responses = extract_responses(tweets, "qwen_4b_rag")
# # tweets["qwen_4b_response_rag"] = responses
# drug_labels = extract_RAG_drug_labels(tweets, "qwen_4b_rag")
# tweets["qwen_4b_drug_labels"] = drug_labels

# # responses = extract_responses(tweets, "llama_rag")
# # tweets["llama_response_rag"] = responses

import re
from pathlib import Path
from typing import List, Set
for number in range(1000):
    if  number <193:
        continue
    def extract_RAG_drug_labels(tweets: pd.DataFrame, directory: str) -> List[str]:
        # Regex to capture a JSON array block (including nested objects)
        ARRAY_BLOCK = re.compile(r"\[\s*(?:\{[\s\S]*?\}\s*,?\s*)+\]", re.DOTALL)
        THINK_RE   = re.compile(r"<think>(.*?)</think>", re.I | re.S)
        drug_labels = []
        for i, row in tqdm(tweets[:number].iterrows()):
            unique_terms: Set[str] = set()
            path = Path(f"data/{directory}/{i}.json")
            if not path.exists():
                drug_labels.append("NO_RAG_CHUNKS")
                continue
            with open(path, "r", encoding="utf-8-sig") as f:
                raw = f.read().strip()
            # remove <think> block
            m_think = THINK_RE.search(raw)
            if m_think:
                cleaned = raw[:m_think.start()] + raw[m_think.end():].strip()
            else:
                cleaned = raw.strip()
            # Find JSON block
            m = ARRAY_BLOCK.search(cleaned)
            if not m:
                if cleaned == "[]" or cleaned == "```json\n[]\n```":
                    drug_labels.append('')
                    continue
                else:
                    print(f"Error: No JSON found in '{directory}/{i}   {raw}'")
                    drug_labels.append('')
                    continue
            try:
                entries = json.loads(m.group(0))
            except json.JSONDecodeError as e:
                print(f"Error2: No JSON found in '{directory}/{i}   {raw}'")
                drug_labels.append("Error2")
                continue
            if isinstance(entries, list):
                for obj in entries:
                    term = obj.get("index_term")
                    if isinstance(term, str):
                        unique_terms.add(term)
                drug_labels.append(list(unique_terms))
            else:
                print(f"Error3: in '{directory}/{i}   {raw}'")
                drug_labels.append("Error3")
        print(f"processed'{len(drug_labels)}'")
        return drug_labels


    drug_labels = extract_RAG_drug_labels(tweets, "llama_rag")
# tweets["llama_drug_labels"] = drug_labels

tweets.info(verbose=True)

NameError: name 'pd' is not defined

In [None]:
# Print value counts for every label column in tweets
label_cols = [col for col in tweets.columns if "label" in col]
for col in label_cols:
    print(f"\nValue counts for '{col}':")
    print(tweets[col].value_counts(dropna=False))


Value counts for 'label':
label
T    713
F    287
Name: count, dtype: int64

Value counts for '4o_mini_label':
4o_mini_label
False    546
True     454
Name: count, dtype: int64

Value counts for 'o4mini_label':
o4mini_label
False    582
True     418
Name: count, dtype: int64

Value counts for 'qwen_4b_label':
qwen_4b_label
False    568
True     432
Name: count, dtype: int64

Value counts for 'llama_label':
llama_label
True     507
False    493
Name: count, dtype: int64

Value counts for '4o_mini_label_short':
4o_mini_label_short
F    580
T    420
Name: count, dtype: int64

Value counts for 'o4mini_label_short':
o4mini_label_short
F    581
T    419
Name: count, dtype: int64

Value counts for 'qwen_4b_label_short':
qwen_4b_label_short
T    529
F    471
Name: count, dtype: int64

Value counts for 'llama_label_short':
llama_label_short
F    561
T    439
Name: count, dtype: int64

Value counts for '4o_mini_drug_labels':
4o_mini_drug_labels
                                                  

In [None]:
# Find rows where not all label columns agree (all True or all False)
label_cols = [col for col in tweets.columns if "label" in col and col != "label"]
def not_all_agree(row):
    vals = [str(row[col]).strip().lower() for col in label_cols]
    # Only consider rows where all values are either 'true' or all 'false'
    return not (all(v == "true" for v in vals) or all(v == "false" for v in vals))

disagreeing_tweets = tweets[tweets.apply(not_all_agree, axis=1)].copy()
print(f"Number of rows where label columns do not all agree: {len(disagreeing_tweets)}")
disagreeing_tweets[label_cols + ["text"]].head()

Number of rows where label columns do not all agree: 1000


Unnamed: 0,4o_mini_label,o4mini_label,qwen_4b_label,llama_label,4o_mini_label_short,o4mini_label_short,qwen_4b_label_short,llama_label_short,4o_mini_drug_labels,o4mini_drug_labels,qwen_4b_drug_labels,llama_drug_labels,text
0,True,True,False,True,F,T,T,F,[other],[other],[other],"[Heroin, Cocaine]",The guy was hanged in Singapore because he was...
1,False,False,True,False,F,T,T,F,,[other],[other],[other],Ex heroin addict here and I endorse this messa...
2,False,False,False,False,F,F,F,F,,[Morphine],[Morphine],"[Morphine, other]","Yeah, my uncle's GF died a horribly painful, s..."
3,False,True,False,False,F,T,F,F,,[Marijuana],[Marijuana],"[other, delta‑8‑THC‑COOH, delta‑9‑THC‑COOH, CBD]",Innovation drives shift in cannabis product de...
4,True,False,False,True,T,T,F,T,[Codeine],[Codeine],[Codeine],[Codeine],"When that codeine had u knocked all day, now u..."


In [None]:
temporary = tweets.copy()

In [None]:
tweets = temporary.copy()


In [None]:
truth_labels = pd.read_csv('data/labels.csv', encoding="utf-8-sig")
truth_labels = truth_labels.drop(['text'], axis=1)
tweets = pd.merge(tweets, truth_labels, on="tweet_num", how="left")
tweets.info(verbose=True)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 43 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   text                             1000 non-null   object
 1   label                            1000 non-null   object
 2   tweet_num                        1000 non-null   int64 
 3   found_terms                      1000 non-null   object
 4   found_index_terms                1000 non-null   object
 5   GPT_found_terms                  1000 non-null   object
 6   GPT_found_index_terms            1000 non-null   object
 7   pubchem_found_terms              1000 non-null   object
 8   pubchem_found_index_terms        1000 non-null   object
 9   redmed_found_terms               1000 non-null   object
 10  redmed_found_index_terms         1000 non-null   object
 11  DEA_found_terms                  1000 non-null   object
 12  DEA_found_index_terms            10

In [None]:

tweets = temporary.copy()
truth_labels = pd.read_csv('data/labels.csv', encoding="utf-8-sig")
truth_labels = truth_labels.drop(['text'], axis=1)
tweets = pd.merge(tweets, truth_labels, on="tweet_num", how="left")

TRUTH_COL    = "is_about_illicit_drugs"
LIST_OF_DRUGS = "drugs_mentioned"
PRED_COLS    = [
    "o4mini_label", "4o_mini_label", "llama_label", "qwen_4b_label",
    "o4mini_label_short", "4o_mini_label_short", "llama_label_short",
    "qwen_4b_label_short", "label"
]
LIST_COLS    = [
    'o4mini_drug_labels', '4o_mini_drug_labels', 'llama_drug_labels', 'qwen_4b_drug_labels',
    'found_index_terms', 'DEA_found_index_terms', 'pubchem_found_index_terms',
    'GPT_found_index_terms', 'redmed_found_index_terms'
]

def as_list(x):
    if isinstance(x, (list, tuple, set, np.ndarray)):
        return list(x)
    if isinstance(x, str):
        try:
            return ast.literal_eval(x)
        except Exception:
            try:
                return json.loads(x)
            except Exception:
                return [x.strip()]
    if pd.isna(x):
        return []
    return [x]

tweets[LIST_OF_DRUGS] = tweets[LIST_OF_DRUGS].apply(as_list)
for col in LIST_COLS:
    tweets[col] = tweets[col].apply(as_list)
for col in LIST_COLS:
    tweets[col] = tweets[col].apply(
        lambda lst: [str(item).lower().strip() for item in lst]
    )
all_label_lists = tweets[LIST_OF_DRUGS].tolist()
for col in LIST_COLS:
    all_label_lists.extend(tweets[col].tolist())

def clean(lst):
    return [s for s in lst if str(s).strip() != ""]

all_label_lists = [clean(lst) for lst in all_label_lists]

mlb = MultiLabelBinarizer()
mlb.fit(all_label_lists)                          # fit on union

Y_true = mlb.transform(tweets[LIST_OF_DRUGS].apply(clean))

results = {}
for col in LIST_COLS:
    Y_pred = mlb.transform(tweets[col].apply(clean))

    exact = accuracy_score(Y_true, Y_pred)
    # jacc  = jaccard_score(Y_true, Y_pred, average="samples")
    mask = ~((Y_true.sum(axis=1) == 0) & (Y_pred.sum(axis=1) == 0))
    jacc  = jaccard_score(Y_true[mask], Y_pred[mask], average="samples")


    prec  = precision_score(Y_true, Y_pred, average="micro", zero_division=0)
    rec   = recall_score(Y_true, Y_pred, average="micro", zero_division=0)
    f1    = f1_score(Y_true, Y_pred, average="micro", zero_division=0)
    ham   = hamming_loss(Y_true, Y_pred)

    results[col] = {
        "exact_match": round(exact, 4),
        "jaccard":     round(jacc, 4),
        "micro_prec":  round(prec, 4),
        "micro_rec":   round(rec, 4),
        "micro_f1":    round(f1, 4),
        "hamming":     round(ham, 4),
    }

metrics_df = pd.DataFrame(results).T
display(metrics_df)


Unnamed: 0,exact_match,jaccard,micro_prec,micro_rec,micro_f1,hamming
o4mini_drug_labels,0.329,0.0171,0.0175,0.2027,0.0322,0.0106
4o_mini_drug_labels,0.465,0.0154,0.016,0.1486,0.0288,0.0087
llama_drug_labels,0.026,0.0065,0.0065,0.1081,0.0123,0.0151
qwen_4b_drug_labels,0.19,0.0136,0.0148,0.2027,0.0276,0.0124
found_index_terms,0.5,0.0,0.0,0.0,0.0,0.0085
DEA_found_index_terms,0.55,0.0,0.0,0.0,0.0,0.0071
pubchem_found_index_terms,0.624,0.0,0.0,0.0,0.0,0.0054
GPT_found_index_terms,0.58,0.0,0.0,0.0,0.0,0.0064
redmed_found_index_terms,0.618,0.0,0.0,0.0,0.0,0.0055


In [None]:
tw =  tweets[['text', 'tweet_num', 
'o4mini_label', '4o_mini_label', 'llama_label', 'qwen_4b_label', 
'o4mini_label_short', '4o_mini_label_short', 'llama_label_short', 'qwen_4b_label_short',
 'label',
 'o4mini_drug_labels', '4o_mini_drug_labels', 'llama_drug_labels', 'qwen_4b_drug_labels',
 'found_index_terms',
  'DEA_found_index_terms', 'pubchem_found_index_terms', 'GPT_found_index_terms','redmed_found_index_terms']].copy()


In [None]:


# helpers
def to_bool(x):
    if isinstance(x, bool):         # already Boolean
        return x
    if isinstance(x, (int, float)): # 1 / 0 or 1.0 / 0.0
        return bool(x)
    if isinstance(x, str):
        return x.strip().upper() in {"TRUE", "T", "1"}
    return False

def list_to_bool(obj):
    if obj is None or obj == []:
        return False
    if isinstance(obj, str):
        try:
            import ast
            obj = ast.literal_eval(obj)
        except Exception:
            return False
    try:
        result = any(str(item).lower() != "other" for item in obj)
    except Exception as e:
        print(obj)
        print(e)
        return False
    return result

def metrics(y_true, y_pred):
    acc  = accuracy_score(y_true, y_pred)
    prec = precision_score(y_true, y_pred, zero_division=0)
    rec  = recall_score(y_true, y_pred, zero_division=0)
    f1   = f1_score(y_true, y_pred, zero_division=0)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred, labels=[False, True]).ravel()
    return pd.Series({
        "accuracy":        acc,
        "precision":       prec,
        "recall":          rec,
        "f1":              f1,
        "true_positives":  tp,
        "false_positives": fp,
        "true_negatives":  tn,
        "false_negatives": fn,
    })
                   
labeled_tweets = tweets[tweets[TRUTH_COL].notnull()].copy()
labeled_tweets["combined_truth"] = labeled_tweets.apply(
    lambda r: to_bool(r[TRUTH_COL]) and list_to_bool(r[LIST_OF_DRUGS]), axis=1,
)

labeled_tweets[TRUTH_COL] = labeled_tweets[TRUTH_COL].apply(to_bool)

for col in PRED_COLS:
    labeled_tweets[col] = labeled_tweets[col].apply(to_bool)

for col in LIST_COLS:
    labeled_tweets[col] = labeled_tweets[col].apply(list_to_bool)

PRED_COLS.extend(LIST_COLS)

results = pd.concat(
    {col: metrics(labeled_tweets[TRUTH_COL], labeled_tweets[col]) for col in PRED_COLS},
    axis=1
).T.round(4)

display(results)

Unnamed: 0,accuracy,precision,recall,f1,true_positives,false_positives,true_negatives,false_negatives
o4mini_label,0.8205,0.6957,0.8205,0.7529,32.0,14.0,64.0,7.0
4o_mini_label,0.7863,0.6522,0.7692,0.7059,30.0,16.0,62.0,9.0
llama_label,0.7265,0.566,0.7692,0.6522,30.0,23.0,55.0,9.0
qwen_4b_label,0.7778,0.6444,0.7436,0.6905,29.0,16.0,62.0,10.0
o4mini_label_short,0.8205,0.7143,0.7692,0.7407,30.0,12.0,66.0,9.0
4o_mini_label_short,0.7863,0.675,0.6923,0.6835,27.0,13.0,65.0,12.0
llama_label_short,0.7949,0.6829,0.7179,0.7,28.0,13.0,65.0,11.0
qwen_4b_label_short,0.6838,0.5172,0.7692,0.6186,30.0,28.0,50.0,9.0
label,0.5214,0.3951,0.8205,0.5333,32.0,49.0,29.0,7.0
o4mini_drug_labels,0.3932,0.3367,0.8462,0.4818,33.0,65.0,13.0,6.0


In [None]:
:)

SyntaxError: unmatched ')' (1896645534.py, line 1)