In [1]:
import pandas as pd
import json
import os
import re
from tqdm import tqdm
tqdm.pandas()

from utils import (
    get_tweets_dataset, 
    extract_responses,
    extract_JSON_labels_and_explanations,
    extract_drug_labels,
    extract_T_F_labels, 
    match_terms,
    get_confusion_matrix_and_metrics,
)
from llm_manager import (
    run_prompt_on_llm,
### paid #################
    # get_claude_response, 
    # get_openai_response, 
    # get_perplexity_response,
    get_openai_gpt4omini_response,
    get_openai_o4mini_response,
### local #################
    get_llama_response, 
    get_qwen_4b_response,                        
    #get_deepseek_qwen_response,
    # get_qwen_response,
    # get_mistral_response,
    cleanup_llama,
    cleanup_qwen,
    cleanup_qwen_4b,
    cleanup_deepseek_qwen,
    cleanup_mistral,
)

In [2]:
tweets = get_tweets_dataset()
SEED = 777
tweets = ( tweets.sample(n=1_000, random_state=SEED, replace=False)
                 .sort_index()
                 .reset_index(drop=True)
)
tweets.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   text                       1000 non-null   object
 1   label                      1000 non-null   object
 2   tweet_num                  1000 non-null   int64 
 3   found_terms                1000 non-null   object
 4   found_index_terms          1000 non-null   object
 5   GPT_found_terms            1000 non-null   object
 6   GPT_found_index_terms      1000 non-null   object
 7   pubchem_found_terms        1000 non-null   object
 8   pubchem_found_index_terms  1000 non-null   object
 9   redmed_found_terms         1000 non-null   object
 10  redmed_found_index_terms   1000 non-null   object
 11  DEA_found_terms            1000 non-null   object
 12  DEA_found_index_terms      1000 non-null   object
dtypes: int64(1), object(12)
memory usage: 101.7+ KB


# Long prompt

In [3]:
# Collect responses from saved files, get labels and explanations
responses = extract_responses(tweets, "gpt4omini")
tweets["4o_mini_response"] = responses
labels, explanations = extract_JSON_labels_and_explanations(tweets, "gpt4omini")
tweets["4o_mini_label"] = labels
tweets["4o_mini_explanation"] = explanations

responses = extract_responses(tweets, "o4mini")
tweets["o4mini_response"] = responses   
labels, explanations = extract_JSON_labels_and_explanations(tweets, "o4mini")
tweets["o4mini_label"] = labels
tweets["o4mini_explanation"] = explanations

responses = extract_responses(tweets, "qwen_4b")
tweets["qwen_4b_response"] = responses
labels, explanations = extract_JSON_labels_and_explanations(tweets, "qwen_4b")
tweets["qwen_4b_label"] = labels
tweets["qwen_4b_explanation"] = explanations

responses = extract_responses(tweets, "llama")
tweets["llama_response"] = responses
labels, explanations = extract_JSON_labels_and_explanations(tweets, "llama")
tweets["llama_label"] = labels
tweets["llama_explanation"] = explanations

tweets.info(verbose=True)


100%|██████████| 1000/1000 [00:00<00:00, 9428.96it/s]
100%|██████████| 1000/1000 [00:00<00:00, 13210.82it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9497.95it/s]
100%|██████████| 1000/1000 [00:00<00:00, 12377.14it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8780.90it/s]
100%|██████████| 1000/1000 [00:00<00:00, 11916.52it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9392.92it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14398.42it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   text                       1000 non-null   object
 1   label                      1000 non-null   object
 2   tweet_num                  1000 non-null   int64 
 3   found_terms                1000 non-null   object
 4   found_index_terms          1000 non-null   object
 5   GPT_found_terms            1000 non-null   object
 6   GPT_found_index_terms      1000 non-null   object
 7   pubchem_found_terms        1000 non-null   object
 8   pubchem_found_index_terms  1000 non-null   object
 9   redmed_found_terms         1000 non-null   object
 10  redmed_found_index_terms   1000 non-null   object
 11  DEA_found_terms            1000 non-null   object
 12  DEA_found_index_terms      1000 non-null   object
 13  4o_mini_response           1000 non-null   object
 14  4o_mini_l




# Short prompt

In [4]:
# Collect responses from saved files, get labels
responses = extract_responses(tweets, "gpt4omini_short")
tweets["4o_mini_response_short"] = responses
labels = extract_T_F_labels(tweets, "gpt4omini_short")
tweets["4o_mini_label_short"] = labels

responses = extract_responses(tweets, "o4mini_short")
tweets["o4mini_response_short"] = responses   
labels = extract_T_F_labels(tweets, "o4mini_short")
tweets["o4mini_label_short"] = labels

responses = extract_responses(tweets, "qwen_4b_short")
tweets["qwen_4b_response_short"] = responses
labels = extract_T_F_labels(tweets, "qwen_4b_short")
tweets["qwen_4b_label_short"] = labels

responses = extract_responses(tweets, "llama_short")
tweets["llama_response_short"] = responses
labels = extract_T_F_labels(tweets, "llama_short")
tweets["llama_label_short"] = labels

tweets.info(verbose=True)

100%|██████████| 1000/1000 [00:00<00:00, 10118.22it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14295.47it/s]
100%|██████████| 1000/1000 [00:00<00:00, 10443.15it/s]
100%|██████████| 1000/1000 [00:00<00:00, 16364.69it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9136.23it/s]
100%|██████████| 1000/1000 [00:00<00:00, 11894.28it/s]
100%|██████████| 1000/1000 [00:00<00:00, 7336.29it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14602.19it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 33 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   text                       1000 non-null   object
 1   label                      1000 non-null   object
 2   tweet_num                  1000 non-null   int64 
 3   found_terms                1000 non-null   object
 4   found_index_terms          1000 non-null   object
 5   GPT_found_terms            1000 non-null   object
 6   GPT_found_index_terms      1000 non-null   object
 7   pubchem_found_terms        1000 non-null   object
 8   pubchem_found_index_terms  1000 non-null   object
 9   redmed_found_terms         1000 non-null   object
 10  redmed_found_index_terms   1000 non-null   object
 11  DEA_found_terms            1000 non-null   object
 12  DEA_found_index_terms      1000 non-null   object
 13  4o_mini_response           1000 non-null   object
 14  4o_mini_l




# Entity linking

In [5]:
# Collect responses from saved files, get labels and explanations
responses = extract_responses(tweets, "gpt4omini_entity_linking")
tweets["4o_mini_response_entity_linking"] = responses
drug_labels = extract_drug_labels(tweets, "gpt4omini_entity_linking")
tweets["4o_mini_drug_labels"] = drug_labels

responses = extract_responses(tweets, "o4mini_entity_linking")
tweets["o4mini_response_entity_linking"] = responses   
drug_labels = extract_drug_labels(tweets, "o4mini_entity_linking")
tweets["o4mini_drug_labels"] = drug_labels

responses = extract_responses(tweets, "qwen_4b_entity_linking")
tweets["qwen_4b_response_entity_linking"] = responses
drug_labels = extract_drug_labels(tweets, "qwen_4b_entity_linking")
tweets["qwen_4b_drug_labels"] = drug_labels

responses = extract_responses(tweets, "llama_entity_linking")
tweets["llama_response_entity_linking"] = responses
drug_labels = extract_drug_labels(tweets, "llama_entity_linking")
tweets["llama_drug_labels"] = drug_labels

tweets.info(verbose=True)

100%|██████████| 1000/1000 [00:00<00:00, 8645.54it/s]
100%|██████████| 1000/1000 [00:00<00:00, 14340.19it/s]
100%|██████████| 1000/1000 [00:00<00:00, 9901.52it/s]
100%|██████████| 1000/1000 [00:00<00:00, 12734.94it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8575.50it/s]
100%|██████████| 1000/1000 [00:00<00:00, 8978.51it/s]


Error: No JSON found in 'qwen_4b_entity_linking/9   <think>
Okay, let's tackle this query. The user wants me to moderate social media content for drug-related references. The task is to detect any words or phrases that refer to illicit drugs or substances, link them to the appropriate Index Terms, and return a JSON array.

First, I need to look at the provided post: "These numbers equate to more than one overdose each day, and it's not uncommon for the same people to overdose again and again. In these first three months of 2021, we have responded to another 136 overdoses and within the same time period."

I need to scan each word and phrase for any drug references. Let's break it down sentence by sentence.

The first sentence talks about overdoses, but doesn't mention any specific drugs. Words like "overdose" are mentioned, but the Index Terms don't include "overdose" itself. The task is to find references to drugs, not the act of overdosing. So "overdose" here is just a general term, 

100%|██████████| 1000/1000 [00:00<00:00, 9565.10it/s]
100%|██████████| 1000/1000 [00:00<00:00, 12736.49it/s]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 41 columns):
 #   Column                           Non-Null Count  Dtype 
---  ------                           --------------  ----- 
 0   text                             1000 non-null   object
 1   label                            1000 non-null   object
 2   tweet_num                        1000 non-null   int64 
 3   found_terms                      1000 non-null   object
 4   found_index_terms                1000 non-null   object
 5   GPT_found_terms                  1000 non-null   object
 6   GPT_found_index_terms            1000 non-null   object
 7   pubchem_found_terms              1000 non-null   object
 8   pubchem_found_index_terms        1000 non-null   object
 9   redmed_found_terms               1000 non-null   object
 10  redmed_found_index_terms         1000 non-null   object
 11  DEA_found_terms                  1000 non-null   object
 12  DEA_found_index_terms            10




In [6]:
# Print value counts for every label column in tweets
label_cols = [col for col in tweets.columns if "label" in col]
for col in label_cols:
    print(f"\nValue counts for '{col}':")
    print(tweets[col].value_counts(dropna=False))


Value counts for 'label':
label
T    713
F    287
Name: count, dtype: int64

Value counts for '4o_mini_label':
4o_mini_label
False    546
True     454
Name: count, dtype: int64

Value counts for 'o4mini_label':
o4mini_label
False    582
True     418
Name: count, dtype: int64

Value counts for 'qwen_4b_label':
qwen_4b_label
False    568
True     432
Name: count, dtype: int64

Value counts for 'llama_label':
llama_label
True     507
False    493
Name: count, dtype: int64

Value counts for '4o_mini_label_short':
4o_mini_label_short
F    580
T    420
Name: count, dtype: int64

Value counts for 'o4mini_label_short':
o4mini_label_short
F    581
T    419
Name: count, dtype: int64

Value counts for 'qwen_4b_label_short':
qwen_4b_label_short
T    529
F    471
Name: count, dtype: int64

Value counts for 'llama_label_short':
llama_label_short
F    561
T    439
Name: count, dtype: int64

Value counts for '4o_mini_drug_labels':
4o_mini_drug_labels
                                                  

In [7]:
# Find rows where not all label columns agree (all True or all False)
label_cols = [col for col in tweets.columns if "label" in col and col != "label"]
def not_all_agree(row):
    vals = [str(row[col]).strip().lower() for col in label_cols]
    # Only consider rows where all values are either 'true' or all 'false'
    return not (all(v == "true" for v in vals) or all(v == "false" for v in vals))

disagreeing_tweets = tweets[tweets.apply(not_all_agree, axis=1)].copy()
print(f"Number of rows where label columns do not all agree: {len(disagreeing_tweets)}")
disagreeing_tweets[label_cols + ["text"]].head()

Number of rows where label columns do not all agree: 1000


Unnamed: 0,4o_mini_label,o4mini_label,qwen_4b_label,llama_label,4o_mini_label_short,o4mini_label_short,qwen_4b_label_short,llama_label_short,4o_mini_drug_labels,o4mini_drug_labels,qwen_4b_drug_labels,llama_drug_labels,text
0,True,True,False,True,F,T,T,F,[other],[other],[other],"[Heroin, Cocaine]",The guy was hanged in Singapore because he was...
1,False,False,True,False,F,T,T,F,,[other],[other],[other],Ex heroin addict here and I endorse this messa...
2,False,False,False,False,F,F,F,F,,[Morphine],[Morphine],"[other, Morphine]","Yeah, my uncle's GF died a horribly painful, s..."
3,False,True,False,False,F,T,F,F,,[Marijuana],[Marijuana],"[delta‑8‑THC‑COOH, delta‑9‑THC‑COOH, other, CBD]",Innovation drives shift in cannabis product de...
4,True,False,False,True,T,T,F,T,[Codeine],[Codeine],[Codeine],[Codeine],"When that codeine had u knocked all day, now u..."


In [8]:
temporary  = tweets.copy()

In [9]:
tweets  = temporary.copy()

In [12]:
tw =  tweets[['text', 'tweet_num', 
'o4mini_label', '4o_mini_label', 'llama_label', 'qwen_4b_label', 
'o4mini_label_short', '4o_mini_label_short', 'llama_label_short', 'qwen_4b_label_short',
 'label',
 'o4mini_drug_labels', '4o_mini_drug_labels', 'llama_drug_labels', 'qwen_4b_drug_labels',
 'found_index_terms',
  'DEA_found_index_terms', 'pubchem_found_index_terms', 'GPT_found_index_terms','redmed_found_index_terms']].copy()

tw.to_csv("t.csv", index=False, encoding="utf-8-sig")


In [11]:
:)

SyntaxError: unmatched ')' (1896645534.py, line 1)