# Long Prompt
<br>
James Chapman<br>
CIS 830 Advanced Topics in AI – Term Project<br>
Kansas State University<br><br>

This notebook classifies social media posts for ILLICIT/ABUSIVE drug use.<br>

- Uses 'long_prompt'
- Runs 4 LLMs (2 Paid Services & 2 Local HuggingFace Models)
    - o4-mini 
    - GPT-4o-mini
    - Meta-Llama-3.1-8B-Instruct
    - Qwen-4B

**Note:** The response from every LLM call is saved to its own text file.

In [None]:
import pandas as pd
import json
import os
import re
from tqdm import tqdm
tqdm.pandas()

from utils import (
    get_tweets_dataset, 
    extract_responses,
    extract_JSON_labels_and_explanations,
    extract_drug_labels,
    extract_T_F_labels, 
    match_terms,
    get_confusion_matrix_and_metrics,
)
from llm_manager import (
    run_prompt_on_llm,
### paid #################
    # get_claude_response, 
    # get_openai_response, 
    # get_perplexity_response,
    get_openai_gpt4omini_response,
    get_openai_o4mini_response,
### local #################
    get_llama_response, 
    get_qwen_4b_response,                        
    #get_deepseek_qwen_response,
    # get_qwen_response,
    # get_mistral_response,
    cleanup_llama,
    cleanup_qwen,
    cleanup_qwen_4b,
    cleanup_deepseek_qwen,
    cleanup_mistral,
)

In [None]:
# Few-shot classification prompt passed to every model run in this notebook.
# `{{tweet_text}}` is a literal placeholder (this is a plain string, not an f-string);
# presumably `run_prompt_on_llm` substitutes each post's text for it -- TODO confirm.
# NOTE(review): the closing instruction now asks for only the JSON object so it agrees
# with the "Output: JSON format" section and the JSON-formatted examples. It previously
# read "(Do NOT include anything except True or False)", which contradicted both.
long_prompt = """
### Role
You are an expert content moderator specializing in identifying drug-related content on social media.

### Task
Classify the content as either ILLICIT/ABUSIVE-DRUG-RELATED (True) or NOT ILLICIT/ABUSIVE-DRUG-RELATED (False).
Provide a one sentence explanation for your classification.

### Output: JSON format
{
    "label": "True" or "False",
    "explanation": "Your reasoning for the classification."
}       

### Definitions
True include content that: 
- Mentions illegal, RX, or OTC drugs in the context of abuse or misuse (using, buying, promoting, etc.) 
- Uses slang, euphemisms, or cultural references that signal recreational drug use or drugabuse.
- Encourages or glorifies drug misuse

False include content that:
- Mentions drugs in a medical, educational, or preventative context (discouraging)
- Uses drug-related terms in a different context or unrelated topics

--- Example #1 ---
Content: I got her hooked on that lean everyday she say she want codeine
{"label": "True", "explanation": "The content mentions 'lean' and 'codeine,' and explains 'hooked' 'everyday' with an approving, supportive tone.."}

--- Example #2 ---
 Content: Pharmacology lecture today covered how buprenorphine helps treat opioid addiction
{"label": "False", "explanation": "Educational context discussing medication assisted treatment, no promotion of misuse."}

(Respond with ONLY the JSON object described above; "label" must be exactly "True" or "False".)
Content: {{tweet_text}}
"""

In [2]:
# Fixed seed so the 1,000-row subsample below is reproducible.
SEED = 777

# Load the dataset, draw the subsample without replacement, then put the rows
# back in their original index order and renumber them from 0.
tweets = get_tweets_dataset()
tweets = tweets.sample(n=1_000, replace=False, random_state=SEED)
tweets = tweets.sort_index().reset_index(drop=True)

tweets.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   text                       1000 non-null   object
 1   label                      1000 non-null   object
 2   tweet_num                  1000 non-null   int64 
 3   found_terms                1000 non-null   object
 4   found_index_terms          1000 non-null   object
 5   GPT_found_terms            1000 non-null   object
 6   GPT_found_index_terms      1000 non-null   object
 7   pubchem_found_terms        1000 non-null   object
 8   pubchem_found_index_terms  1000 non-null   object
 9   redmed_found_terms         1000 non-null   object
 10  redmed_found_index_terms   1000 non-null   object
 11  DEA_found_terms            1000 non-null   object
 12  DEA_found_index_terms      1000 non-null   object
dtypes: int64(1), object(12)
memory usage: 101.7+ KB


# RUN 4 MODELS: long_prompt

In [None]:
# Run long_prompt through each model in turn: the two paid OpenAI models first,
# then the two local models. A local model's cleanup hook is called right after
# its run. The returned responses are not attached to `tweets` here.
model_runs = [
    (get_openai_o4mini_response, "o4mini", None),        # o4-mini
    (get_openai_gpt4omini_response, "gpt4omini", None),  # GPT-4o-mini
    (get_llama_response, "llama", cleanup_llama),        # Meta-Llama-3.1-8B-Instruct
    (get_qwen_4b_response, "qwen_4b", cleanup_qwen_4b),  # Qwen-4B
]

for get_response, model_tag, cleanup in model_runs:
    responses = run_prompt_on_llm(get_response, model_tag, long_prompt, tweets)
    if cleanup is not None:
        cleanup()

In [None]:
# :)  End of notebook. (Kept as a comment: a bare `:)` is a SyntaxError in a code cell.)