# Prompt used

- Derived from the approach described in https://aclanthology.org/2022.nlperspectives-1.11/
- See the Hugging Face dataset and its human annotations, which were used to create the hatefulness rubric:
  https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speech

In [None]:
!pip install openai
from openai import OpenAI

import pandas as pd
import tqdm


# Colab-only setup: mount the user's Google Drive at /content/drive.
# NOTE(review): presumably the dataset read further down lives under this
# mount point -- confirm against the actual data path.
from google.colab import drive
drive.mount('/content/drive')

def emotion_hate(event, model_name='gpt-4o-mini'):
    """Score one text for hatefulness with an OpenAI chat model.

    Builds a rubric prompt around ``event``, sends it to ``model_name`` and
    parses the three labelled lines (``speech_type``, ``hate_score``,
    ``explanation``) that the prompt asks the model to emit. The raw reply is
    printed to stdout before parsing.

    Args:
        event: Text to classify; it is interpolated verbatim into the prompt.
        model_name: Model identifier passed to the chat-completions call.

    Returns:
        A dict with the keys ``speech_type``, ``score`` and ``explanation``:

        * Success: the parsed speech type, the score as an ``int`` and the
          explanation text.
        * A labelled line is missing, or the reply has no text:
          ``speech_type`` and ``score`` are ``None``, ``explanation`` is a
          fixed failure message and ``raw_output`` carries the raw reply.
        * The score cannot be read as a single integer: ``speech_type`` and
          ``score`` are the string ``"unknown"``.
    """
    prompt = f"""
      You are an expert in online hate speech detection. You will be provided with a text '{event}', your task is to analyze it and assign scores based on the criteria below.
      Follow this structured step-by-step approach and judge by the following criteria:
      Step 1: Identify the speech type:
        DIRECT: The text directly uses hateful language or expresses harmful sentiments
        REPORTING: The text quotes, references, or discusses hateful language within an educational, journalistic, or analytical context

      Step 2:Assign a hatefulness score based on the criteria that best matches the given text
        score 0: Neutral – Content lacking any offensive language, negative sentiment, or harmful implications.
        score 1: Mild Insensitivity – Content with potentially insensitive language but lacks clear negative intent or strong negative implications..
        score 2: Problematic Content – Content with clearly negative language, dismissive attitudes, or coded offensive phrases that marginalize certain groups or reinforce harmful ideas..
        score 3: Targeted Negative Content – Content targets specific identities in a harmful way through stereotypes, indirect attacks, or coded hate.
        score 4: Explicitly Hateful – Content with direct slurs, explicit dehumanization, violent threats, incitement to harm, or advocacy of discrimination.

      If content contains elements from multiple score levels, focus on the most severe elements present.

      Consider these important contextual factors:
        The likely intent behind the message
        Whether terms are being used self-referentially by in-group members
        Whether the content is quoting harmful language to discuss or critique it
        Cultural and historical context that may affect interpretation

      Step 3: Justify in 1-3 sentences why this score was chosen.

      Step 4: Output in this exact format:
        speech_type: chosen speech type from Step 1
        hate_score: chosen hatefullness score from Step 2
        explanation: brief reasoning from Step 3

      Ensure that the response strictly follows this format, with no additional explanation, commentary, or justification beyond the required output."""

    # NOTE(review): the key below is a placeholder literal; supply a real
    # credential before running and avoid committing it to the notebook.
    client = OpenAI(
        api_key='API Key'
    )

    system_prompt = "You are an expert in detecting and classifying harmful content online, with specific expertise in recognizing hate speech, implicit bias, and coded language."

    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    chat_completion = client.chat.completions.create(
        messages=messages,
        model=model_name,
        temperature=0.1,
    )
    output = chat_completion.choices[0].message.content

    print(output)

    # The reply text can be None; treat that like a reply with no labelled
    # lines instead of crashing on None.strip().
    lines = (output or "").splitlines()
    speech_type = _extract_field(lines, 'speech_type')
    score_text = _extract_field(lines, 'hate_score')
    explanation = _extract_field(lines, 'explanation')

    if speech_type is None or score_text is None or explanation is None:
        return {
            "speech_type": None,
            "score": None,
            "explanation": "Failed to parse model output",
            "raw_output": output,
        }

    try:
        score = _parse_score(score_text)
    except ValueError as e:
        print(f"Error processing response: {output} | {e}")
        return {"speech_type": "unknown", "score": "unknown", "explanation": "No valid explanation provided"}

    return {
        "speech_type": speech_type,
        "score": score,
        "explanation": explanation,
    }


def _extract_field(lines, key):
    """Return the text after ``key:`` on the first line labelled ``key``, or None.

    The label is matched case-insensitively. Leading indentation, list
    bullets and Markdown bold markers around the label are tolerated, because
    the model does not always reproduce the requested format exactly.
    """
    for line in lines:
        stripped = line.strip()
        # Drop bullets / an opening bold marker that precede the label.
        bare = stripped.lstrip("-*• \t")
        if bare[:len(key)].lower() != key:
            continue
        # Allow a closing bold marker or spaces between the label and colon.
        rest = bare[len(key):].lstrip("* \t")
        if not rest.startswith(":"):
            continue
        value = rest[1:].strip()
        # "**label:** value" leaves the closing bold marker on the value;
        # remove it only when the label really was opened with "**", so that
        # values of undecorated lines are returned untouched.
        decoration = stripped[:len(stripped) - len(bare)]
        if "**" in decoration and value.startswith("**"):
            value = value[2:].strip()
        return value
    return None


def _parse_score(text):
    """Convert the ``hate_score`` value to an int.

    A plain integer is accepted as-is. Otherwise the value is accepted only
    when it contains exactly one run of digits (e.g. ``"3 (Targeted)"`` or
    ``"score 3"``); ambiguous values such as ``"2-3"`` or ``"2.5"`` are
    rejected.

    Raises:
        ValueError: if no unambiguous integer can be read from ``text``.
    """
    import re

    try:
        return int(text)
    except ValueError:
        numbers = re.findall(r"\d+", text)
        if len(numbers) != 1:
            raise
        return int(numbers[0])


# Load the sample and attach the model's verdict to every row.
# The path has a .csv extension, so it is read with read_csv rather than
# read_parquet. The frame is bound to `dataset`, the name the statements
# below use; `df` is kept as an alias of the same object for any code that
# still refers to the original name.
dataset = pd.read_csv(".../AIES/WELFAKE/welfake_sample_dataset.csv")
df = dataset

# emotion_hate is called once per row of the 'text' column; each returned
# dict becomes one row of result_df.
result_df = dataset['text'].apply(lambda x: pd.Series(emotion_hate(x)))
for column in ('speech_type', 'score', 'explanation'):
    dataset[column] = result_df[column]


