In [4]:
# load dataset
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
# load from txt file
datapath = 'dataset/irony'
df = pd.read_csv(datapath+'/SemEval2018-T3-train-taskA.txt', sep='\t', header=None)
print(df.head())
labels = df[1]
texts = df[2]
# remove the first row in labels and texts
labels = labels[1:]
texts = texts[1:]
print(labels.head())
print(texts.head())

             0      1                                                  2
0  Tweet index  Label                                         Tweet text
1            1      1  Sweet United Nations video. Just in time for C...
2            2      1  @mrdahl87 We are rumored to have talked to Erv...
3            3      1  Hey there! Nice to see you Minnesota/ND Winter...
4            4      0                3 episodes left I'm dying over here
1    1
2    1
3    1
4    0
5    1
Name: 1, dtype: object
1    Sweet United Nations video. Just in time for C...
2    @mrdahl87 We are rumored to have talked to Erv...
3    Hey there! Nice to see you Minnesota/ND Winter...
4                  3 episodes left I'm dying over here
5    I can't breathe! was chosen as the most notabl...
Name: 2, dtype: object


# learn from various samples

In [11]:
import ollama

# Function to interact with the Ollama API
def chat_with_context(history):
    response = ollama.chat(model='llama3', messages=history)
    return response['message']['content']

# Initialize the chat history
chat_history = []

# Step 1: Generate Correctly Classified and Misclassified Samples
num_samples = 5
task_description = "We need to predict whether a given tweet is ironic or not. The labels are 'ironic' and 'not ironic'."

generate_samples_request = (
    f"As an advanced language model you should create {num_samples} samples for the task outlined below.\n"
    "Generate samples that are likely to be correctly classified as 'ironic' or 'not ironic' and samples that might be misclassified according to the task instructions.\n\n"
    f"### Task Description:\n{task_description}\n\n"
    "### Requirements for Samples:\n"
    "1. Each sample must present a unique and intricate challenge.\n"
    "2. The complexity of the samples should be such that simply applying the given task instruction would likely lead to incorrect or incomplete results for some samples.\n"
    "3. The samples should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.\n"
    "4. Ensure that the samples, while challenging, remain realistic and pertinent to the task's context.\n"
    "Generate the samples keeping these requirements in mind.\n###"
)

chat_history.append({'role': 'user', 'content': task_description})
chat_history.append({'role': 'user', 'content': generate_samples_request})

samples_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': samples_response})
print("Generated Samples:")
print(samples_response)

# Step 2: Analyze Samples with Chain of Thought
analyze_samples_request = (
    f"Here are some samples: {samples_response}\n"
    "Think step by step and derive general principles for classifying tweets as 'ironic' or 'not ironic'.\n"
    "Avoid focusing on specific details of the provided samples. Instead, develop broader, example-agnostic guidelines that can be applied universally to classify any news article. Conclude your analysis with clear, concise bullet points outlining:\n"
    "- The general characteristics that typically define each category.\n"
    "- Common mistakes that might lead to misclassifications and how to avoid them.\n"
    "- Guidelines under which circumstances each label should be predicted.\n"
    "These principles should help in accurately predicting the category of a tweet based on its content without additional context."
)

chat_history.append({'role': 'user', 'content': analyze_samples_request})

analysis_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': analysis_response})
print("Analysis of Samples:")
print(analysis_response)

# Step 3: Generate Optimized Prompt
generate_prompt_request = (
    f"Based on the following analysis: {analysis_response}\nGenerate an optimized prompt for predicting "
    "whether a tweet is 'ironic' or 'not ironic'. Ensure the model responds only with 'ironic' or 'not ironic'.\n\n"
    "### Requirements for Optimized Prompt:\n"
    "1. The prompt must include a clear description of the task and the labels.\n"
    "2. It should provide criteria for classifying tweets as 'ironic' or 'not ironic' based on the analysis.\n"
    "3. The prompt must ensure that the model responds strictly with 'ironic' or 'not ironic'.\n"
    "4. The prompt should help the model avoid common pitfalls and misclassifications identified during the analysis.\n"
    "5. Ensure the language is unambiguous and tailored to maximize the model's prediction accuracy.\n"
    "6. Encourage the model to think step by step.\n"
    "Respond with no other explanation but only the content of the prompt that is ready for the model to predict\n"
    "Prompt:"
)

chat_history.append({'role': 'user', 'content': generate_prompt_request})

optimized_prompt_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': optimized_prompt_response})
print("Optimized Prompt:")
print(optimized_prompt_response)


Generated Samples:
Here are five tweet samples that aim to present unique challenges and cover a diverse range of scenarios:

**Sample 1: Double Meaning**
"Just got my period... I mean, just got my package delivered! #newkitchenware" (Ironic)

This sample is designed to test the model's ability to recognize double meanings. The phrase "got my period" can refer to both menstruation and receiving a delivery, making it essential for the model to consider the context and intent behind the tweet.

**Sample 2: Sarcastic Statement**
"I'm so excited to be stuck in this traffic jam! Who needs to get to work on time, anyway? #rushhourblues" (Ironic)

This sample is meant to challenge the model's ability to detect sarcasm. The statement "so excited" is clearly meant to convey the opposite emotion, and the model must recognize the tone as playful rather than genuine.

**Sample 3: Hyperbole**
"Just ran a marathon... in my backyard! I'm basically an Olympic athlete now! #fitnessgoals" (Not Ironic)



In [14]:
def get_prediction(text):
    prompt = (
        f"{optimized_prompt_response}\n\n"
        f"Tweet: {text}\n\n"
        "### Requirements:\n"
        "1. Respond with only a single-digit (0 for not ironic, 1 for ironic).\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only '0' or '1':"
    )
    response = ollama.generate(model='llama3', prompt=prompt)
    # Ensure the response is either '0' or '1'
    response_text = response['response'].strip()
    if '1' in response_text:
        return '1'
    else:
        return '0'

predictions = []
for i, text in enumerate(texts):
    if i % 1000 == 0:
        print(f"Predicting tweet {i+1} out of {len(texts)}")
    # if i == 1000:
    #     break
    prediction = get_prediction(text)
    predictions.append(prediction)

# Step 6: Evaluate Model
from sklearn.metrics import accuracy_score
predictions = ["1" if "1" in p else "0" for p in predictions]
accuracy = accuracy_score(labels[:len(predictions)], predictions)
print(f"Model Accuracy: {accuracy}")

Predicting tweet 1 out of 3817
Predicting tweet 1001 out of 3817
Predicting tweet 2001 out of 3817
Predicting tweet 3001 out of 3817
Model Accuracy: 0.5441446161907257


# With zero shot COT

In [7]:
import ollama
def get_prediction(text):
    prompt = (
        "Predict whether the following tweet is ironic or not:\n\n"
        "Let's think step by step"
        f"Tweet: {text}\n\n"
        "### Requirements:\n"
        "1. Respond with only a single-digit (0 for not ironic, 1 for ironic).\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only '0' or '1':"
    )
    response = ollama.generate(model='llama3', prompt=prompt)
    # Ensure the response is either '0' or '1'
    response_text = response['response'].strip()
    if '1' in response_text:
        return '1'
    else:
        return '0'

predictions = []
for i, text in enumerate(texts):
    if i % 1000 == 0:
        print(f"Predicting tweet {i+1} out of {len(texts)}")
    # if i == 1000:
    #     break
    prediction = get_prediction(text)
    predictions.append(prediction)

# Step 6: Evaluate Model
from sklearn.metrics import accuracy_score
predictions = ["1" if "1" in p else "0" for p in predictions]
accuracy = accuracy_score(labels, predictions)
print(f"Model Accuracy: {accuracy}")

Predicting tweet 1 out of 3817
Predicting tweet 1001 out of 3817
Predicting tweet 2001 out of 3817
Predicting tweet 3001 out of 3817
Model Accuracy: 0.5375949698716269


# with basic prompt

In [91]:
def get_prediction(text):
    response = ollama.generate(model='llama3', prompt=f"Predict if the tweet text is ironic or not: {text}. make sure to respond with only the prediction value (0 or 1)")
    return response['response']

predictions = []
for i, text in enumerate(texts):
    if i % 1000 == 0:
        print(f"Predicting tweet {i+1} out of {len(texts)}")
    prediction = get_prediction(text)
    predictions.append(prediction)

# Step 6: Evaluate Model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels, predictions)
print(f"Model Accuracy: {accuracy}")

Predicting tweet 1 out of 3817
Predicting tweet 1001 out of 3817
Predicting tweet 2001 out of 3817
Predicting tweet 3001 out of 3817
Model Accuracy: 0.48388787005501704


# few shot prompt

In [92]:
import ollama
import pandas as pd
from sklearn.metrics import accuracy_score

# Load the dataset
datapath = 'dataset/irony'
df = pd.read_csv(datapath+'/SemEval2018-T3-train-taskA.txt', sep='\t', header=None)
labels = df[1][1:]
texts = df[2][1:]

# construct few shot learning task
few_shot_text = texts[:3]
few_shot_labels = labels[:3]



# Function to interact with the Ollama API
def get_prediction(text):
    # use the first few samples as few shot learning examples
    prompt = (f"Here are the first few tweets in the dataset:\n\n1. {few_shot_text.iloc[0]} label:{few_shot_labels.iloc[0]}\n2. {few_shot_text.iloc[1]} label:{few_shot_labels.iloc[1]}\n3. {few_shot_text.iloc[2]} label:{few_shot_labels.iloc[2]}\n\n"
            "Based on these examples, predict whether the following tweet is ironic or not:"
            f"Tweet: {text}\n\n"
            "### Requirements:\n"
            "1. Respond with only a single-digit (0 for not ironic, 1 for ironic).\n"
            "2. Do not provide any additional text or explanation.\n"
            "Respond with only '0' or '1':"
    )
    response = ollama.generate(model='llama3', prompt=prompt)
    # Ensure the response is either '0' or '1'
    response_text = response['response'].strip()
    if '1' in response_text:
        return '1'
    else:
        return '0'
    
# Make predictions
predictions = []
for i, text in enumerate(texts):
    if i == 0 or i == 1 or i == 2:
        continue
    if i % 1000 == 0:
        print(f"Predicting tweet {i+1} out of {len(texts)}")
    prediction = get_prediction(text)
    predictions.append(prediction)

# Step 6: Evaluate Model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels[3:], predictions)
print(f"Model Accuracy: {accuracy}")



Predicting tweet 1001 out of 3817
Predicting tweet 2001 out of 3817
Predicting tweet 3001 out of 3817
Model Accuracy: 0.4884635553224961


# two step

In [17]:
import ollama

# Function to interact with the Ollama API
def chat_with_context(history):
    response = ollama.chat(model='llama3', messages=history)
    return response['message']['content']

# Initialize the chat history
chat_history = []

# Step 1: Generate Correctly Classified and Misclassified Samples
num_samples = 5
task_description = "We need to predict whether a given tweet is ironic or not. The labels are 'ironic' and 'not ironic'."

generate_samples_request = (
    f"As an advanced language model you should create {num_samples} samples for the task outlined below.\n"
    "Generate samples that are likely to be correctly classified as 'ironic' or 'not ironic' and samples that might be misclassified according to the task instructions.\n\n"
    f"### Task Description:\n{task_description}\n\n"
    "### Requirements for Samples:\n"
    "1. Each sample must present a unique and intricate challenge.\n"
    "2. The complexity of the samples should be such that simply applying the given task instruction would likely lead to incorrect or incomplete results for some samples.\n"
    "3. The samples should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.\n"
    "4. Ensure that the samples, while challenging, remain realistic and pertinent to the task's context.\n"
    "Generate the samples keeping these requirements in mind.\n###"
)

chat_history.append({'role': 'user', 'content': task_description})
chat_history.append({'role': 'user', 'content': generate_samples_request})

samples_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': samples_response})
print("Generated Samples:")
print(samples_response)

# Step 2: Analyze Samples with Chain of Thought
analyze_samples_request = (
    f"Here are some samples: {samples_response}\n"
    "Think step by step and derive general principles for classifying tweets as 'ironic' or 'not ironic'.\n"
    "Avoid focusing on specific details of the provided samples. Instead, develop broader, example-agnostic guidelines that can be applied universally to classify any news article. Conclude your analysis with clear, concise bullet points outlining:\n"
    "- The general characteristics that typically define each category.\n"
    "- Common mistakes that might lead to misclassifications and how to avoid them.\n"
    "- Guidelines under which circumstances each label should be predicted.\n"
    "These principles should help in accurately predicting the category of a tweet based on its content without additional context."
)

chat_history.append({'role': 'user', 'content': analyze_samples_request})

analysis_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': analysis_response})
print("Analysis of Samples:")
print(analysis_response)

# Step 3: Generate Optimized Prompt
generate_prompt_request = (
    f"Based on the following analysis: {analysis_response}\nGenerate an optimized prompt for predicting "
    "whether a tweet is 'ironic' or 'not ironic'.\n\n"
    "### Requirements for Optimized Prompt:\n"
    "1. The prompt must include a clear description of the task and the labels.\n"
    "2. It should provide criteria for classifying tweets as 'ironic' or 'not ironic' based on the analysis.\n"
    "3. The prompt should help the model avoid common pitfalls and misclassifications identified during the analysis.\n"
    "4. Ensure the language is unambiguous and tailored to maximize the model's prediction accuracy.\n"
    "5. Encourage the model to think step by step.\n"
    "Respond with no other explanation but only the content of the prompt that is ready for the model to predict\n"
    "Prompt:"
)

chat_history.append({'role': 'user', 'content': generate_prompt_request})

optimized_prompt_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': optimized_prompt_response})
print("Optimized Prompt:")
print(optimized_prompt_response)

Generated Samples:
Here are 5 samples for the irony detection task:

**Sample 1:** "Just got a promotion! Now I can finally relax on Fridays and watch Netflix all day." (Label: 'ironic')

This sample is likely to be correctly classified as ironic because it presents an unexpected twist on the typical reaction to getting a promotion. The speaker's tone seems to convey sarcasm, implying that they are not actually looking forward to relaxing on Fridays.

**Sample 2:** "I'm so done with social media. It's all just noise and distraction." (Label: 'not ironic')

This sample appears to be genuinely expressing frustration with social media, without any apparent irony or sarcasm. The speaker seems to mean what they say, making it unlikely to be misclassified.

**Sample 3:** "Just spent the whole day shopping for new clothes and I'm still wearing my old favorites." (Label: 'ironic')

This sample presents a contradictory statement that might lead to incorrect classification if not carefully analy

In [18]:
# Function to predict the label of a given news article
def get_prediction(text):
    # First part: Get a detailed response following the general guidelines of the prompt
    initial_prompt = (
        "Follow the guidelines of the prompt:\n"
        f"{optimized_prompt_response}\n\n"
        f"Tweet: {text}\n"
        "### Initial Analysis:\n"
        "Provide your detailed analysis and suggest a category based on the content of the tweet."
    )
    
    initial_response = ollama.generate(model='llama3', prompt=initial_prompt)
    detailed_analysis = initial_response['response'].strip()

    # Second part: Narrow down to just the predicted label
    final_prompt = (
        "Based on the detailed analysis, respond with only the category name:\n"
        f"{detailed_analysis}\n"
        "### Requirements:\n"
        "1. Respond with only a single-digit (0 for not ironic, 1 for ironic).\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only '0' or '1':"
    )

    final_response = ollama.generate(model='llama3', prompt=final_prompt)
    prediction = final_response['response'].strip().replace("**", "").replace("'", "").replace('"', '')
    if '1' in prediction:
        return '1'
    else:
        return '0'

predictions = []
for i, text in enumerate(texts):
    if i % 1000 == 0:
        print(f"Predicting tweet {i+1} out of {len(texts)}")
    # if i == 1000:
    #     break
    prediction = get_prediction(text)
    predictions.append(prediction)

# Step 6: Evaluate Model
from sklearn.metrics import accuracy_score
predictions = ["1" if "1" in p else "0" for p in predictions]
accuracy = accuracy_score(labels[:len(predictions)], predictions)
print(f"Model Accuracy: {accuracy}")

Predicting tweet 1 out of 3817
Predicting tweet 1001 out of 3817
Predicting tweet 2001 out of 3817
Predicting tweet 3001 out of 3817
Model Accuracy: 0.6266701598113702
