In [85]:
# load dataset
import pandas as pd
# load from txt file
datapath = 'dataset/irony'
df = pd.read_csv(datapath+'/SemEval2018-T3-train-taskA.txt', sep='\t', header=None)
print(df.head())
labels = df[1]
texts = df[2]
# remove the first row in labels and texts
labels = labels[1:]
texts = texts[1:]
print(labels.head())
print(texts.head())

             0      1                                                  2
0  Tweet index  Label                                         Tweet text
1            1      1  Sweet United Nations video. Just in time for C...
2            2      1  @mrdahl87 We are rumored to have talked to Erv...
3            3      1  Hey there! Nice to see you Minnesota/ND Winter...
4            4      0                3 episodes left I'm dying over here
1    1
2    1
3    1
4    0
5    1
Name: 1, dtype: object
1    Sweet United Nations video. Just in time for C...
2    @mrdahl87 We are rumored to have talked to Erv...
3    Hey there! Nice to see you Minnesota/ND Winter...
4                  3 episodes left I'm dying over here
5    I can't breathe! was chosen as the most notabl...
Name: 2, dtype: object


# learn from various samples

In [86]:
import ollama

# Function to interact with the Ollama API
def chat_with_context(history):
    response = ollama.chat(model='llama3', messages=history)
    return response['message']['content']

# Initialize the chat history
chat_history = []

# Step 1: Generate Correctly Classified and Misclassified Samples
num_samples = 5
task_description = "We need to predict whether a given tweet is ironic or not. The labels are 'ironic' and 'not ironic'."
instruction = "Classify each tweet as either 'ironic' or 'not ironic'."

generate_samples_request = (
    f"As an advanced language model you should create {num_samples} samples for the task outlined below.\n"
    "Generate samples that are likely to be correctly classified as 'ironic' or 'not ironic' and samples that might be misclassified according to the task instructions.\n\n"
    f"### Task Description:\n{task_description}\n\n"
    f"### Task Instructions:\n{instruction}\n\n"
    "### Requirements for Samples:\n"
    "1. Each sample must present a unique and intricate challenge.\n"
    "2. The complexity of the samples should be such that simply applying the given task instruction would likely lead to incorrect or incomplete results for some samples.\n"
    "3. The samples should cover a diverse range of scenarios within the scope of the task, avoiding repetition and predictability.\n"
    "4. Ensure that the samples, while challenging, remain realistic and pertinent to the task's context.\n"
    "Generate the samples keeping these requirements in mind.\n###"
)

chat_history.append({'role': 'user', 'content': task_description})
chat_history.append({'role': 'user', 'content': generate_samples_request})

samples_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': samples_response})
print("Generated Samples:")
print(samples_response)

# Step 2: Analyze Samples with Chain of Thought
analyze_samples_request = (
    f"Here are some samples: {samples_response}\nUsing chain of thought, analyze these samples "
    "and conclude a procedure for predicting whether a tweet is 'ironic' or 'not ironic'. Identify key characteristics of both correctly and incorrectly classified samples, capture the mistakes from failed cases, and conclude under what circumstances we should predict each label."
)

chat_history.append({'role': 'user', 'content': analyze_samples_request})

analysis_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': analysis_response})
print("Analysis of Samples:")
print(analysis_response)

# Step 3: Generate Optimized Prompt
generate_prompt_request = (
    f"Based on the following analysis: {analysis_response}\nGenerate an optimized prompt for predicting "
    "whether a tweet is 'ironic' or 'not ironic'. Ensure the model responds only with 'ironic' or 'not ironic'.\n\n"
    "### Requirements for Optimized Prompt:\n"
    "1. The prompt must include a clear description of the task and the labels.\n"
    "2. It should provide criteria for classifying tweets as 'ironic' or 'not ironic' based on the analysis.\n"
    "3. The prompt must ensure that the model responds strictly with 'ironic' or 'not ironic'.\n"
    "4. The prompt should help the model avoid common pitfalls and misclassifications identified during the analysis.\n"
    "5. Ensure the language is unambiguous and tailored to maximize the model's prediction accuracy."
)

chat_history.append({'role': 'user', 'content': generate_prompt_request})

optimized_prompt_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': optimized_prompt_response})
print("Optimized Prompt:")
print(optimized_prompt_response)


Generated Samples:
Here are five samples for the task:

**Sample 1:**
"Just had the best time at my aunt's funeral! The food was amazing and I got to see some old friends #blessed" (Label: ironic)

This tweet is a classic example of irony, as the speaker claims to have had a good time at a funeral, which is typically a somber occasion. The use of hashtags like "#blessed" adds to the irony.

**Sample 2:**
"I'm so grateful for my new job! I get to do the same work as before, but now I have to wear a suit and tie #adulting" (Label: not ironic)

This tweet seems straightforwardly positive, with the speaker expressing gratitude for their new job. However, upon closer inspection, it's clear that they're poking fun at the idea of "growing up" and taking on adult responsibilities.

**Sample 3:**
"Just won a free trip to Hawaii... in my dreams! Guess I'll just have to keep on dreaming #travelgoals" (Label: ironic)

This tweet appears to be a genuine expression of disappointment, but the use of 

In [87]:
def get_prediction(text):
    prompt = (
        f"{optimized_prompt_response}\n\n"
        f"Tweet: {text}\n\n"
        "### Requirements:\n"
        "1. Respond with only a single-digit (0 for not ironic, 1 for ironic).\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only '0' or '1':"
    )
    response = ollama.generate(model='llama3', prompt=prompt)
    # Ensure the response is either '0' or '1'
    response_text = response['response'].strip()
    if '1' in response_text:
        return '1'
    else:
        return '0'

predictions = []
for i, text in enumerate(texts):
    if i % 1000 == 0:
        print(f"Predicting tweet {i+1} out of {len(texts)}")
    # if i == 1000:
    #     break
    prediction = get_prediction(text)
    predictions.append(prediction)

# Step 6: Evaluate Model
from sklearn.metrics import accuracy_score
predictions = ["1" if "1" in p else "0" for p in predictions]
accuracy = accuracy_score(labels, predictions)
print(f"Model Accuracy: {accuracy}")

Predicting tweet 1 out of 3817
Predicting tweet 1001 out of 3817
Predicting tweet 2001 out of 3817
Predicting tweet 3001 out of 3817
Model Accuracy: 0.5279014933193608


# With zero shot COT

In [88]:
def get_prediction(text):
    prompt = (
        "Predict whether the following tweet is ironic or not:\n\n"
        "Let's think step by step"
        f"Tweet: {text}\n\n"
        "### Requirements:\n"
        "1. Respond with only a single-digit (0 for not ironic, 1 for ironic).\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only '0' or '1':"
    )
    response = ollama.generate(model='llama3', prompt=prompt)
    # Ensure the response is either '0' or '1'
    response_text = response['response'].strip()
    if '1' in response_text:
        return '1'
    else:
        return '0'

predictions = []
for i, text in enumerate(texts):
    if i % 1000 == 0:
        print(f"Predicting tweet {i+1} out of {len(texts)}")
    # if i == 1000:
    #     break
    prediction = get_prediction(text)
    predictions.append(prediction)

# Step 6: Evaluate Model
from sklearn.metrics import accuracy_score
predictions = ["1" if "1" in p else "0" for p in predictions]
accuracy = accuracy_score(labels, predictions)
print(f"Model Accuracy: {accuracy}")

Predicting tweet 1 out of 3817
Predicting tweet 1001 out of 3817
Predicting tweet 2001 out of 3817
Predicting tweet 3001 out of 3817
Model Accuracy: 0.5391668849882106


# learn only negative samples

In [89]:
import ollama
from sklearn.metrics import accuracy_score

# Function to interact with the Ollama API
def chat_with_context(history):
    response = ollama.chat(model='llama3', messages=history)
    return response['message']['content']

# Initialize the chat history
chat_history = []

# Step 1: Define Task Description
task_description = "We need to predict whether a given tweet is ironic or not. The labels are 'ironic' and 'not ironic'."
chat_history.append({'role': 'user', 'content': task_description})

# Step 2: Generate Misleading Samples
num_samples = 5
generate_misleading_samples_request = (
    f"As an advanced language model, generate {num_samples} misleading samples for the task outlined below.\n"
    "For each sample, provide the likely mispredicted label ('ironic' or 'not ironic').\n\n"
    f"### Task Description:\n{task_description}\n\n"
    "### Requirements for Misleading Samples:\n"
    "1. Each sample must be challenging and likely to cause mispredictions.\n"
    "2. Include a likely mispredicted label for each sample.\n"
    "3. Ensure the samples are realistic and relevant to the task.\n"
    "Generate the misleading samples keeping these requirements in mind.\n###"
)

chat_history.append({'role': 'user', 'content': generate_misleading_samples_request})

misleading_samples_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': misleading_samples_response})
print("Generated Misleading Samples:")
print(misleading_samples_response)

# Step 3: Analyze Misleading Samples
analyze_misleading_samples_request = (
    f"Here are some misleading samples: {misleading_samples_response}\nUsing a step-by-step approach, analyze these samples "
    "to understand why the mispredictions occur. Conclude the findings in a few bullet points."
)

chat_history.append({'role': 'user', 'content': analyze_misleading_samples_request})

analysis_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': analysis_response})
print("Analysis of Misleading Samples:")
print(analysis_response)

# Step 4: Generate Optimized Prompt
generate_prompt_request = (
    f"Based on the following analysis: {analysis_response}\nGenerate an optimized prompt for predicting "
    "whether a tweet is 'ironic' or 'not ironic'. Ensure the model responds only with 'ironic' or 'not ironic'.\n\n"
    "### Requirements for Optimized Prompt:\n"
    "1. The prompt must include a clear description of the task and the labels.\n"
    "2. It should provide criteria for classifying tweets as 'ironic' or 'not ironic' based on the analysis.\n"
    "3. The prompt must ensure that the model responds strictly with 'ironic' or 'not ironic'.\n"
    "4. The prompt should help the model avoid common pitfalls and misclassifications identified during the analysis.\n"
    "5. Ensure the language is unambiguous and tailored to maximize the model's prediction accuracy."
)

chat_history.append({'role': 'user', 'content': generate_prompt_request})

optimized_prompt_response = chat_with_context(chat_history)
chat_history.append({'role': 'assistant', 'content': optimized_prompt_response})
print("Optimized Prompt:")
print(optimized_prompt_response)

Generated Misleading Samples:
Here are 5 misleading samples with likely mispredicted labels:

**Sample 1:** "Just had the best day ever! Nothing can bring me down" **Likely Mispredicted Label:** 'ironic'

This tweet is actually expressing genuine happiness, but its over-the-top language might lead a model to incorrectly predict it's ironic.

**Sample 2:** "I'm so excited for my boring Monday at work" **Likely Mispredicted Label:** 'not ironic'

In reality, the speaker is being sarcastic and ironic, saying they're looking forward to something that sounds unpleasant. A model might mispredict this as not ironic due to the unexpected twist.

**Sample 3:** "Just got my dream job offer! It's every college student's worst nightmare" **Likely Mispredicted Label:** 'not ironic'

This tweet is actually expressing irony, as the speaker is being sarcastic about a positive event. A model might mispredict this as not ironic due to the unexpected negative spin.

**Sample 4:** "I'm so glad I got caugh

In [90]:
# Example of using the optimized prompt for prediction
def get_prediction(text):
    prompt = (
        f"{optimized_prompt_response}\n\n"
        f"Tweet: {text}\n\n"
        "### Requirements:\n"
        "1. Respond with only a single-digit (0 for not ironic, 1 for ironic).\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only '0' or '1':"
    )
    response = ollama.generate(model='llama3', prompt=prompt)
    # Ensure the response is either '0' or '1'
    response_text = response['response'].strip()
    if '1' in response_text:
        return '1'
    else:
        return '0'

# Make predictions
predictions = []
for i, text in enumerate(texts):
    if i % 1000 == 0:
        print(f"Predicting text {i+1} out of {len(texts)}")
    # if i == 1000:
    #     break
    prediction = get_prediction(text)
    predictions.append(prediction)

# Evaluate Model
predictions = ["1" if "1" in p else "0" for p in predictions]
accuracy = accuracy_score(labels, predictions)
print(f"Model Accuracy: {accuracy}")

Predicting text 1 out of 3817
Predicting text 1001 out of 3817
Predicting text 2001 out of 3817
Predicting text 3001 out of 3817
Model Accuracy: 0.5197799318836783


# with basic prompt

In [91]:
def get_prediction(text):
    response = ollama.generate(model='llama3', prompt=f"Predict if the tweet text is ironic or not: {text}. make sure to respond with only the prediction value (0 or 1)")
    return response['response']

predictions = []
for i, text in enumerate(texts):
    if i % 1000 == 0:
        print(f"Predicting tweet {i+1} out of {len(texts)}")
    prediction = get_prediction(text)
    predictions.append(prediction)

# Step 6: Evaluate Model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels, predictions)
print(f"Model Accuracy: {accuracy}")

Predicting tweet 1 out of 3817
Predicting tweet 1001 out of 3817
Predicting tweet 2001 out of 3817
Predicting tweet 3001 out of 3817
Model Accuracy: 0.48388787005501704


# few shot prompt

In [92]:
import ollama
import pandas as pd
from sklearn.metrics import accuracy_score

# Load the dataset
datapath = 'dataset/irony'
df = pd.read_csv(datapath+'/SemEval2018-T3-train-taskA.txt', sep='\t', header=None)
labels = df[1][1:]
texts = df[2][1:]

# construct few shot learning task
few_shot_text = texts[:3]
few_shot_labels = labels[:3]

# use the first few samples as few shot learning examples
prompt = (f"Here are the first few tweets in the dataset:\n\n1. {few_shot_text.iloc[0]} label:{few_shot_labels.iloc[0]}\n2. {few_shot_text.iloc[1]} label:{few_shot_labels.iloc[1]}\n3. {few_shot_text.iloc[2]} label:{few_shot_labels.iloc[2]}\n\n"
          "Based on these examples, predict whether the following tweet is ironic or not:"
          "### Requirements:\n"
        "1. Respond with only a single-digit (0 for not ironic, 1 for ironic).\n"
        "2. Do not provide any additional text or explanation.\n"
        "Respond with only '0' or '1':"
)

# Function to interact with the Ollama API
def get_prediction(text):
    response = ollama.generate(model='llama3', prompt=prompt)
    # Ensure the response is either '0' or '1'
    response_text = response['response'].strip()
    if '1' in response_text:
        return '1'
    else:
        return '0'
    
# Make predictions
predictions = []
for i, text in enumerate(texts):
    if i == 0 or i == 1 or i == 2:
        continue
    if i % 1000 == 0:
        print(f"Predicting tweet {i+1} out of {len(texts)}")
    prediction = get_prediction(text)
    predictions.append(prediction)

# Step 6: Evaluate Model
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(labels[3:], predictions)
print(f"Model Accuracy: {accuracy}")



Predicting tweet 1001 out of 3817
Predicting tweet 2001 out of 3817
Predicting tweet 3001 out of 3817
Model Accuracy: 0.4884635553224961
