In [9]:
from openai import OpenAI
import json
from tqdm import tqdm

client = OpenAI(api_key='your key here')

# Load prompts from a JSONL file
def load_prompts(file_path):
    with open(file_path, 'r') as file:
        return [json.loads(line) for line in file]

op_prompts = load_prompts('prompts_datasets/op_test_gpt.jsonl')
pairs_prompts = load_prompts('prompts_datasets/pairs_test_gpt.jsonl')

# Get GPT 3.5 responses with various prompting methods

In [10]:
# Function to send requests and gather responses
def get_responses(prompts, instruction, desc="Getting GPT3.5 Responses: "):
    responses = []
    for prompt in tqdm(prompts, desc=desc):
        try:
            response = client.chat.completions.create(
                      model="gpt-3.5-turbo",
                      messages=[
                          {"role":"system", "content":instruction},
                          prompt['body']['messages'][1]
                      ],
                    max_completion_tokens=prompt['body']['max_tokens'],
                    )
            responses.append(response)
        except Exception as e:
            print(f"Error processing prompt {prompt['custom_id']}: {e}")
            responses.append(None)
    return responses

## Get ops responses (malleability tasks)

In [11]:
# Knowledge about telling the malleability of the opinion, extracted from the paper: Winning Arguments: Interaction Dynamics and Persuasion Strategies in Good-faith Online Discussions
knowledge_op = "Hints: 1. Use of First-Person Pronouns: The use of first-person singular pronouns (e.g., I, me) is strongly correlated with malleability while the use of first-person plural pronouns (e.g., we, us) is more associated with resistant opinions. 2. Dominance in Language: Higher dominance in the language used by the OP correlates with malleability. 3. Calm Tone: calmer, less emotional language in the original post is associated with malleability. 4. Valence (Emotional Positivity): Higher valence, which reflects more positive emotional tone, indicates malleability. 5.Formatting: Posts that are well-organized, with more paragraphs and formatting such as bolds and bullet lists, are correlated with malleable opinions."

In [4]:
# Get responses of Original Posts from GPT3.5
directly_predict = "You're a semantic analyst. Now I will show you a person's opinion statement. We know that the person publicly announced his/her argument and encouraged other people to challenge it. Judging from the speech style and lexical features, do you think he/she is resistant or malleable to persuasion? Answer with 'malleable' or 'resistant'."
predict_then_explain = "You're a semantic analyst. Now I will show you a person's opinion statement. We know that the person publicly announced his/her argument and encouraged other people to challenge it. Judging from the speech style and lexical features, do you think he/she is resistant or malleable to persuasion? Answer with 'malleable' or 'resistant' and explain your answer. Response with the following format: Prediction: resistant/malleable \n Explanation: briefly explain here."
explain_then_predict = "You're a semantic analyst. Now I will show you a person's opinion statement. We know that the person publicly announced his/her argument and encouraged other people to challenge it. Judging from the speech style and lexical features, do you think he/she is resistant or malleable to persuasion? First briefly explain your analysis and then give your answer with resistant/malleable. Response with the following format: Explanation: briefly explain here. \n  Prediction: resistant/malleable"

op_responses_direct = get_responses(op_prompts, directly_predict)
op_responses_pred_explain = get_responses(op_prompts, predict_then_explain)
op_responses_explain_pred = get_responses(op_prompts, explain_then_predict)
op_responses_with_knowledge = get_responses(op_prompts, directly_predict + knowledge_op)

Getting GPT3.5 Responses: 100%|██████████| 200/200 [01:44<00:00,  1.92it/s]


## Get pairs responses(persuasion tasks)

In [3]:
# Knowledge on how to persuade the poster, extracted from the paper: Winning Arguments: Interaction Dynamics and Persuasion Strategies in Good-faith Online Discussions
knowledge_pairs = "Hint: 1. Language Dissimilarity with Original Post: Persuasive replies use different content words but match in stopwords 2.Reply Length: Longer replies tend to be more persuasive, as they can convey more information and elaborate on points effectively. 3. Language Dissimilarity with Original Post: Persuasive replies use different content words but match in stopwords. 4. Links and Evidence: Including links as evidence in an argument increases the chances of persuasion. 5. Calmer Tone: Replies that use calmer, less intense language are more likely to persuade, as they come across as more composed. 6. Positive Emotion and Sentiment: Persuasive replies include a mix of positive and negative sentiment."

In [8]:
# Get responses of Pairs(op, reply1, reply2) from GPT3.5
directly_predict = "This is a conversation from an online discussion community. The first was a poster who posted an opinion, and the next two replies were each trying to convince the poster to revise his opinion. The two responses were similar, but one managed to convince the poster and the other didn't. Now judge which response succeeded in persuading. Answer only with first/second."
predict_then_explain = "This is a conversation from an online discussion community. The first was a poster who posted an opinion, and the next two replies were each trying to convince the poster to revise his opinion. The two responses were similar, but one managed to convince the poster and the other didn't. Now first judge which response succeeded in persuading and then explain your analysis very briefly. Response with the following format: Prediction: answer only with first or second \n Explanation: briefly explain here."
explain_then_predict = "This is a conversation from an online discussion community. The first was a poster who posted an opinion, and the next two replies were each trying to convince the poster to revise his opinion. The two responses were similar, but one managed to convince the poster and the other didn't. Now first show your analysis very briefly and then judge which response succeeded in persuading. Response with the following format: Explanation: briefly analyse here. \n Prediction: answer only with first or second"

pairs_responses_direct = get_responses(pairs_prompts, directly_predict)
pairs_responses_pred_explain = get_responses(pairs_prompts, predict_then_explain)
pairs_responses_explain_pred = get_responses(pairs_prompts, explain_then_predict)
pairs_responses_with_knowledge = get_responses(pairs_prompts, directly_predict + knowledge_pairs)

## Two Stage method (on persuasion tasks)

In [4]:
instruction_1 = f"This is a conversation from an online discussion community. The first was a poster who posted an opinion, and the next two replies were each trying to convince the poster to revise his opinion. Analyze the persuasiveness of the two replies respectively in two paragraphs with the following format: First Reply: your analysis here.\n Second Reply: your analysis here.\n Note only analysis, not conclusions. \n {knowledge_pairs}"
instruction_2 = f"You're a semantic analyst. The following two paragraphs are analysis on the persuasiveness of two replies. These replies are from an online discussion community that are trying to convince the original poster to revise its opinion. Based on these analysis, which reply do you think that successfully persuaded the original poster? \n Answer only with first or second.\n {knowledge_pairs}"
# Implement a two stage prompting: 
# 1) ask GPT to analyse the linguistic features of the two replies but do not make conclusions. 
# 2) ask GPT to evaluate previous analysis and make conclusions. 
# Both stages have knowledge instilled.
def get_two_stage_responses(prompts, instruction_1, instruction_2):
    stage_one_responses = get_responses(prompts, instruction_1, desc="Getting GPT3.5 Responses(1st stage): ")
    stage_one_responses = [res.choices[0].message.content for res in stage_one_responses]
    stage_two_responses = []
    for prompt in tqdm(stage_one_responses, desc="Getting GPT3.5 Responses(2nd stage): "):
        response = client.chat.completions.create(
                  model="gpt-3.5-turbo",
                  messages=[
                      {"role":"system", "content":instruction_2},
                      {"role": "user", "content":prompt},
                  ],
                max_completion_tokens=1000,
                )
        stage_two_responses.append(response)
    return stage_two_responses

paris_two_stage_responses = get_two_stage_responses(pairs_prompts, instruction_1, instruction_2)

Getting GPT3.5 Responses(1st stage): 100%|██████████| 200/200 [13:23<00:00,  4.02s/it]
Getting GPT3.5 Responses(2nd stage): 100%|██████████| 200/200 [01:24<00:00,  2.36it/s]


## Two Stage method (on malleability tasks)

In [25]:
instruction_1 = f"This is post from an online discussion community. The person openly stats his/her opinion and encourage others to challenge him/her. Now read the post and identify the speech style and lexical features that may indicate the extent of malleability of poster's opinion. Briefly report your analysis\n Note you only identify these style and features, don't make any conclusions about them. \n {knowledge_op}"
instruction_2 = f"You're a semantic analyst. The following is an analysis on the extent of malleability of a poster's opinion. Based on these analysis, do you think the poster is resistant or malleable to persuasion? \n Answer only with resistant or malleable.\n {knowledge_op}"
# Implement a two stage prompting: 
# 1) ask GPT to analyse the linguistic features of the two replies but do not make conclusions. 
# 2) ask GPT to evaluate previous analysis and make conclusions. 
# Both stages have knowledge instilled.
def get_two_stage_responses(prompts, instruction_1, instruction_2):
    stage_one_responses = get_responses(prompts, instruction_1, desc="Getting GPT3.5 Responses(1st stage): ")
    stage_one_responses = [res.choices[0].message.content for res in stage_one_responses]
    stage_two_responses = []
    for prompt in tqdm(stage_one_responses, desc="Getting GPT3.5 Responses(2nd stage): "):
        response = client.chat.completions.create(
                  model="gpt-3.5-turbo",
                  messages=[
                      {"role":"system", "content":instruction_2},
                      {"role": "user", "content":prompt},
                  ],
                max_completion_tokens=1000,
                )
        stage_two_responses.append(response)
    return stage_two_responses

op_two_stage_responses = get_two_stage_responses(op_prompts, instruction_1, instruction_2)

Getting GPT3.5 Responses(1st stage): 100%|██████████| 200/200 [06:44<00:00,  2.02s/it]
Getting GPT3.5 Responses(2nd stage): 100%|██████████| 200/200 [01:30<00:00,  2.21it/s]


# Accuracy

In [26]:
def prediction_accuracy(responses, file_path, test_n_sample=200, method="direct"):
    gpt_predictions = []
    if method == "direct":
        for res in responses:
            gpt_predictions.append(res.choices[0].message.content.lower()[:1])
    else:
        for res in responses:
            pred = res.choices[0].message.content.lower()
            start_point = pred.find("prediction: ") + len("prediction: ")
            if method == "predict_then_explain":
                pred = pred[start_point: start_point + 1]
            elif method == "explain_then_predict":
                pred = pred[start_point: start_point + 1]
            gpt_predictions.append(pred)
    
    def check_pred(preds):
        n_mismatches = 0
        for i, p in (enumerate(preds)):
            if p not in ['f', 's']:
                if p not in ['m', 'r']:
                    n_mismatches += 1
        if n_mismatches:
            print(f"{n_mismatches}/{len(preds)} "
              f"of the predictions are not in correct format! "
              f"They will not be included in counting accuracy.")
        return n_mismatches
    n_mismatch = check_pred(gpt_predictions)
    
    with open(file_path, 'r') as f:
        truths = [json.loads(line)["output"][:1] for line in f][:test_n_sample]
    scores = [1 if gpt_predictions[j] == truths[j] else 0 for j in range(len(truths))]
    
    return (sum(scores) - n_mismatch) / (len(truths) - n_mismatch)

def z_test(n_sample, accuracy_1, accuracy_2):
    import numpy as np
    from scipy.stats import norm
    pooled = (accuracy_1 * n_sample + accuracy_2 * n_sample) / (n_sample * 2)
    z_score = (accuracy_1 - accuracy_2) / np.sqrt(pooled * (1 - pooled) * 2 / n_sample)
    p_value = norm.sf(abs(z_score)) * 2
    return p_value

## OPs Accuracies

In [6]:
file_path = "finetune_llama3_1/finetune_datasets/op_test_alpaca.jsonl"
n_sample = len(op_responses_with_knowledge)

op_accuracy_direct = prediction_accuracy(op_responses_direct, file_path)
op_accuracy_pred_explain = prediction_accuracy(op_responses_pred_explain,
                                               file_path,
                                               method="predict_then_explain")
op_accuracy_explain_pred = prediction_accuracy(op_responses_explain_pred,
                                               file_path,
                                               method="explain_then_predict")
op_accuracy_with_knowledge = prediction_accuracy(op_responses_with_knowledge, file_path)
p_2 = z_test(n_sample, op_accuracy_direct, op_accuracy_pred_explain)
p_3 = z_test(n_sample, op_accuracy_direct, op_accuracy_explain_pred)

print(f"Accuracy for direct prediction with GPT-3.5 turbo is {op_accuracy_direct:.3f}.")
print(f"Accuracy for predict-then-explain with GPT-3.5 turbo is {op_accuracy_pred_explain:.3f} with p-value: {p_2:.3f}")
print(f"Accuracy for explain-then-predict with GPT-3.5 turbo is {op_accuracy_explain_pred:.3f} with p-value: {p_3:.3f}")
print(f"Accuracy for direct prediction with knowledge with GPT-3.5 turbo is {op_accuracy_with_knowledge:.3f}")

22/200 of the predictions are not in correct format! They will not be included in accuracy.
Accuracy for direct prediction with knowledge with GPT-3.5 turbo is 0.438


## Pairs Accuracies

In [100]:
file_path = "finetune_llama3_1/finetune_datasets/pairs_test_alpaca.jsonl"
n_sample = len(pairs_responses_direct)

pairs_accuracy_direct = prediction_accuracy(pairs_responses_direct, file_path)
pairs_accuracy_with_knowledge = prediction_accuracy(pairs_responses_with_knowledge, file_path)
pairs_accuracy_pred_explain = prediction_accuracy(pairs_responses_pred_explain,
                                               file_path,
                                               method="predict_then_explain")
pairs_accuracy_explain_pred = prediction_accuracy(pairs_responses_explain_pred,
                                               file_path,
                                               method="explain_then_predict")
p_1 = z_test(n_sample, pairs_accuracy_direct, pairs_accuracy_with_knowledge)
p_2 = z_test(n_sample, pairs_accuracy_direct, pairs_accuracy_pred_explain)
p_3 = z_test(n_sample, pairs_accuracy_direct, pairs_accuracy_explain_pred)

print(f"Accuracy for direct prediction with GPT-3.5 turbo is {pairs_accuracy_direct:.3f}.")
print(f"Accuracy for direct prediction with GPT-3.5 turbo with knowledge is {pairs_accuracy_with_knowledge:.3f}, with p-value: {p_1:.3f}")
print(f"Accuracy for predict-then-explain with GPT-3.5 turbo is {pairs_accuracy_pred_explain:.3f} with p-value: {p_2:.3f}")
print(f"Accuracy for explain-then-predict with GPT-3.5 turbo is {pairs_accuracy_explain_pred:.3f} with p-value: {p_3:.3f}")

Accuracy for direct prediction with GPT-3.5 turbo is 0.485.
Accuracy for direct prediction with GPT-3.5 turbo with knowledge is 0.530, with p-value: 0.368
Accuracy for predict-then-explain with GPT-3.5 turbo is 0.470 with p-value: 0.764
Accuracy for explain-then-predict with GPT-3.5 turbo is 0.455 with p-value: 0.548


## Two Stage Accuracy

In [27]:
file_path = "finetune_llama3/finetune_datasets/op_test_alpaca.jsonl"
n_sample = len(op_two_stage_responses)
op_accuracy_two_stage = prediction_accuracy(op_two_stage_responses, file_path, test_n_sample=n_sample)
print(f"Accuracy for two stage prediction with GPT-3.5 turbo is {op_accuracy_two_stage:.3f}.")

1/200 of the predictions are not in correct format! They will not be included in accuracy.
Accuracy for two stage prediction with GPT-3.5 turbo is 0.487.


In [7]:
file_path = "finetune_llama3/finetune_datasets/pairs_test_alpaca.jsonl"
n_sample = len(paris_two_stage_responses)
pairs_accuracy_two_stage = prediction_accuracy(paris_two_stage_responses, file_path, test_n_sample=n_sample)
print(f"Accuracy for two stage prediction with GPT-3.5 turbo is {pairs_accuracy_two_stage:.3f}.")

# Generate explanations for llama finetuning

In [101]:
def get_explanation_for_finetuning(prompts, truths, dataset="op"):
    responses = []
    for prompt, truth in tqdm(zip(prompts, truths), 
                              desc="Getting GPT3.5 Responses: ", 
                              total=len(prompts)):
        if dataset == "op":
            if truth == "malleable":
                insert = "We know that he/she did get persuaded by some commentators. How might his/her speeching style and lexical features suggest he/she is malleable to persuasion?"
                
            elif truth == "resistant":
                insert = "We know that he/she never get persuaded by others. How might his/her speeching style and lexical features suggest he/she is resistant to persuasion?"
                
            instruction = f"You're a semantic analyst. Now I will show you a person's opinion statement, who publicly announced his/her argument and encouraged other people to challenge it. {insert} Very briefly explain your analysis with no more than 2 paragraphs."
        
        elif dataset == "pairs":
            instruction = f"This is a conversation from an online discussion community. The first was a poster who posted an opinion, and the next two replies were each trying to convince the poster to revise his opinion. We know that the {truth} reply successfully persuaded the poster. How might his/her speeching style and lexical features suggest his/her persuasiveness? {knowledge_pairs} Very briefly explain your analysis with no more than 1000 tokens."
        try:
            response = client.chat.completions.create(
                      model="gpt-3.5-turbo",
                      messages=[
                          {"role":"system", "content": instruction},
                          prompt['body']['messages'][1]
                      ],
                    max_completion_tokens=prompt['body']['max_tokens']
                    )
            responses.append(response)
        except Exception as e:
            print(f"Error processing prompt {prompt['custom_id']}: {e}")
            responses.append(None)
    return responses

In [None]:
op_explanation_prompts = load_prompts("finetune_llama3/finetune_datasets/op_train_alpaca.jsonl")
op_truths = [line["output"] for line in op_explanation_prompts]
op_train_prompts = load_prompts("prompts_datasets/op_train_gpt.jsonl")
op_explanations = get_explanation_for_finetuning(op_train_prompts, op_truths, dataset="op")

In [108]:
pairs_explanation_prompts = load_prompts("finetune_llama3_1/finetune_datasets/pairs_train_alpaca.jsonl")
pairs_truths = [line["output"] for line in pairs_explanation_prompts]
pairs_train_prompts = load_prompts("prompts_datasets/pairs_train_gpt.jsonl")
pairs_explanations = get_explanation_for_finetuning(pairs_train_prompts, pairs_truths, dataset="pairs")

Getting GPT3.5 Responses: 100%|██████████| 1000/1000 [44:55<00:00,  2.70s/it] 


In [117]:
def create_jsonl(explanations, file_path):
    with open(file_path, "w") as f:
        for line in explanations:
            content = line.choices[0].message.content
            json.dump(content, f)
            f.write("\n")

In [None]:
file_path = 'finetune_llama3/finetune_datasets/op_gpt_explanations.jsonl'
create_jsonl(op_explanations, file_path)

In [118]:
file_path = 'finetune_llama3/finetune_datasets/pairs_gpt_explanations.jsonl'
create_jsonl(pairs_explanations, file_path)