In [1]:
import tarfile
import os.path
import json
import re
from bz2 import BZ2File
from urllib import request
from io import BytesIO

import numpy as np


fname = "cmv.tar.bz2"
url = "https://chenhaot.com/data/cmv/" + fname

# download if not exists
if not os.path.isfile(fname):
    f = BytesIO()
    with request.urlopen(url) as resp, open(fname, 'wb') as f_disk:
        data = resp.read()
        f_disk.write(data)  # save to disk too
        f.write(data)
        f.seek(0)
else:
    f = open(fname, 'rb')

tar = tarfile.open(fileobj=f, mode="r")


In [2]:
# List all members of the tar file
print("Contents of the archive:")
for member in tar.getmembers():
    print(member.name)

Contents of the archive:
README
all
all/heldout_period_data.jsonlist.bz2
all/train_period_data.jsonlist.bz2
op_task
op_task/heldout_op_data.jsonlist.bz2
op_task/train_op_data.jsonlist.bz2
pair_task
pair_task/train_pair_data.jsonlist.bz2
pair_task/heldout_pair_data.jsonlist.bz2


In [3]:
train_fname = 'pair_task/train_pair_data.jsonlist.bz2'
test_fname = "pair_task/heldout_pair_data.jsonlist.bz2"

train_bzlist = tar.extractfile(train_fname)

# Deserialize the JSON list
pair_argument_train = [
    json.loads(line.decode('utf-8'))
    for line in BZ2File(train_bzlist)
]

test_bzlist = tar.extractfile(test_fname)

pair_argument_test = [
    json.loads(line.decode('utf-8'))
    for line in BZ2File(test_bzlist)
]
f.close()

In [4]:
save_dir = '/data/chenxi/lab_report/data'

# Check if the directory exists, if not, create it
os.makedirs(save_dir, exist_ok=True)

# Save the training data to a JSON file
train_file_path = os.path.join(save_dir, 'train_data.json')
with open(train_file_path, 'w', encoding='utf-8') as f:
    json.dump(pair_argument_train, f, ensure_ascii=False, indent=4)

# Save the test data to a JSON file
test_file_path = os.path.join(save_dir, 'test_data.json')
with open(test_file_path, 'w', encoding='utf-8') as f:
    json.dump(pair_argument_test, f, ensure_ascii=False, indent=4)

In [5]:
print("Number of posts in training dataset:", len(pair_argument_train))
print("Number of posts in testing dataset:", len(pair_argument_test))

Number of posts in training dataset: 3456
Number of posts in testing dataset: 807


In [6]:
print("Keys in a sample record:", pair_argument_train[0].keys())
print("Keys in a sample record:", pair_argument_test[0])

Keys in a sample record: dict_keys(['op_author', 'op_text', 'op_title', 'positive', 'negative', 'op_name'])
Keys in a sample record: {'op_author': '923iwek', 'op_text': 'I\'ll start off by saying I\'m a vegetarian and I also try to limit the eggs and dairy I eat, I try to buy no leather and no other animal byproducts. When there are leftovers from dinners with guests, etc. I\'ll eat that (throwing meat away is definitely worse than eating it). I try to do as much as I can for the environment and animals, I have very little money so I do not donate to animal rights organizations but I certainly would if I could.\n\nI\'m trying to be as reasonable as possible on the subject, I try not to see individuals as evil \'cause they think it\'s normal to torture animals, I understand it\'s a cultural thing although I loathe humanity in general.\n\nI also think the hypothetical contribution of even a billion of the strictest vegans imaginable wouldn\'t change things a bit.\n\nAs of today I\'m maki

In [7]:
def clean_text(text):
    lines = text.split('\n')
    cleaned_lines = []
    for line in lines:
        if line.startswith('>'):
            continue
        if line.startswith('____'):
            continue
        words = line.split()  # Splitting line into words
        if len(words) > 1 and 'edit' in words[:2]:
            continue
        cleaned_lines.append(line)
    return '\n'.join(cleaned_lines)

test_data_file_path = '/data/chenxi/lab_report/data/test_data.json'

with open(test_data_file_path, 'r', encoding='utf-8') as file:
    pair_argument_test = json.load(file)

def prepare_data_for_model(json_data):
    entries = []
    for record in json_data:
        op_title = clean_text(record.get('op_title', ''))
        op_text = clean_text(record.get('op_text', ''))
        positive_comments = record.get('positive', {}).get('comments', [])
        for comment in positive_comments:
            argument = clean_text(comment.get('body', ''))
            entry = {
                'title': op_title,
                'text': op_text,
                'argument': argument,
                'label': 'positive'
            }
            entries.append(entry)

        negative_comments = record.get('negative', {}).get('comments', [])
        for comment in negative_comments:
            argument = clean_text(comment.get('body', ''))
            entry = {
                'title': op_title,
                'text': op_text,
                'argument': argument,
                'label': 'negative'
            }
            entries.append(entry)

    return entries

processed_test_data = prepare_data_for_model(pair_argument_test)

output_file_path = '/data/chenxi/lab_report/data/processed_test_data.json'

with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(processed_test_data, f, ensure_ascii=False, indent=4)
    print(f"Processed data saved to {output_file_path}")

Processed data saved to /data/chenxi/lab_report/data/processed_test_data.json


In [8]:
import openai
from pathlib import Path
import json
from utils import *
from meta_prompt import Explain_Then_Predict, Predict_Then_Explain
import random
import os

file_path = '/data/chenxi/lab_report/data/processed_test_data.json'

with open(file_path, 'r', encoding='utf-8') as f:
    data = json.load(f)

sample_data = random.sample(data, 500)
output_file_path = '/data/chenxi/lab_report/data/sampled_data.json'

with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(sample_data, f, ensure_ascii=False, indent=4)
    print(f"Processed data saved to {output_file_path}")

openai_key_config(key_file = '/data/chenxi/lab_report/api_keys.txt')

def create_prompt(entry, format):
    if format == 'etp':
        return Explain_Then_Predict.etp_prompt.format(title=entry['title'], text=entry['text'], argument=entry['argument'])
    elif format == 'pte':
        return Predict_Then_Explain.pte_prompt.format(title=entry['title'], text=entry['text'], argument=entry['argument']) 
# Iterate over the sampled data and generate responses
responses_etp = []
responses_pte = []
for entry in sample_data:
    etp_prompt = create_prompt(entry, 'etp')
    pte_prompt = create_prompt(entry, 'pte')
    max_length = 1000  # Adjust based on the complexity and expected length of the responses

    response_etp = decoder_for_gpt(etp_prompt, max_length)
    responses_etp.append({'prompt': etp_prompt, 'response': response_etp})

    response_pte = decoder_for_gpt(pte_prompt, max_length)
    responses_pte.append({'prompt': pte_prompt, 'response': response_pte})

# Save responses to a JSON file
output_file_path = '/data/chenxi/lab_report/result/responses_etp.json'
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(responses_etp, f, indent=4)

# Print the generated responses
for idx, resp in enumerate(responses_etp):
    print(f"Response {idx + 1}:")
    print(resp['response'])
    print("-" * 50)  # Separator for readability

# Save responses to a JSON file
output_file_path = '/data/chenxi/lab_report/result/responses_pte.json'
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(responses_pte, f, indent=4)

# Print the generated responses
for idx, resp in enumerate(responses_pte):
    print(f"Response {idx + 1}:")
    print(resp['response'])
    print("-" * 50)  # Separator for readability

INFO:openai:error_code=insufficient_quota error_message='You exceeded your current quota, please check your plan and billing details. For more information on this error, read the docs: https://platform.openai.com/docs/guides/error-codes/api-errors.' error_param=None error_type=insufficient_quota message='OpenAI API error received' stream_error=False


Processed data saved to /data/chenxi/lab_report/data/sampled_data.json
api_key: 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
Response 1:
Explanation: The argument presented attempts to justify the debt by indicating that as long as it's manageable, it's not a problem. However, the OP's main view revolves around the idea that persons with a history of financial mismanagement, particularly in the contexts of inflating personal wealth or accumulating unnecessary debt, should not be trusted with the financial management of an entire country. This argument, therefore, does not specifically address the main concerns of the OP about the possible irresponsible financial behavior of these individuals and the potential implications when this behavior is translated to economic policy and administration.

Prediction: Negative.
--------------------------------------------------
Response 2:
Explanation: The argument provides a slightly different perspective on the issue, by suggesting that "X

In [25]:

from meta_prompt import cot, fs

In [26]:
def create_prompt(entry, format):
    if format == 'cot':
        return cot.cot_prompt.format(title=entry['title'], text=entry['text'], argument=entry['argument'])
    elif format == 'fs':
        return fs.fs_prompt.format(title=entry['title'], text=entry['text'], argument=entry['argument']) 
# Iterate over the sampled data and generate responses
responses_cot = []
responses_fs = []
for entry in sample_data:
    cot_prompt = create_prompt(entry, 'cot')
    fs_prompt = create_prompt(entry, 'fs')
    max_length = 1000  # Adjust based on the complexity and expected length of the responses

    response_cot = decoder_for_gpt(cot_prompt, max_length)
    responses_cot.append({'prompt': cot_prompt, 'response': response_cot})

    response_fs = decoder_for_gpt(fs_prompt, max_length)
    responses_fs.append({'prompt': fs_prompt, 'response': response_fs})

# Save responses to a JSON file
output_file_path = '/data/chenxi/lab_report/result/responses_cot.json'
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(responses_cot, f, indent=4)

# Print the generated responses
for idx, resp in enumerate(responses_cot):
    print(f"Response {idx + 1}:")
    print(resp['response'])
    print("-" * 50)  # Separator for readability

# Save responses to a JSON file
output_file_path = '/data/chenxi/lab_report/result/responses_fs.json'
with open(output_file_path, 'w', encoding='utf-8') as f:
    json.dump(responses_fs, f, indent=4)

# Print the generated responses
for idx, resp in enumerate(responses_fs):
    print(f"Response {idx + 1}:")
    print(resp['response'])
    print("-" * 50)  # Separator for readability

Response 1:
Negative
--------------------------------------------------
Response 2:
negative
--------------------------------------------------
Response 3:
Negative
--------------------------------------------------
Response 4:
Negative
--------------------------------------------------
Response 5:
Negative
--------------------------------------------------
Response 6:
Negative
--------------------------------------------------
Response 7:
Negative
--------------------------------------------------
Response 8:
negative
--------------------------------------------------
Response 9:
Negative
--------------------------------------------------
Response 10:
Negative
--------------------------------------------------
Response 11:
negative
--------------------------------------------------
Response 12:
Negative
--------------------------------------------------
Response 13:
Negative
--------------------------------------------------
Response 14:
Negative
--------------------------------------

### COT

In [27]:
import json

# Load the data from the provided JSON file
file_path = '/data/chenxi/lab_report/result/responses_cot.json'
with open(file_path, 'r') as file:
    data = json.load(file)
import re

# Using regular expressions to extract 'positive' or 'negative' from each response
def extract_prediction_with_regex(response_text):
    match = re.search(r"(Positive|Negative)", response_text)
    if match:
        return match.group(0)
    else:
        return "No Prediction Found"

# Apply the regex function to each item in the data
predictions_regex = [extract_prediction_with_regex(item['response']) for item in data]
# Load the data from the provided JSON file for sampled data
file_path_sampled = '/data/chenxi/lab_report/data/sampled_data.json'
with open(file_path_sampled, 'r') as file:
    data_sampled = json.load(file)

# Extract the labels from the sampled data
labels_sampled = [item['label'] for item in data_sampled]

# Calculating agreement
agreement = sum([pred.lower() == label.lower() for pred, label in zip(predictions_regex, labels_sampled)]) / len(labels_sampled) * 100

agreement

41.0

### few-shot

In [None]:
import json

# Load the data from the provided JSON file
file_path = '/data/chenxi/lab_report/result/responses_fs.json'
with open(file_path, 'r') as file:
    data = json.load(file)
import re

# Using regular expressions to extract 'positive' or 'negative' from each response
def extract_prediction_with_regex(response_text):
    match = re.search(r"(Positive|Negative)", response_text)
    if match:
        return match.group(0)
    else:
        return "No Prediction Found"

# Apply the regex function to each item in the data
predictions_regex = [extract_prediction_with_regex(item['response']) for item in data]
# Load the data from the provided JSON file for sampled data
file_path_sampled = '/data/chenxi/lab_report/data/sampled_data.json'
with open(file_path_sampled, 'r') as file:
    data_sampled = json.load(file)

# Extract the labels from the sampled data
labels_sampled = [item['label'] for item in data_sampled]

# Calculating agreement
agreement = sum([pred.lower() == label.lower() for pred, label in zip(predictions_regex, labels_sampled)]) / len(labels_sampled) * 100

agreement

### Explanation then Prediction

In [15]:
import json

# Load the data from the provided JSON file
file_path = '/data/chenxi/lab_report/result/responses_etp.json'
with open(file_path, 'r') as file:
    data = json.load(file)
import re

# Using regular expressions to extract 'positive' or 'negative' from each response
def extract_prediction_with_regex(response_text):
    match = re.search(r"(Positive|Negative)", response_text)
    if match:
        return match.group(0)
    else:
        return "No Prediction Found"

# Apply the regex function to each item in the data
predictions_regex = [extract_prediction_with_regex(item['response']) for item in data]


In [16]:
# Load the data from the provided JSON file for sampled data
file_path_sampled = '/data/chenxi/lab_report/data/sampled_data.json'
with open(file_path_sampled, 'r') as file:
    data_sampled = json.load(file)

# Extract the labels from the sampled data
labels_sampled = [item['label'] for item in data_sampled]

# Calculating agreement
agreement = sum([pred.lower() == label.lower() for pred, label in zip(predictions_regex, labels_sampled)]) / len(labels_sampled) * 100

agreement


40.400000000000006

### Prediction then Explanation

In [17]:
import json

# Load the data from the provided JSON file
file_path = '/data/chenxi/lab_report/result/responses_pte.json'
with open(file_path, 'r') as file:
    data = json.load(file)
import re

# Using regular expressions to extract 'positive' or 'negative' from each response
def extract_prediction_with_regex(response_text):
    match = re.search(r"(Positive|Negative)", response_text)
    if match:
        return match.group(0)
    else:
        return "No Prediction Found"

# Apply the regex function to each item in the data
predictions_regex = [extract_prediction_with_regex(item['response']) for item in data]

# Load the data from the provided JSON file for sampled data
file_path_sampled = '/data/chenxi/lab_report/data/sampled_data.json'
with open(file_path_sampled, 'r') as file:
    data_sampled = json.load(file)

# Extract the labels from the sampled data
labels_sampled = [item['label'] for item in data_sampled]

# Now, we have predictions from the previous step and labels from sampled data
# We need to compare these two lists to determine the agreement rate

# Calculating agreement
agreement = sum([pred.lower() == label.lower() for pred, label in zip(predictions_regex, labels_sampled)]) / len(labels_sampled) * 100

agreement


45.2

### Chain of Thought