In [2]:
# load the data
import tarfile
import os.path
import json
import re
from bz2 import BZ2File
from urllib import request
from io import BytesIO

import numpy as np
import openai
import pandas as pd

fname = "cmv.tar.bz2"
url = "https://chenhaot.com/data/cmv/" + fname

# Download the archive once and cache it on disk.
# The payload is fetched completely and written to a temporary name first,
# then moved into place, so an interrupted download can never leave a
# truncated `fname` behind that later runs would mistake for a valid cache.
if not os.path.isfile(fname):
    with request.urlopen(url) as resp:
        data = resp.read()
    partial_fname = fname + ".part"
    with open(partial_fname, 'wb') as f_disk:
        f_disk.write(data)
    os.replace(partial_fname, fname)

# Archive members we are interested in
train_fname = "op_task/train_op_data.jsonlist.bz2"
test_fname = "op_task/heldout_op_data.jsonlist.bz2"


def _read_jsonlist(tar, member_name):
    """Return the JSON records stored in a bz2-compressed member of ``tar``.

    Each line of the decompressed member is decoded as UTF-8 and parsed as
    one JSON document. Both the member handle and the decompressor are
    closed before returning.
    """
    with tar.extractfile(member_name) as member, BZ2File(member) as lines:
        return [json.loads(line.decode('utf-8')) for line in lines]


# The context manager closes the archive even if parsing fails.
with tarfile.open(fname, mode="r") as tar:
    original_posts_train = _read_jsonlist(tar, train_fname)
    original_posts_test = _read_jsonlist(tar, test_fname)

In [3]:
def cleanup(cmv_post):
    """Drop quoted lines, footer rules and edit notes from a post body.

    A line is discarded when, ignoring leading whitespace, it starts with
    ``&gt;`` or ``____``, or when the substring "edit" occurs
    (case-insensitively) within its first two whitespace-separated words.
    The surviving lines are re-joined with newlines.
    """
    def is_noise(text):
        # Quote markers and footer rules are detected after leading blanks.
        if text.lstrip().startswith(("&gt;", "____")):
            return True
        # Substring match on the first two words, e.g. "EDIT:" or "My edit".
        leading_words = text.lower().split()[:2]
        return "edit" in " ".join(leading_words)

    kept = []
    for text in cmv_post.splitlines():
        if not is_noise(text):
            kept.append(text)
    return "\n".join(kept)

def clean_dataset(dataset):
    """Apply ``cleanup`` to the ``selftext`` field of every post.

    The post dictionaries are updated in place; the same ``dataset`` object
    is returned so the call can be used in an assignment.
    """
    for post in dataset:
        post['selftext'] = cleanup(post['selftext'])
    return dataset

# Clean the text of both splits (training first, then held-out).
original_posts_train, original_posts_test = map(
    clean_dataset, (original_posts_train, original_posts_test)
)

In [4]:
# Tabulate the cleaned training posts, keeping only the four fields of interest,
# and preview the first rows.
post_columns = ['title', 'delta_label', 'name', 'selftext']
df = pd.DataFrame(original_posts_train, columns=post_columns)
df.head(5)

Unnamed: 0,title,delta_label,name,selftext
0,CMV: I shouldn't get a job in this economic cl...,False,t3_2rpsl8,I think the world is automating fast enough th...
1,CMV: Iran has the right to develop nuclear wea...,False,t3_2rpfn7,"First off, I do not believe that Iran *should*..."
2,CMV: The events in Paris suck...but the comic ...,False,t3_2rpevf,Please leave the footnote below the following ...
3,CMV: It is ok to hate a religion so long as yo...,False,t3_2rpcgr,It seems to me that it is entirely justified t...
4,"CMV: There is no productive reason to have, ""U...",False,t3_2romiq,"The, ""Under God"" line is actually a relatively..."


In [5]:
def make_balanced_dataset(data, n_samples, random_state=42):
    """Build a shuffled, class-balanced sample of posts.

    Parameters
    ----------
    data : list of dict
        Post records; the keys ``title``, ``delta_label``, ``name`` and
        ``selftext`` are kept as columns.
    n_samples : int
        Number of rows to draw from EACH class, so the result has
        ``2 * n_samples`` rows.
    random_state : int, default 42
        Seed used for both per-class draws and for the final shuffle, so
        repeated calls return identical frames.

    Returns
    -------
    pandas.DataFrame
        ``n_samples`` rows with a truthy ``delta_label`` and ``n_samples``
        rows with a falsy one, shuffled, with a fresh 0..N-1 index.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a class has fewer than
        ``n_samples`` rows.
    """
    df = pd.DataFrame(data, columns=['title', 'delta_label', 'name', 'selftext'])
    is_malleable = df['delta_label']
    malleable = df[is_malleable].sample(n=n_samples, random_state=random_state)
    not_malleable = df[~is_malleable].sample(n=n_samples, random_state=random_state)
    df_balanced = pd.concat([malleable, not_malleable])
    # Seed the shuffle as well: an unseeded shuffle made the row order (and
    # therefore the request ids derived from it) differ on every run.
    df_balanced = df_balanced.sample(frac=1, random_state=random_state)
    return df_balanced.reset_index(drop=True)

# Total number of posts wanted per split; each class supplies half of them.
n_samples_train = 500
n_samples_test = 100

op_train = make_balanced_dataset(original_posts_train, n_samples_train // 2)
op_test = make_balanced_dataset(original_posts_test, n_samples_test // 2)

In [6]:
# Spot-check the titles of the first two balanced training posts.
for title in op_train['title'].head(2):
    print(title)

CMV: The anger and vitriol towards Unidan is overdramatic
I belive that everyone is naturally extroverted. CMV


In [7]:
def create_jsonl(df, filename, prefix, model="gpt-3.5-turbo-0125", max_tokens=1000):
    """Write one chat-completion batch request per row of ``df`` as JSON Lines.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``title`` and ``selftext`` columns.
    filename : str
        Output path; the file is overwritten.
    prefix : str
        Instruction text placed before each post in the prompt.
    model : str, default "gpt-3.5-turbo-0125"
        Model name written into every request body.
    max_tokens : int, default 1000
        Completion token limit written into every request body.

    Each output line is a JSON object with ``custom_id`` (built from the
    row's index label), ``method``, ``url`` and ``body``.
    """
    with open(filename, 'w', encoding='utf-8') as f:
        for idx, row in df.iterrows():
            # Title and body are separated by a real newline. The previous
            # f-string embedded the literal text " + \n + " in every prompt.
            prompt = f"{prefix}{row['title']}\n{row['selftext']}"
            request_content = {
                "custom_id": f"request={idx}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "messages": [{"role": "user", "content": prompt}],
                    "max_tokens": max_tokens,
                },
            }
            json.dump(request_content, f)
            f.write('\n')

# Instruction text that precedes every post in the prompt sent to the model.
prefix = (
    "You're a semantic analyst. "
    "Now I will show you a person's opinion statement. "
    "We know that the person publicly announced his/her argument "
    "and encouraged other people to challenge it. "
    "Judging from the following context, do you think he/she is "
    "resistant or malleable to persuasion? "
    "Answer only with 'malleable' or 'resistant'. \n text: \n"
)

# Serialize the balanced training posts as batch requests.
create_jsonl(op_train, 'op_train.jsonl', prefix)

In [8]:
import os

from openai import OpenAI

# SECURITY: credentials must never be stored in the notebook. The key that
# was previously hardcoded here has been exposed and should be revoked.
# Supply a fresh key through the OPENAI_API_KEY environment variable.
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise RuntimeError(
        "OPENAI_API_KEY is not set; export it before running this cell."
    )

client = OpenAI(api_key=api_key)

# Upload the request file for batch processing; the context manager closes
# the local file handle once the upload call returns.
with open("op_train.jsonl", "rb") as batch_file:
    batch_input_file = client.files.create(
        file=batch_file,
        purpose="batch",
    )

In [9]:
# Submit the uploaded request file as a batch job against the chat
# completions endpoint; the returned batch object is the cell's output.
batch_input_file_id = batch_input_file.id

batch_options = dict(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={"description": "nightly eval job"},
)
client.batches.create(**batch_options)

BadRequestError: Error code: 400 - {'error': {'message': 'Billing hard limit has been reached', 'type': 'invalid_request_error', 'param': None, 'code': 'billing_hard_limit_reached'}}