# r/ChangeMyView data converter
Converts subreddit data into readable format for ML training

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/LAION-AI/Open-Assistant/blob/main/notebooks/data-augmentation/changemyview-builder/data_processor.ipynb)

In [65]:
### REMEMBER: setup the .env before running this code!

"""CONSTANTS"""

# Set the head number to the amount of entries you want to load in minus one
ENTRIES_COUNT = 10

# Set the threshold for toxic comments to be removed
TOXIC_THRESHOLD = 0.95

In [66]:
# Install any dependencies
!pip install pandas
!pip install praw
!pip install python-dotenv
!pip install pyarrow
!pip install detoxify
!pip install tqdm



In [67]:
import pandas as pd
import praw
import os
from os.path import join, dirname
from dotenv import main

# Make sure you create a .env file and fill in all the necessary information in the same folder as this script!
main.load_dotenv(join(dirname(os.path.realpath('__file__')), '.env'))

reddit = praw.Reddit(
   client_id=os.environ.get("CLIENT_ID"),
   client_secret=os.environ.get("CLIENT_SECRET"),
   user_agent="CMV_Scraper",
)


In [68]:
# load the data
import tarfile
import os.path
import json
import re
from bz2 import BZ2File
from urllib import request
from io import BytesIO

import numpy as np


fname = "cmv.tar.bz2"
url = "https://chenhaot.com/data/cmv/" + fname

# download if not exists
if not os.path.isfile(fname):
    f = BytesIO()
    with request.urlopen(url) as resp, open(fname, 'wb') as f_disk:
        data = resp.read()
        f_disk.write(data)  # save to disk too
        f.write(data)
        f.seek(0)
else:
    f = open(fname, 'rb')




In [69]:
#tar = tarfile.open(fileobj=f, mode="r:bz2")
tar = tarfile.open(fileobj=f, mode="r")

# Extract the file we are interested in

train_fname = "op_task/train_op_data.jsonlist.bz2"
test_fname = "op_task/heldout_op_data.jsonlist.bz2"

train_bzlist = tar.extractfile(train_fname)

  self._buffer = None


In [70]:
# Deserialize the JSON list
original_posts_train = [
    json.loads(line.decode('utf-8'))
    for line in BZ2File(train_bzlist)
]

In [71]:
original_posts_train[:1]

[{'title': "CMV: I shouldn't get a job in this economic climate because it'll be automated anyway; I should just wait for a post-scarcity utopia.",
  'delta_label': False,
  'name': 't3_2rpsl8',
  'selftext': "I think the world is automating fast enough that a utopia will arise where no one will have to work anymore. Within the next 2 decades or so, having a job won't mean much, and most people will be artists and scientists. \n\nMy parents let me live with them, so I can just wait until the utopia happens.\n\nCMV."}]

In [72]:
# Load the jsonlist file into a dataframe
#df = pd.read_json(original_posts_train, orient='list', lines=True)
df = pd.DataFrame(original_posts_train)

In [73]:
# Function to check if the posts still exists on reddit
def try_get_post(post_id):
    try:
        submission = reddit.submission(id=post_id)
        submission.name
        return True
    except Exception as e:
        return False

In [74]:
# Set up the detoxifier model:
from detoxify import Detoxify

In [75]:
import re

# Removes > sign and the template message at the end of a message
def cleanup_body_text(cmv_post):
    lines = [line for line in cmv_post.splitlines()
            if not line.lstrip().startswith("&gt;")
            and not line.lstrip().startswith("____")
            and not line.lstrip().startswith("So go forth and CMV, noble redditors!")
            and "edit" not in " ".join(line.lower().split()[:2])
            ]
    return "\n".join(lines)




# Create the function that will be handling all the data gathering
def get_top_comment_and_clean_data(post_id):
    #print(post_id.lstrip("t3_"))
    last_author = ""
    # Grab the post
    submission = reddit.submission(id=post_id.lstrip("t3_"))
    #print(submission.title)

    # Grab the highest rated comment on root layer
    submission.submission_type = 'best'
    submission.comments.replace_more(limit=0)
    replies = list(submission.comments)[0].replies.list()

    # Just some variables
    pros = []

    # If the post author doesn't exist this submission was deleted (submission.deleted doesn't work)
    if type(submission.author) == type(None):
        last_author = "[deleted]"
    else:
        last_author = submission.author.name

    is_pro_argument = False

    for comment in replies:

        # If redditor object doesn't exist, the account is invalid/deleted
        if type(comment.author) != type(None):
            author = comment.author.name
        else:
            author = "[deleted]"

        # Assume that whenever the user changes, they are countering the previous person
        if author != last_author:
            is_pro_argument = !is_pro_argument

        if author == "[deleted]" or author=="DeltaBot":
            #print("Skipping comment...")
            continue

        # Remove meta and duplicate comments
        comment.body = " ".join([line for line in comment.body.splitlines()
                                  if not re.search(r"(?i)(Change\smy\sview|CMV)", line)
                                  and line not in pros # Why doesn't this line work
                                  ])

        # Sometimes for some reason duplicate entries exist
        # Also remove automated message with "Δ" in it

        if comment.body in pros:
            #print("Skipping duplicate entry")
            continue

        #print("\t\t>>\t",comment.body)

            # Remove toxic comments
        if Detoxify("multilingual").predict(comment.body)["toxicity"] > TOXIC_THRESHOLD:
            #print("Identified toxic comment, ignoring...")
            comment.body = ""

        # Add to the respective argument type        
        if is_pro_argument:
            pros.append(comment.body)
        
        last_author = comment.author.name
        
        # Pros = arguments for the Title of this post
        # Cons = arguments against the title of this post

        pros.append(comment.body)
    return pros

In [76]:
print(f"Loading in {ENTRIES_COUNT} posts")
dataset = df.head(ENTRIES_COUNT)


Loading in 10 posts


In [77]:
# the name column does some weird sh** because dataframes already have a name property, so migrate to a different column name

import warnings
warnings.filterwarnings('ignore')

dataset["post_id"] = dataset["name"]
warnings.filterwarnings('default')

In [78]:
%%time

from tqdm.auto import tqdm
# Reset variables for if we run this multiple times
all_pros = []
all_names = []
all_titles = []
all_sources = []

print("Loading in data... This will take a while.")

for i in tqdm(range(dataset.shape[0])):

    post = dataset.iloc[i]
    modified_title = post.title.replace('CMV', "Change my mind")
    #print(f"\n Loading entry {i+1}/{dataset.shape[0]}:\n\t\"{modified_title}\"")

    if type(post) == type(None):
        continue

    assert(post.post_id != i)

    pros = get_top_comment_and_clean_data(post.post_id)

    if post.title == "[deleted]":
        continue

    pros = " ".join([*set(pros)])
    pros = pros.replace("[deleted]","")

    post.selftext = cleanup_body_text(post.selftext)
    all_titles.append(modified_title + " " + post.selftext)
    all_pros.append(pros)
    all_names.append(post.name)
    all_sources.append(f"https://reddit.com/r/changemyview/comments/{post.post_id}")
    #print(post.title)




Loading in data... This will take a while.


  0%|          | 0/10 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  storage = zip_file.get_storage_from_record(name, numel, torch._UntypedStorage).storage()._untyped()


CPU times: user 7min 49s, sys: 2min 29s, total: 10min 19s
Wall time: 8min 45s


In [83]:
all_pros[1]

'it\'s already been signed. They even claim to be adhering to it, though they\'ve been found to be violating it before. There is no such thing as "de facto acceptance of Israel\'s nuclear program." the Non-Proliferation Treaty is only binding for signatory states. Israel is not a signatory. Article 10 of the NPT allows them to withdraw if they so choose. they have not done so. a whole new country which explicitly has a right to withdraw from the NPT and has not chosen to do so. It\'s more accurate, I think, to say that the problem with Iran here from a legal standpoint is that they aren\'t honoring their own commitments, rather than that they\'re building weapons. They could pull out of the NPT at any time, and the ball would be essentially in America\'s court, because their nuclear program would no longer be illegal by international legal standards. However, Iran insists both on developing nukes *and* remaining an NPT signatory non-nuclear state, and that\'s what makes their program i

In [80]:
# Place it all into a Pandas Dataframe
clean_df = pd.DataFrame({
    "INSTRUCTION": all_titles,
    "RESPONSE": all_pros,
    "SOURCE": all_sources
}, index=all_names
)

In [81]:
# Create Apache Paquete file

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(clean_df)
pq.write_table(table,"output.parquet")

In [82]:
# Test to see if it was sucessful
table = pq.read_table("output.parquet")
table.to_pandas()

Unnamed: 0,INSTRUCTION,RESPONSE,SOURCE
0,Change my mind: I shouldn't get a job in this ...,That is what someone in the 1500s would have s...,https://reddit.com/r/changemyview/comments/t3_...
1,Change my mind: Iran has the right to develop ...,it's already been signed. They even claim to b...,https://reddit.com/r/changemyview/comments/t3_...
2,Change my mind: The events in Paris suck...but...,Hm I guess I made the OP incorrectly. The mai...,https://reddit.com/r/changemyview/comments/t3_...
3,Change my mind: It is ok to hate a religion so...,I don't understand your analogy. Promoting a ...,https://reddit.com/r/changemyview/comments/t3_...
4,Change my mind: There is no productive reason ...,"∆ I hadn't thought it from a ""let's trick peop...",https://reddit.com/r/changemyview/comments/t3_...
5,Change my mind: Diet soda is perfectly healthy...,Thanks for a fresh argument! I hadn't conside...,https://reddit.com/r/changemyview/comments/t3_...
6,Change my mind:Essential Oils are bullshit My ...,Most do. Some smell kinda funky.,https://reddit.com/r/changemyview/comments/t3_...
7,Change my mind: I think the Paris shooting mak...,I already said in different comments that thi...,https://reddit.com/r/changemyview/comments/t3_...
8,Change my mind: Printing an image of the Musli...,The first bacon sandwich came about because 9...,https://reddit.com/r/changemyview/comments/t3_...
9,Change my mind: Philosophy has no tangible val...,>Why restrict it to 50 years? I can name all s...,https://reddit.com/r/changemyview/comments/t3_...
