In [105]:
# Install any dependencies
!pip install pandas
!pip install praw
!pip install python-dotenv
!pip install pyarrow
!pip install detoxify



  return process_handler(cmd, _system_body)
  return process_handler(cmd, _system_body)
  return process_handler(cmd, _system_body)












In [106]:
# Set the head number to the amount of entries you want to load in minus one
ENTRIES_COUNT = 100

# Set the threshold for toxic comments to be removed
TOXIC_THRESHOLD = 0.95

In [107]:
import pandas as pd
import praw
import os
from os.path import join, dirname
from dotenv import main

# Make sure you create a .env file and fill in all the necessary information in the same folder as this script!
main.load_dotenv(join(dirname(os.path.realpath('__file__')), '.env'))

reddit = praw.Reddit(
   client_id=os.environ.get("CLIENT_ID"),
   client_secret=os.environ.get("CLIENT_SECRET"),
   user_agent="CMV_Scraper",
)


In [108]:
# load the data
import tarfile
import os.path
import json
import re
from bz2 import BZ2File
from urllib import request
from io import BytesIO

import numpy as np


fname = "cmv.tar.bz2"
url = "https://chenhaot.com/data/cmv/" + fname

# download if not exists
if not os.path.isfile(fname):
    f = BytesIO()
    with request.urlopen(url) as resp, open(fname, 'wb') as f_disk:
        data = resp.read()
        f_disk.write(data)  # save to disk too
        f.write(data)
        f.seek(0)
else:
    f = open(fname, 'rb')




In [109]:
#tar = tarfile.open(fileobj=f, mode="r:bz2")
tar = tarfile.open(fileobj=f, mode="r")

# Extract the file we are interested in

train_fname = "op_task/train_op_data.jsonlist.bz2"
test_fname = "op_task/heldout_op_data.jsonlist.bz2"

train_bzlist = tar.extractfile(train_fname)

  self._buffer = None


In [110]:
# Deserialize the JSON list
original_posts_train = [
    json.loads(line.decode('utf-8'))
    for line in BZ2File(train_bzlist)
]

In [111]:
original_posts_train

[{'title': "CMV: I shouldn't get a job in this economic climate because it'll be automated anyway; I should just wait for a post-scarcity utopia.",
  'delta_label': False,
  'name': 't3_2rpsl8',
  'selftext': "I think the world is automating fast enough that a utopia will arise where no one will have to work anymore. Within the next 2 decades or so, having a job won't mean much, and most people will be artists and scientists. \n\nMy parents let me live with them, so I can just wait until the utopia happens.\n\nCMV."},
 {'title': 'CMV: Iran has the right to develop nuclear weapons',
  'delta_label': False,
  'name': 't3_2rpfn7',
  'selftext': "First off, I do not believe that Iran *should* have nuclear weapons. In fact, I believe Iran having nuclear weapons makes the world less safe overall. However, I believe that as a sovereign nation they have the right to develop nuclear weapons if they so choose.\n\nWhy do I believe this:\n\n1. It is in Iran's best strategic interests to develop nu

In [112]:
# Load the jsonlist file into a dataframe
#df = pd.read_json(original_posts_train, orient='list', lines=True)
df = pd.DataFrame(original_posts_train)

In [113]:
# Function to check if the posts still exists on reddit
def try_get_post(post_id):
    try:
        submission = reddit.submission(id=post_id)
        submission.name
        return True
    except Exception as e:
        return False

In [114]:
# Set up the detoxifier model:
from detoxify import Detoxify

In [115]:
import re

# Removes > sign and the template message at the end of a message
def cleanup_body_text(cmv_post):
    lines = [line for line in cmv_post.splitlines()
            if not line.lstrip().startswith("&gt;")
            and not line.lstrip().startswith("____")
            and not line.lstrip().startswith("So go forth and CMV, noble redditors!")
            and "edit" not in " ".join(line.lower().split()[:2])
            ]
    #print(lines)
    return "\n".join(lines)




# Create the function that will be handling all the data gathering
def get_top_comment_and_clean_data(post_id):
    #print(post_id.lstrip("t3_"))
    last_author = ""
    # Grab the post
    submission = reddit.submission(id=post_id.lstrip("t3_"))
    #print(submission.title)

    # Grab the highest rated comment on root layer
    submission.submission_type = 'best'
    submission.comments.replace_more(limit=0)
    replies = list(submission.comments)[0].replies.list()

    # Just some variables
    pros = []
    dextoxify_labels = []

    # If the post author doesn't exist this submission was deleted (submission.deleted doesn't work)
    if type(submission.author) == type(None):
        last_author = "[deleted]"
    else:
        last_author = submission.author.name

    is_pro_argument = False

    for comment in replies:

        # If redditor object doesn't exist, the account is invalid/deleted
        if type(comment.author) != type(None):
            author = comment.author.name
        else:
            author = "[deleted]"

        # Assume that whenever the user changes, they are countering the previous person
        if author != last_author:
            is_pro_argument = !is_pro_argument

        if author == "[deleted]" or author=="DeltaBot":
            #print("Skipping comment...")
            continue

        # Remove meta and duplicate comments
        comment.body = " ".join([line for line in comment.body.splitlines()
                                  if not re.search(r"(?i)(Change\smy\sview|CMV)", line)
                                  and line not in pros # Why doesn't this line work
                                  ])

        # Sometimes for some reason duplicate entries exist
        # Also remove automated message with "Δ" in it

        if comment.body in pros:
            #print("Skipping duplicate entry")
            continue

        print("\t\t>>\t",comment.body)

        # Remove toxic comments
        # if Detoxify("multilingual").predict(comment.body)["dextoxify_labels"] > TOXIC_THRESHOLD:
        #     #print(comment.body)
        #     print("Identified toxic comment, ignoring...")
        #     comment.body = ""


        # Add to the respective argument type        
        if is_pro_argument:
            pros.append(comment.body)
        
        last_author = comment.author.name
        
        # Pros = arguments for the Title of this post
        # Cons = arguments against the title of this post

        pros.append(comment.body)
        #print(responses)
        dextoxify_labels.append({"detoxify_labels": Detoxify("multilingual").predict(comment.body)})
    return pros, dextoxify_labels

In [116]:
print(f"Loading in {ENTRIES_COUNT} posts")
dataset = df.head(ENTRIES_COUNT)


Loading in 100 posts


In [117]:
# the name column does some weird sh** because dataframes already have a name property, so migrate to a different column name

import warnings
warnings.filterwarnings('ignore')

dataset["post_id"] = dataset["name"]
warnings.filterwarnings('default')

In [118]:
%%time

# Reset variables for if we run this multiple times
all_pros = []
all_names = []
all_titles = []
all_sources = []
all_metadata = []

# load in our data. this will take a while.

for i in range(dataset.shape[0]):

    post = dataset.iloc[i]
    modified_title = post.title.replace('CMV', "Change my mind")
    print(f"\n Loading entry {i+1}/{dataset.shape[0]}:\n\t\"{modified_title}\"")

    if type(post) == type(None):
        continue

    assert(post.post_id != i)

    pros, dextoxify_labels = get_top_comment_and_clean_data(post.post_id)

    # if type(post.name) == int:
    #     continue
    # if type(pros) == int:
    #     continue
    if post.title == "[deleted]":
        continue

    pros = [*set(pros)]
    pros = [pro.replace("[deleted]","") for pro in pros]

    post.selftext = cleanup_body_text(post.selftext)
    all_titles.append(modified_title + " " + post.selftext)
    all_pros.append(pros)
    all_names.append(post.name)
    all_sources.append(f"https://reddit.com/r/changemyview/comments/{post.post_id}")
    all_metadata.append(dextoxify_labels)
    #print(post.title)





 Loading entry 1/100:
	"Change my mind: I shouldn't get a job in this economic climate because it'll be automated anyway; I should just wait for a post-scarcity utopia."
		>>	 You are also assuming there will ever be a "post scarcity" period...  There is very little data supporting the idea that automation = less job opportunity overall.  There is some data that automation advances in a a given industry can reduce job opportunities in that industry for a period however there are very few instances where this is not made up for in scale. I.E. higher automation leads to growth in the industry which then leads to it hiring more people again, despite needing less people per unit produced. 
		>>	 Two things: A post scarcity economy is not defined by the absence of jobs, but by the lack of economic need for them.   Secondly, the automation I am referring to does not have a historical precedence. This is not about replacing a weak human welder with a untiring robot, this is about replacing t

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



 Loading entry 2/100:
	"Change my mind: Iran has the right to develop nuclear weapons"
		>>	 There is no such thing as "de facto acceptance of Israel's nuclear program." the Non-Proliferation Treaty is only binding for signatory states. Israel is not a signatory.
		>>	 because international law doesn't require states to sign treaties, it only requires them to adhere to treaties they've already signed. Israel isn't defying the UN, at least not in this particular case. Think of the NPT less like a standard law within a state and more like a contract. Once you've signed, you're bound by the contract, but if you never sign it then you haven't broken a law, you've just decided not to agree to the terms you were offered.
		>>	 it's already been signed. They even claim to be adhering to it, though they've been found to be violating it before.
		>>	 > Because Iran did sign the treaty, and thus are bound by it. They signed on July 1, 1968.  Hmm. So is the argument here that it's not "ok" for I

In [119]:
# Place it all into a Pandas Dataframe
clean_df = pd.DataFrame({
    "INSTRUCTION": all_titles,
    "RESPONSE": all_pros,
    "SOURCE": all_sources,
    "METADATA": all_metadata
}, index=all_names
)

In [120]:
# Create Apache Paquete file

import pyarrow as pa
import pyarrow.parquet as pq

table = pa.Table.from_pandas(clean_df)
pq.write_table(table,"output.parquet")

In [121]:
# Test to see if it was sucessful
table = pq.read_table("output.parquet")
table.to_pandas()

Unnamed: 0,INSTRUCTION,RESPONSE,SOURCE,METADATA
0,Change my mind: I shouldn't get a job in this ...,"[No, resources would still be consumed, just w...",https://reddit.com/r/changemyview/comments/t3_...,[{'detoxify_labels': {'identity_attack': 9.972...
1,Change my mind: Iran has the right to develop ...,"[There is no such thing as ""de facto acceptanc...",https://reddit.com/r/changemyview/comments/t3_...,[{'detoxify_labels': {'identity_attack': 0.000...
2,Change my mind: The events in Paris suck...but...,[> random/semi-irrational posts Could you el...,https://reddit.com/r/changemyview/comments/t3_...,[{'detoxify_labels': {'identity_attack': 0.000...
3,Change my mind: It is ok to hate a religion so...,[Just as a side Christianity isn't your best e...,https://reddit.com/r/changemyview/comments/t3_...,[{'detoxify_labels': {'identity_attack': 0.001...
4,Change my mind: There is no productive reason ...,"[What God? Most of the founders were Deist. ,...",https://reddit.com/r/changemyview/comments/t3_...,[{'detoxify_labels': {'identity_attack': 0.000...
...,...,...,...,...
95,Change my mind: Riots and looting in Ferguson ...,"[The post was about the initial riots, and not...",https://reddit.com/r/changemyview/comments/t3_...,[{'detoxify_labels': {'identity_attack': 9.832...
96,Change my mind: I think we should get rid of r...,[Huh. I had thought that I was describing suc...,https://reddit.com/r/changemyview/comments/t3_...,[{'detoxify_labels': {'identity_attack': 0.000...
97,Change my mind:I believe that the British Mona...,[Why they would or why they wouldn't? You nee...,https://reddit.com/r/changemyview/comments/t3_...,[{'detoxify_labels': {'identity_attack': 7.260...
98,Change my mind: Female-only gyms and women's s...,"[Yeah, but not because of some arbitrary shit ...",https://reddit.com/r/changemyview/comments/t3_...,[{'detoxify_labels': {'identity_attack': 0.001...
