In [19]:
# load the data
import tarfile
import os.path
import json
import re
from bz2 import BZ2File
from urllib import request
from io import BytesIO

import numpy as np


fname = "cmv.tar.bz2"
url = "https://chenhaot.com/data/cmv/" + fname

# Download the archive on first use; later runs reuse the copy cached on disk.
if not os.path.isfile(fname):
    # Fetch the complete payload before touching the disk.
    with request.urlopen(url) as resp:
        data = resp.read()
    # Write under a temporary name and rename only once the write succeeded, so
    # an interrupted run never leaves an empty or truncated `fname` behind that
    # the `os.path.isfile` check above would mistake for a valid cache.
    partial_fname = fname + ".part"
    with open(partial_fname, 'wb') as f_disk:
        f_disk.write(data)  # save to disk too
    os.replace(partial_fname, fname)
    f = BytesIO(data)  # in-memory copy, positioned at the start
else:
    f = open(fname, 'rb')


def _read_jsonlist(bz_fileobj):
    """Parse a bz2-compressed file object that holds one JSON document per line.

    Parameters
    ----------
    bz_fileobj : binary file object
        Compressed stream, e.g. the object returned by `TarFile.extractfile`.

    Returns
    -------
    list
        The decoded JSON documents, in file order.
    """
    with BZ2File(bz_fileobj) as bz_lines:
        return [json.loads(line.decode('utf-8')) for line in bz_lines]


# Archive members we are interested in
train_fname = "op_task/train_op_data.jsonlist.bz2"
test_fname = "op_task/heldout_op_data.jsonlist.bz2"

# The underlying file is closed even when opening or parsing fails. The
# TarFile object itself is deliberately left open: the later cell that lists
# the archive members still calls `tar.getnames()`.
try:
    tar = tarfile.open(fileobj=f, mode="r")

    train_bzlist = tar.extractfile(train_fname)
    original_posts_train = _read_jsonlist(train_bzlist)

    test_bzlist = tar.extractfile(test_fname)
    original_posts_test = _read_jsonlist(test_bzlist)
finally:
    f.close()

In [52]:
# Names of every member stored in the archive
file_names = tar.getnames()

# One name per line; nothing is printed when the archive has no members
if file_names:
    print(*file_names, sep="\n")

README
all
all/heldout_period_data.jsonlist.bz2
all/train_period_data.jsonlist.bz2
op_task
op_task/heldout_op_data.jsonlist.bz2
op_task/train_op_data.jsonlist.bz2
pair_task
pair_task/train_pair_data.jsonlist.bz2
pair_task/heldout_pair_data.jsonlist.bz2


In [61]:
from IPython.display import Markdown

def show_post(cmv_post):
    """Render a post as a Markdown blockquote.

    Parameters
    ----------
    cmv_post : mapping
        Must provide the keys ``title`` and ``selftext``; other keys are ignored.

    Returns
    -------
    IPython.display.Markdown
        The bold title followed by the body, with every line prefixed by "> ".
    """
    rendered = "**{title}** \n\n {selftext}".format(**cmv_post)
    quoted_lines = []
    for text_line in rendered.splitlines():
        quoted_lines.append("> " + text_line)
    return Markdown("\n".join(quoted_lines))
# Preview one training post (index 250) as rendered Markdown
show_post(original_posts_train[250])

> **CMV: "Classic" actors in older movies were absolutely terrible at their art.** 
> 
>  Old movies aren't typically my taste, but I've recently watched *Gone with the Wind* and *What Ever Happened to Baby Jane?* and I think the acting in these movies (by supposedly "great" actors) is awful. There is zero subtlety and everything is over-acted. It's like watching a bunch of five-year-olds playing pretend. I've noticed it particularly with *Gone with the Wind* because the book is one of my all-time favorites and I was so disappointed with the movie. There are so many subtle layers of emotion in the book that could have at least partially been conveyed on-screen with facial expressions and body language, but they skipped all of that and painted things with the broadest brush possible. Even Clark Gable's famous line, "Frankly my dear, I don't give a damn" came across only as angry, instead of how it should have been - soft, pain-filled, and resigned. So, change my view. Tell me why you think these "classic" actors were great.
> 
> _____
> 
> &gt; *Hello, users of CMV! This is a footnote from your moderators. We'd just like to remind you of a couple of things. Firstly, please remember to* ***[read through our rules](http://www.reddit.com/r/changemyview/wiki/rules)***. *If you see a comment that has broken one, it is more effective to report it than downvote it. Speaking of which,* ***[downvotes don't change views](http://www.reddit.com/r/changemyview/wiki/guidelines#wiki_upvoting.2Fdownvoting)****! If you are thinking about submitting a CMV yourself, please have a look through our* ***[popular topics wiki](http://www.reddit.com/r/changemyview/wiki/populartopics)*** *first. Any questions or concerns? Feel free to* ***[message us](http://www.reddit.com/message/compose?to=/r/changemyview)***. *Happy CMVing!*

In [62]:
# Fields available on a post record
print(original_posts_train[250].keys())

# Mean number of characters in the `selftext` of the training posts
total_chars = sum(len(post['selftext']) for post in original_posts_train)
average_length = total_chars / len(original_posts_train)
print(f'Average OP length: {average_length:.3f}')

dict_keys(['title', 'delta_label', 'name', 'selftext'])
Average OP length: 1838.071


In [56]:
# Number of posts in the training split
len(original_posts_train)

10743

In [89]:
# `delta_label` of training post 243; this field is later used as the
# classification target (`is_malleable`)
original_posts_train[243]["delta_label"]

True

In [86]:
# The `name` field is split on "_" into a prefix and the post id; only the id
# goes into the URL. The prefix is bound to a regular name rather than `_`,
# because assigning to `_` shadows IPython's "last output" variable for the
# rest of the session.
type_prefix, post_id = original_posts_train[250]['name'].split("_")
print("http://reddit.com/r/changemyview/comments/{}".format(post_id))

http://reddit.com/r/changemyview/comments/2plncc


In [88]:
def cleanup(cmv_post):
    """Strip boilerplate lines from the body text of a CMV post.

    A line is dropped when it
      * starts (after leading whitespace) with ``&gt;``, the HTML-escaped
        ``>`` that marks a quoted line,
      * starts (after leading whitespace) with ``____``, or
      * has a word beginning with "edit" among its first two
        whitespace-separated words, case-insensitively (``EDIT:``,
        ``*edited*``, ``edit2`` ...).

    Parameters
    ----------
    cmv_post : str
        Raw text of the post.

    Returns
    -------
    str
        The remaining lines, joined with newlines.
    """
    def is_boilerplate(line):
        stripped = line.lstrip()
        if stripped.startswith(("&gt;", "____")):
            return True
        leading_words = " ".join(line.lower().split()[:2])
        # "edit" must not be preceded by a letter: a plain substring test also
        # discards ordinary sentences that open with words such as "credit" or
        # "unedited". Markup in front of the marker ("*edit", "_edit", "(edit")
        # still matches because those characters are not letters.
        return re.search(r"(?<![a-z])edit", leading_words) is not None

    return "\n".join(
        line for line in cmv_post.splitlines() if not is_boilerplate(line)
    )
# Preview training post 243 with its body passed through `cleanup`
example_post = original_posts_train[243]
show_post({
    'title': example_post['title'],
    'selftext': cleanup(example_post['selftext']),
})

> **CMV: Movie culture is overrated because it is neither diverse nor creative.** 
> 
>  
>  When I say movie and movie industry I am only expressing movies with real life actors. I see more prevalent diversity and creativity in cartoons and anime.
> 
> With every new movie I feel as though I have seen it before. This sense grows with every movie. I appreciate the subtle tricks directors throw in here and there to tilt the angle of the audience's perception of the scene. This is the beauty of the art of film-making. However, I do not appreciate the redundancy of obvious filming tricks that seem to slip by others while I see them as cut-copy. 
> 
> I loathe actors. I think the acting paradigm is stale because of the way the movie industry operates (recycling the same actors and suppressing the unique to guarantee another successful movie).
> 
> I'm also greatly disturbed by the way people rate movies. People never have an accurate critique on whether the movie was "good" or "bad". I believe their decision of the movie's goodness is based on things unrelated to the movie itself such as preset confirmation bias due to certain actors or directors associated with the film, confirmation bias due to a friend's 'review', group think, or even intruding mood. 
> 
> 

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split  # Updated import
from sklearn.linear_model import LogisticRegressionCV
from sklearn.metrics import roc_auc_score  # Updated import

In [28]:
# prepare the data in Python list form
# Cleaned body text of every post, train split first and heldout split second
op_text_train, op_text_test = (
    [cleanup(post['selftext']) for post in posts]
    for posts in (original_posts_train, original_posts_test)
)


In [40]:
# Vectorizing the text
# No IDF weighting and L1 row normalisation: each document is represented by
# its relative term frequencies.
vect = TfidfVectorizer(norm='l1', use_idf=False)
X_train = vect.fit_transform(op_text_train)

n_terms = len(vect.vocabulary_)
print(f"X_train shape: {X_train.shape}")
print(f"Vocabulary size: {n_terms}")

X_train shape: (10743, 48296)
Vocabulary size: 48296


In [None]:
# Keep the fitted vocabulary size in a variable and report it (the same figure
# is also printed by the vectorizing cell)
vocab_size = len(vect.vocabulary_)
print(f"Vocabulary size: {vocab_size}")

In [36]:
# Logistic Regression with Cross-Validation

# Target vector: the `delta_label` field of every training post
is_malleable = np.array([op["delta_label"] for op in original_posts_train])

cv_params = dict(
    Cs=10,
    class_weight='balanced',
    scoring='roc_auc',
    solver='saga',  # 'saga' for better scalability
    tol=0.001,
    max_iter=500,
    random_state=0,
)
lr = LogisticRegressionCV(**cv_params)
lr.fit(X_train, is_malleable)

In [37]:
# True labels of the heldout posts, read from the same field as the training target
is_malleable_test = np.array([op["delta_label"] for op in original_posts_test])

# Vectorize the heldout text with the vocabulary fitted on the training text
X_test = vect.transform(op_text_test)

# Rank heldout posts by column 1 of `predict_proba` and score the ranking
positive_proba = lr.predict_proba(X_test)[:, 1]
test_roc = roc_auc_score(is_malleable_test, positive_proba)

print("Test ROC AUC score: {:.3f}".format(test_roc))

Test ROC AUC score: 0.531


In [42]:
def top_features(coef, words, n=15):
    """Print the largest and smallest coefficients side by side.

    Parameters
    ----------
    coef : 1-D numpy array
        Coefficient values.
    words : sequence
        Label for each coefficient, indexed like `coef`.
    n : int, default 15
        Number of rows to print.

    Each printed row shows the k-th largest coefficient with its label on the
    left and the k-th smallest coefficient with its label on the right.
    """
    ranking = np.argsort(-coef)   # indices ordered from largest to smallest value
    largest = ranking[:n]
    smallest = ranking[::-1][:n]  # walk the same ranking from the other end

    row_template = "[{:>4.1f}] {:<20} [{:>4.1f}] {:<20}"
    for hi, lo in zip(largest, smallest):
        print(row_template.format(coef[hi], words[hi], coef[lo], words[lo]))

# Flatten the fitted coefficients and pair them with the vectorizer's
# feature names for display
coefficients = lr.coef_.flatten()
feature_names = vect.get_feature_names_out()
top_features(coefficients, feature_names)

[ 8.9] so                   [-9.1] its                 
[ 7.2] even                 [-5.5] having              
[ 5.5] feel                 [-5.3] anyone              
[ 5.5] would                [-5.2] government          
[ 5.4] being                [-5.2] our                 
[ 5.4] more                 [-5.2] right               
[ 5.3] please               [-5.1] anything            
[ 5.2] ll                   [-5.0] nothing             
[ 5.2] doesn                [-5.0] certain             
[ 5.2] hard                 [-5.0] world               
[ 5.1] when                 [-4.9] minimum             
[ 4.9] system               [-4.8] we                  
[ 4.6] philosophy           [-4.7] men                 
[ 4.5] subject              [-4.6] life                
[ 4.4] seems                [-4.5] population          
